In [4]:
import pandas as pd
%matplotlib inline

In [9]:
# ! pip install --upgrade scikit-lego
from sklego.datasets import load_chicken

In [13]:
# Load the chick-weight data through scikit-lego's `load_chicken`.
# `give_pandas=True` presumably asks for a pandas DataFrame — confirm in the sklego docs.
# NOTE(review): `chickweight` is reassigned from a CSV download in a later cell.
chickweight = load_chicken(give_pandas=True)

# Data Wrangling

We load a famous data set called chickweight which contains the weight of chickens that follow different diets.

Below we show some example code and it's your task to guess the output before running the cell.

In [14]:
# Fetch the chickweight table as CSV and parse it into a dataframe.
url = 'http://koaning.io/old/theme/data/chickweight.csv'
chickweight = pd.read_csv(filepath_or_buffer=url)

In [20]:
# Normalise every column name to lower case so later cells can use one spelling.
chickweight.columns = chickweight.columns.str.lower()

# Guess first: which rows are left after taking the first five rows
# and then the last two of those?
chickweight.head().tail(2)

Unnamed: 0,rownum,weight,time,chick,diet
3,4,64,6,1,1
4,5,76,8,1,1


# Assignment

### What could we do? 

Imagine that we are a farm and we have the following dataset available; what would we do with the dataset?

In [21]:
# chickweight

In [22]:
# chickweight.head()

In [5]:
# solution 

# 1. we might discover which diet is best for our chickens 
# 2. we might estimate how big the chickens get if we delay the slaughter date 
# 3. we might learn if chickens grow quicker during certain weeks 

The main use case we want to focus on is figuring out which diet is best, but it is good to take a moment to consider whether there are other use cases as well.

## Verbs on a Dataframe 

The goal of this notebook is to demonstrate all the "verbs" on a dataframe. If a dataframe is a noun, then verbs are the actions that can be performed on it. Typically, you need to be able to:

1. select the columns 
2. select the rows 
3. rename the columns 
4. sort the rows 
5. summarise statistics 
6. create new columns 

Whatever analysis you are doing, about 80% of it can be described with these "verbs". In this notebook we will demonstrate these commands on the chickweight dataframe so that we can later use them in our own analysis.

## Guessing Game 

In order to get familiar with all possible commands, run the following cells, but try to predict what each one will output before you run it. The goal of the document is for you to understand how to translate a data operation you have in your head into Python code.

In [31]:
# Column selection: keep only `weight` and `time`, then preview the first rows.
chickweight.loc[:, ['weight', 'time']].head()

Unnamed: 0,weight,time
0,42,0
1,51,2
2,59,4
3,64,6
4,76,8


In [48]:
# Row selection: pick three columns, keep the rows on diet 1, show the first ten.
(
    chickweight
    .loc[:, ['weight', 'time', 'diet']]
    .query("diet == 1")
    .head(n=10)
)

Unnamed: 0,weight,time,diet
0,42,0,1
1,51,2,1
2,59,4,1
3,64,6,1
4,76,8,1
5,93,10,1
6,106,12,1
7,125,14,1
8,149,16,1
9,171,18,1


In [44]:
# Drop the `chick` column and keep rows whose weight lies strictly between 50 and 100.
(
    chickweight
    .drop(columns='chick')
    .loc[lambda df: df['weight'].gt(50) & df['weight'].lt(100)]
    .head()
)

Unnamed: 0,rownum,weight,time,diet
1,2,51,2,1
2,3,59,4,1
3,4,64,6,1
4,5,76,8,1
5,6,93,10,1


In [49]:
# Summary statistics for all measurements taken at time 12 or earlier.
(
    chickweight
    .loc[lambda df: df['time'].le(12)]
    .describe()
)

Unnamed: 0,rownum,weight,time,chick,diet
count,345.0,345.0,345.0,345.0,345.0
mean,285.556522,78.785507,5.971014,25.608696,2.217391
std,166.428114,34.468309,4.012955,14.527657,1.167256
min,1.0,35.0,0.0,1.0,1.0
25%,146.0,51.0,2.0,13.0,1.0
50%,283.0,68.0,6.0,26.0,2.0
75%,429.0,101.0,10.0,38.0,3.0
max,573.0,217.0,12.0,50.0,4.0


In [52]:
# Sort by time (latest first) and, within the same time, by diet (lowest first).
(
    chickweight
    .sort_values(by=['time', 'diet'], ascending=[False, True])
    .head(5)
)

Unnamed: 0,rownum,weight,time,chick,diet
11,12,205,21,1,1
23,24,215,21,2,1
35,36,202,21,3,1
47,48,157,21,4,1
59,60,223,21,5,1


In [100]:
# Preview the first five rows of the full table.
chickweight.head(n=5)

Unnamed: 0,rownum,weight,time,chick,diet
0,1,42,0,1,1
1,2,51,2,1,1
2,3,59,4,1,1
3,4,64,6,1,1
4,5,76,8,1,1


# Assignments 

```python
url = 'http://koaning.io/old/theme/data/chickweight.csv'
chickweight = pd.read_csv(url)
chickweight.columns = [c.lower() for c in chickweight.columns]
```

1. Find the heaviest chicken. 
2. Which diet has the best average weight at timestep 12? 
3. Find the lightest chicken. 
4. What's the average growth per timestep? Hint: look up what `column.shift()` does.

In [152]:
# Average growth per timestep: the mean weight at every measurement time and
# its change relative to the previous measurement time.
# Only pandas is used here, so the cell runs without a numpy import.
(chickweight
 .groupby('time')
 .agg(mean_weight=('weight', 'mean'))
 .reset_index()
 # `shift(1)` moves every value one row down, so each row sees the previous mean.
 .assign(prev_weight=lambda d: d['mean_weight'].shift(1))
 .assign(difference=lambda d: d['mean_weight'] - d['prev_weight'])
 # `diff()` is the shortcut for the two steps above; the first row has no
 # predecessor, so its missing value is replaced by 0.
 .assign(diff=lambda d: d['mean_weight'].diff().fillna(0)))

Unnamed: 0,time,mean_weight,prev_weight,difference,diff
0,0,41.06,,,0.0
1,2,49.22,41.06,8.16,8.16
2,4,59.959184,49.22,10.739184,10.739184
3,6,74.306122,59.959184,14.346939,14.346939
4,8,91.244898,74.306122,16.938776,16.938776
5,10,107.836735,91.244898,16.591837,16.591837
6,12,129.244898,107.836735,21.408163,21.408163
7,14,143.8125,129.244898,14.567602,14.567602
8,16,168.085106,143.8125,24.272606,24.272606
9,18,190.191489,168.085106,22.106383,22.106383


In [136]:
# Heaviest measurement: sort by weight, heaviest first, and keep the top row.
chickweight.sort_values(by='weight', ascending=False).iloc[:1]

Unnamed: 0,rownum,weight,time,chick,diet
399,400,373,21,35,3


In [137]:
# Heaviest measurement via a boolean mask: keep every row whose weight equals the
# column maximum (ties are all returned). Uses `Series.max`, so numpy is not needed.
chickweight.loc[lambda d: d['weight'] == d['weight'].max()]

Unnamed: 0,rownum,weight,time,chick,diet
399,400,373,21,35,3


In [133]:
# Mean weight per diet, using only the measurements taken at time 12.
# Named aggregation yields the same `diet`-indexed table as building a
# `pd.Series` per group inside `apply`, without the per-group Python call.
(chickweight
 .loc[lambda d: d['time'] == 12]
 .groupby('diet')
 .agg(mean_weight=('weight', 'mean'))
)

Unnamed: 0_level_0,mean_weight
diet,Unnamed: 1_level_1
1,108.526316
2,131.3
3,144.4
4,151.4


In [139]:
# Lightest chicken: take each chick's maximum recorded weight, then pick the
# chick whose maximum is the smallest.
(chickweight
 .groupby('chick')
 .agg(max_weight=('weight', 'max'))
 .sort_values('max_weight')
 .head(1)
)

Unnamed: 0_level_0,max_weight
chick,Unnamed: 1_level_1
18,39


In [153]:
def aggregate(dataf):
    """Summarise the weights for every (diet, time) combination.

    Parameters
    ----------
    dataf : pandas.DataFrame
        Must contain the columns ``diet``, ``time`` and ``weight``.

    Returns
    -------
    pandas.DataFrame
        One row per (diet, time) pair with the columns ``diet``, ``time``,
        ``mean_weight``, ``var_weight`` and ``min_weight``.
    """
    return (dataf
            .groupby(['diet', 'time'])
            .agg(
                mean_weight=('weight', 'mean'),
                # Population variance (ddof=0), not the pandas default sample variance.
                var_weight=('weight', lambda w: w.var(ddof=0)),
                min_weight=('weight', 'min'),
            )
            .reset_index())

In [154]:
# Attach the per-(diet, time) summary to every measurement, compute how far each
# weight is from its group mean, and keep only the rows measured at time 0.
clean_df = (
    aggregate(chickweight)
    .merge(chickweight, on=['diet', 'time'])
    .assign(diff_weight=lambda merged: merged['mean_weight'] - merged['weight'])
    .loc[lambda merged: merged['time'].eq(0)]
)

In [108]:
from pathlib import Path

# `to_csv` does not create missing folders, so make sure the target directory
# exists before writing; otherwise this cell fails on a fresh checkout.
output_path = Path("path/to/save.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
clean_df.to_csv(output_path, index=False)

In [None]:
# Order matters: take the first five rows first, then sort just those by weight.
(
    chickweight
    .head(5)
    .sort_values(by='weight', ascending=False)
)

In [None]:
# Column-wise aggregation driven by a dict: `weight` gets two reducers (the
# builtin `len` and the mean), `diet` gets a lambda returning its unique values.
# NOTE(review): this cell has no recorded output, and mixing a list of reducers
# with an array-returning lambda may behave differently across pandas versions —
# verify the result before relying on it.
(
    chickweight
    .agg({'weight': [len, 'mean'],
          'diet': lambda x: x.unique()
         })
)

In [None]:
# Number of measurements and mean weight for every (diet, time) combination.
# Named aggregation keeps `number_rows` as an integer count; packing the values
# into one `pd.Series` per group would upcast the count to float.
(
    chickweight
    .groupby(['diet', 'time'])
    .agg(
        number_rows=('weight', 'size'),
        weight_mean=('weight', 'mean'),
    )
)

In [None]:
# Number of measurements, mean weight and sample variance of the weight at
# every measurement time. `'var'` is the pandas default (ddof=1), the same as
# calling `.var()` on the column; the row count stays an integer.
(
    chickweight
    .groupby('time')
    .agg(
        number_rows=('weight', 'size'),
        weight_mean=('weight', 'mean'),
        weight_variance=('weight', 'var'),
    )
)

In [None]:
# Create a new column `weight2` holding twice the weight; `weight` itself is untouched.
(
    chickweight
    .assign(weight2=lambda df: df['weight'].mul(2))
    .head()
)

In [None]:
# Same operation, but assigning to an existing name replaces `weight` in the result
# (the original `chickweight` frame is not modified).
(
    chickweight
    .assign(weight=lambda df: df['weight'].mul(2))
    .head(5)
)

In [None]:
# Number the measurements of every chick (1, 2, 3, ...), keep the first four per
# chick, and show the columns needed to check the numbering.
(
    chickweight
    .assign(r=lambda df: df.groupby('chick').cumcount().add(1))
    .query("r < 5")
    .loc[:, ['time', 'chick', 'r']]
    .head(n=11)
)

In [None]:
# One row per distinct (chick, diet) pair.
chickweight.loc[:, ['chick', 'diet']].drop_duplicates(keep='first')

In [None]:
# Draw three random rows; no seed is set, so every run gives a different draw.
chickweight.sample(n=3)

In [None]:
# Sample again: compare with the previous cell to see that the rows differ.
chickweight.sample(n=3)

# Assignment

## 1. Find the fattest chicken per diet

Hint: use `groupby`

In [None]:
%load answers/fattest_chicken.py

## 2. Find the dead chickens

Hint: use `describe` to find some clues and use `groupby` to get your answer

Can you also find which diet they were on?

In [None]:
%load answers/dead_chickens.py