# Dataframe Manipulation Warmup

In [178]:
import numpy as np
import pandas as pd
import seaborn as sns

# Fix the legacy global NumPy seed so every run generates the same survey data.
np.random.seed(406)

n = 5000

# Each column is drawn from the global RNG in turn, so the order of these
# assignments determines the generated values -- do not reorder them.
survey_columns = {}
survey_columns['favorite_animal'] = np.random.choice(
    ['cat', 'dog', 'frog', 'lemur', 'panda'], n
)
survey_columns['favorite_vegetable'] = np.random.choice(
    ['brussel sprouts', 'potato', 'squash'], n
)
survey_columns['favorite_fruit'] = np.random.choice(
    ['banana', 'apple', 'blueberries'], n
)
survey_columns['wears_glasses'] = np.random.choice(['yes', 'no'], n)
survey_columns['netflix_consumption'] = np.random.normal(10, 2, n)
survey_columns['open_browser_tabs'] = np.random.randint(2, 90, n)

# One row per respondent, columns in the insertion order above.
df = pd.DataFrame(survey_columns)

In [56]:
# Peek at the first five rows to check the columns and value formats.
df.head(5)

Unnamed: 0,favorite_animal,favorite_vegetable,favorite_fruit,wears_glasses,netflix_consumption,open_browser_tabs
0,lemur,potato,apple,yes,8.313351,44
1,panda,potato,apple,yes,11.801073,10
2,cat,squash,blueberries,yes,10.105141,35
3,lemur,squash,apple,no,11.024605,70
4,dog,brussel sprouts,apple,yes,6.732698,73


- What is the highest amount of netflix consumption? `17.535`

In [57]:
# Round every value to 3 decimals, then take the largest one.
df['netflix_consumption'].round(3).max()

17.535

- How many people wear glasses? What percentage of people is this? `2555`, `51.1%` (`.511` as a proportion)

In [58]:
# True counts as 1, so summing the boolean mask counts the glasses wearers.
df['wears_glasses'].eq('yes').sum()

2555

In [18]:
# The mean of a boolean mask is the fraction of rows that are True,
# i.e. glasses wearers divided by the total number of rows.
df['wears_glasses'].eq('yes').mean()

0.511

- How many people's favorite animal is a dog? `1002`

In [59]:
# Count the rows whose favorite animal is 'dog'.
df['favorite_animal'].eq('dog').sum()

1002

- What is the most common favorite animal? `lemur`

In [60]:
# mode() returns a Series because several values can tie for most common.
df['favorite_animal'].mode()

0    lemur
dtype: object

- What is the average netflix consumption for people that prefer brussel
  sprouts? `10.008`

In [61]:
# Show the first five rows again.
df.head(5)

Unnamed: 0,favorite_animal,favorite_vegetable,favorite_fruit,wears_glasses,netflix_consumption,open_browser_tabs
0,lemur,potato,apple,yes,8.313351,44
1,panda,potato,apple,yes,11.801073,10
2,cat,squash,blueberries,yes,10.105141,35
3,lemur,squash,apple,no,11.024605,70
4,dog,brussel sprouts,apple,yes,6.732698,73


In [62]:
# Average netflix consumption among people whose favorite vegetable is
# brussel sprouts, rounded to 3 decimals.
#
# Select the netflix_consumption column *before* averaging: calling
# DataFrame.mean() on the whole frame raises a TypeError in pandas >= 2.0
# (string columns are no longer silently dropped), and the old `[0]` lookup
# relied on deprecated positional indexing into a label-indexed Series.
df.loc[df['favorite_vegetable'] == 'brussel sprouts', 'netflix_consumption'].mean().round(3)

10.008

- What is the most common favorite fruit for people who wear glasses and have
  more than 40 open browser tabs? `blueberries`

In [68]:
# Boolean masks for the two conditions in the question.
filter1 = df['wears_glasses'] == 'yes'
filter2 = df['open_browser_tabs'] > 40

# Filter into a new frame rather than calling df.where(..., inplace=True):
# the in-place form returns None (so df2 ended up as None) and overwrites
# every non-matching row of the shared `df` with NaN, which corrupts the data
# for all later cells and makes this cell non-idempotent.
df2 = df[filter1 & filter2]

# Most common favorite fruit within the filtered group.
df2['favorite_fruit'].mode()

0    blueberries
dtype: object

In [71]:
# Display the frame itself; for a long frame the repr elides the middle rows.
df

Unnamed: 0,favorite_animal,favorite_vegetable,favorite_fruit,wears_glasses,netflix_consumption,open_browser_tabs
0,lemur,potato,apple,yes,8.313351,44
1,panda,potato,apple,yes,11.801073,10
2,cat,squash,blueberries,yes,10.105141,35
3,lemur,squash,apple,no,11.024605,70
4,dog,brussel sprouts,apple,yes,6.732698,73
...,...,...,...,...,...,...
4995,frog,potato,blueberries,yes,9.992337,59
4996,frog,potato,apple,yes,8.916902,46
4997,lemur,squash,banana,no,7.317019,70
4998,lemur,squash,blueberries,yes,12.720892,50


In [74]:
# Fraction of people whose netflix consumption is below 7: the mean of the
# boolean mask equals the matching rows divided by the total number of rows.
df['netflix_consumption'].lt(7).mean()

0.0716

- What is the average netflix consumption for people with fewer than 30 open
  browser tabs? `9.91935`

In [202]:
# Pick the netflix_consumption values for rows with fewer than 30 open tabs
# in a single .loc lookup, then average them.
df.loc[df['open_browser_tabs'].lt(30), 'netflix_consumption'].mean()

9.91935736918227

- How many people *don't* wear glasses, have a favorite animal of a panda, have
  a favorite fruit of blueberries, and have more than 60 open browser tabs? What
  is the median netflix consumption for this group? What is the most common
  favorite vegetable for this group? `46`, `10.455`, `potato`

In [206]:
# Keep only the rows that satisfy all four conditions from the question.
# A single combined mask selects the same rows (same order, same index) as
# filtering four times in a row, without repeatedly rebinding df2.
df2 = df[
    (df['wears_glasses'] == 'no')
    & (df['favorite_animal'] == 'panda')
    & (df['favorite_fruit'] == 'blueberries')
    & (df['open_browser_tabs'] > 60)
]

# Group size, median netflix consumption (values rounded to 3 decimals first),
# and most common favorite vegetable.
# len() counts rows directly; the previous `df2.count()[0]` counted non-null
# values of one column and used deprecated positional indexing into a
# label-indexed Series.
len(df2), df2['netflix_consumption'].round(decimals=3).median(), df2['favorite_vegetable'].mode()[0]



(46, 10.455, 'potato')

In [209]:
# First five rows of the filtered group.
df2.head(5)

Unnamed: 0,favorite_animal,favorite_vegetable,favorite_fruit,wears_glasses,netflix_consumption,open_browser_tabs
215,panda,potato,blueberries,no,11.948934,88
440,panda,potato,blueberries,no,13.016554,89
631,panda,potato,blueberries,no,10.521033,83
775,panda,brussel sprouts,blueberries,no,7.9566,76
895,panda,squash,blueberries,no,10.788124,84


- What is the least popular combination of favorite fruit and vegetable? `apple` and `potato`

In [144]:
# Number of respondents for every (fruit, vegetable) pair ...
combo_counts = df.groupby(['favorite_fruit', 'favorite_vegetable']).size()
# ... sorted ascending (the default), so the least popular pair comes first.
combo_counts.sort_values()

favorite_fruit  favorite_vegetable
apple           potato                512
banana          squash                524
apple           squash                555
blueberries     brussel sprouts       555
                potato                560
apple           brussel sprouts       565
banana          potato                570
                brussel sprouts       576
blueberries     squash                583
dtype: int64

- Which combination of favorite animal and wearing glasses has the highest average
  netflix consumption? `panda` and `yes` (people who wear glasses and prefer pandas)

In [148]:
# Show the first five rows again.
df.head(5)

Unnamed: 0,favorite_animal,favorite_vegetable,favorite_fruit,wears_glasses,netflix_consumption,open_browser_tabs
0,lemur,potato,apple,yes,8.313351,44
1,panda,potato,apple,yes,11.801073,10
2,cat,squash,blueberries,yes,10.105141,35
3,lemur,squash,apple,no,11.024605,70
4,dog,brussel sprouts,apple,yes,6.732698,73


In [199]:



# Average of the numeric columns for every (animal, glasses) pair, ordered so
# the pair with the highest average netflix consumption comes first.
# The numeric columns are selected explicitly: in pandas >= 2.0,
# groupby(...).mean() raises a TypeError when string columns are still
# present instead of silently dropping them.
(
    df.groupby(['favorite_animal', 'wears_glasses'])[['netflix_consumption', 'open_browser_tabs']]
    .mean()
    .sort_values(by='netflix_consumption', ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,netflix_consumption,open_browser_tabs
favorite_animal,wears_glasses,Unnamed: 2_level_1,Unnamed: 3_level_1
panda,yes,10.092273,46.180361
dog,yes,10.087352,46.953125
lemur,no,10.024557,47.372943
lemur,yes,10.010196,46.677755
frog,no,9.962311,45.375271
panda,no,9.946293,44.665966
dog,no,9.933246,46.593878
cat,yes,9.884685,44.97053
cat,no,9.846183,44.210191
frog,yes,9.83474,46.16787


- **Bonus**: for each of the above questions, what kind of visualization would
  be the most effective in conveying your answer?