# Grouping data

In [48]:
import pandas as pd
from scipy.stats import zscore

### Grouping by multiple columns

In [4]:
# Load the passenger table from titanic.csv and print its column / dtype /
# non-null summary so the columns with missing values are visible up front.
titanic = pd.read_csv('titanic.csv')
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
pclass       1309 non-null int64
survived     1309 non-null int64
name         1309 non-null object
sex          1309 non-null object
age          1046 non-null float64
sibsp        1309 non-null int64
parch        1309 non-null int64
ticket       1309 non-null object
fare         1308 non-null float64
cabin        295 non-null object
embarked     1307 non-null object
boat         486 non-null object
body         121 non-null float64
home.dest    745 non-null object
dtypes: float64(3), int64(4), object(7)
memory usage: 143.2+ KB


In [13]:
# Split the passengers by class, then count the non-null 'survived' entries
# in each class. Bracket selection is used for the column, matching the
# other cells in this notebook.
by_class = titanic.groupby('pclass')
count_by_class = by_class['survived'].count()
count_by_class

pclass
1    323
2    277
3    709
Name: survived, dtype: int64

In [14]:
# Grouping by a list of columns produces one group per (embarked, pclass)
# pair; the result is indexed by both keys.
by_port_class = titanic.groupby(['embarked', 'pclass'])
count_p_c = by_port_class['survived'].count()
count_p_c

embarked  pclass
C         1         141
          2          28
          3         101
Q         1           3
          2           7
          3         113
S         1         177
          2         242
          3         495
Name: survived, dtype: int64

### Grouping by another series

In [15]:
# Both tables are indexed by 'Country', so rows of one can be matched to rows
# of the other by country label.
life = pd.read_csv('life_expectancy.csv',index_col='Country')
regions = pd.read_csv('regions.csv', index_col='Country')

In [17]:
regions.head()

Unnamed: 0_level_0,region
Country,Unnamed: 1_level_1
Afghanistan,South Asia
Albania,Europe & Central Asia
Algeria,Middle East & North Africa
Angola,Sub-Saharan Africa
Antigua and Barbuda,America


In [18]:
life.head()

Unnamed: 0_level_0,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,...,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,33.639,34.152,34.662,35.17,35.674,36.172,36.663,37.143,37.614,38.075,...,56.583,57.071,57.582,58.102,58.618,59.124,59.612,60.079,60.524,60.947
Albania,65.475,65.863,66.122,66.316,66.5,66.702,66.948,67.251,67.595,67.966,...,75.725,75.949,76.124,76.278,76.433,76.598,76.78,76.979,77.185,77.392
Algeria,47.953,48.389,48.806,49.205,49.592,49.976,50.366,50.767,51.195,51.67,...,69.682,69.854,70.02,70.18,70.332,70.477,70.615,70.747,70.874,71.0
Angola,34.604,35.007,35.41,35.816,36.222,36.627,37.032,37.439,37.846,38.247,...,48.036,48.572,49.041,49.471,49.882,50.286,50.689,51.094,51.498,51.899
Antigua and Barbuda,63.775,64.149,64.511,64.865,65.213,65.558,65.898,66.232,66.558,66.875,...,74.355,74.544,74.729,74.91,75.087,75.263,75.437,75.61,75.783,75.954


In [24]:
# The grouping key is a Series taken from a different DataFrame: each row of
# `life` is assigned to the region that `regions` holds for the same index
# label. The second line averages the '2010' column within each region.
life_exp_by_region = life.groupby(regions['region'])
life_exp_by_region['2010'].mean()

region
America                       74.037350
East Asia & Pacific           73.405750
Europe & Central Asia         75.656387
Middle East & North Africa    72.805333
South Asia                    68.189750
Sub-Saharan Africa            57.575080
Name: 2010, dtype: float64

### Computing multiple aggregates of multiple columns
- groupby 'pclass'
- select 'age' and 'fare'
- aggregate by 'max' and 'median'
- get maximum age in each class
- get median fare in each class

In [27]:
# Restrict the grouped data to the two columns of interest, then compute both
# aggregates for each of them. The result has two-level column labels of the
# form (column, aggregate).
by_class = titanic.groupby('pclass')
by_class_sub = by_class[['age', 'fare']]
aggregated = by_class_sub.agg(['max','median'])
aggregated

Unnamed: 0_level_0,age,age,fare,fare
Unnamed: 0_level_1,max,median,max,median
pclass,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,80.0,39.0,512.3292,60.0
2,70.0,29.0,73.5,15.0458
3,74.0,24.0,69.55,8.05


In [29]:
# A (column, aggregate) tuple addresses one column of the two-level header:
# here, the maximum age in each class.
aggregated.loc[:,('age','max')]

pclass
1    80.0
2    70.0
3    74.0
Name: (age, max), dtype: float64

In [30]:
# Same tuple-based selection, this time the median fare in each class.
aggregated.loc[:,('fare','median')]

pclass
1    60.0000
2    15.0458
3     8.0500
Name: (fare, median), dtype: float64

### Aggregating on index levels/fields
- group by year and region
- define function to compute spread
- create dict for aggregation
- aggregate and print

In [31]:
# Read the tidy gapminder table with a three-level row index
# (Year, region, Country) and sort that index.
gapminder = pd.read_csv('gapminder_tidy.csv', index_col=['Year', 'region', 'Country']).sort_index()
gapminder.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,fertility,life,population,child_mortality,gdp
Year,region,Country,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1964,America,Antigua and Barbuda,4.25,63.775,58653.0,72.78,5008.0
1964,America,Argentina,3.068,65.388,21966478.0,57.43,8227.0
1964,America,Aruba,4.059,67.113,57031.0,,5505.0
1964,America,Bahamas,4.22,64.189,133709.0,48.56,18160.0
1964,America,Barbados,4.094,62.819,234455.0,64.7,5681.0


In [38]:
# 'Year' and 'region' are index levels here, not columns; groupby accepts
# index level names as keys.
by_year_region = gapminder.groupby(['Year', 'region'])

In [39]:
def spread(series):
    """Return the range of `series`: its largest value minus its smallest."""
    highest = series.max()
    lowest = series.min()
    return highest - lowest

In [40]:
# Column name -> aggregation to apply to that column. Values may be either
# the name of a built-in aggregation or a callable such as `spread`.
aggregator = dict(
    population='sum',
    child_mortality='mean',
    gdp=spread,
)

In [41]:
# Apply a different aggregation to each column, as specified by `aggregator`.
# NOTE: this rebinds `aggregated`, replacing the per-class age/fare table
# built earlier in the notebook.
aggregated = by_year_region.agg(aggregator)
aggregated.tail(6)

Unnamed: 0_level_0,Unnamed: 1_level_0,population,child_mortality,gdp
Year,region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013,America,962908700.0,17.745833,49634.0
2013,East Asia & Pacific,2244209000.0,22.285714,134744.0
2013,Europe & Central Asia,896878800.0,9.831875,86418.0
2013,Middle East & North Africa,403050400.0,20.2215,128676.0
2013,South Asia,1701241000.0,46.2875,11469.0
2013,Sub-Saharan Africa,920599600.0,76.94449,32035.0


### Grouping on a function of the index

In [45]:
# Use the 'Date' column as the index and parse it as datetimes, so that
# datetime methods are available on sales.index.
sales = pd.read_csv('sales/sales-feb-2015.csv', parse_dates=True, index_col='Date')
sales.head()

Unnamed: 0_level_0,Company,Product,Units
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-02-02 08:30:00,Hooli,Software,3
2015-02-02 21:00:00,Mediacore,Hardware,9
2015-02-03 14:00:00,Initech,Software,13
2015-02-04 15:30:00,Streeplex,Software,13
2015-02-04 22:00:00,Acme Coporation,Hardware,14


In [47]:
# The grouping key is computed from the index: '%a' formats each timestamp as
# an abbreviated weekday name, so all sales on the same weekday share a group.
by_day = sales.groupby(sales.index.strftime('%a'))
units_sum = by_day['Units'].sum()
units_sum

Mon    48
Sat     7
Thu    59
Tue    13
Wed    48
Name: Units, dtype: int64

### Detecting outliers with Z-Scores
- group by region and transform life and fertility by zscore
- construct boolean series where life < -3 or fertility > 3 from zscore
- filter gapminder

In [61]:
# Reload the tidy table indexed by country only, keep the rows for 2010, and
# drop the now-constant 'Year' column in the same expression instead of
# deleting it from the filtered frame afterwards.
gapminder = pd.read_csv('gapminder_tidy.csv', index_col='Country')
gapminder_2010 = gapminder.loc[gapminder['Year'] == 2010].drop('Year', axis=1)
gapminder_2010.info()

<class 'pandas.core.frame.DataFrame'>
Index: 202 entries, Afghanistan to Zimbabwe
Data columns (total 6 columns):
fertility          202 non-null float64
life               202 non-null float64
population         202 non-null float64
child_mortality    189 non-null float64
gdp                180 non-null float64
region             202 non-null object
dtypes: float64(5), object(1)
memory usage: 11.0+ KB


In [66]:
# Standardise 'life' and 'fertility' within each region: transform() applies
# zscore group by group and returns values aligned with the original rows, so
# every country is scored against the other countries of its own region.
standardized = gapminder_2010.groupby('region')[['life','fertility']].transform(zscore)
standardized.head()

Unnamed: 0_level_0,life,fertility
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,-1.743601,2.504732
Albania,0.226367,0.010964
Algeria,-0.440196,-0.003972
Angola,-0.882537,1.095653
Antigua and Barbuda,0.240607,-0.363761


In [68]:
# A country is flagged when EITHER its regional life z-score is below -3 OR
# its regional fertility z-score is above 3 (the two masks are combined with |).
# The boolean mask is then used to pull the matching rows from the 2010 data.
outliers = (standardized['life'] < -3) | (standardized['fertility'] > 3)
gapminder_outliers_2010 = gapminder_2010.loc[outliers]
gapminder_outliers_2010

Unnamed: 0_level_0,fertility,life,population,child_mortality,gdp,region
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Guatemala,3.974,71.1,14388929.0,34.5,6849.0,America
Haiti,3.35,45.0,9993247.0,208.8,1518.0,America
Tajikistan,3.78,66.83,6878637.0,52.6,2110.0,Europe & Central Asia
Timor-Leste,6.237,65.952,1124355.0,63.8,1777.0,East Asia & Pacific


### Filling missing data (imputation) by group
- Group titanic by 'sex' and 'pclass'
- Write a function called impute_median() that fills missing values with the median of a series.
- Call .transform() with impute_median on the 'age' column of by_sex_class

In [72]:
titanic.tail()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
1304,3,0,"Zabour, Miss. Hileni",female,14.5,1,0,2665,14.4542,,C,,328.0,
1305,3,0,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,,,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5,0,0,2656,7.225,,C,,304.0,
1307,3,0,"Zakarian, Mr. Ortin",male,27.0,0,0,2670,7.225,,C,,,
1308,3,0,"Zimmerman, Mr. Leo",male,29.0,0,0,315082,7.875,,S,,,


In [70]:
# One group per (sex, pclass) combination.
by_sex_class = titanic.groupby(['sex', 'pclass'])

In [73]:
def impute_median(series):
    """Return a copy of `series` with missing values replaced by its median."""
    median_value = series.median()
    return series.fillna(median_value)

In [74]:
# Fill each missing age with the median age of that passenger's (sex, pclass)
# group. The column is assigned with bracket syntax: attribute-style
# assignment only works when the column already exists and can silently
# create a plain attribute otherwise.
# NOTE: this overwrites titanic['age'] in place, so every later cell sees the
# imputed ages rather than the raw ones.
titanic['age'] = by_sex_class['age'].transform(impute_median)

In [75]:
titanic.tail()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
1304,3,0,"Zabour, Miss. Hileni",female,14.5,1,0,2665,14.4542,,C,,328.0,
1305,3,0,"Zabour, Miss. Thamine",female,22.0,1,0,2665,14.4542,,C,,,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5,0,0,2656,7.225,,C,,304.0,
1307,3,0,"Zakarian, Mr. Ortin",male,27.0,0,0,2670,7.225,,C,,,
1308,3,0,"Zimmerman, Mr. Leo",male,29.0,0,0,315082,7.875,,S,,,


### Other transformations with .apply
- Group gapminder_2010 by 'region'
- Apply disparity function
- Use .loc[] to select ['United States','United Kingdom','China']

In [76]:
def disparity(gr):
    """Summarise how the 'gdp' values of one group compare with each other.

    Parameters
    ----------
    gr : DataFrame
        Rows of a single group; only the 'gdp' column is read.

    Returns
    -------
    DataFrame
        Indexed like `gr`, with two columns:
        'z(gdp)' - each row's gdp minus the group mean, divided by the
        group's standard deviation as returned by Series.std();
        'regional spread(gdp)' - the group's max gdp minus its min gdp,
        repeated on every row.
    """
    gdp = gr['gdp']
    gdp_range = gdp.max() - gdp.min()
    gdp_z = (gdp - gdp.mean()) / gdp.std()
    return pd.DataFrame({'z(gdp)': gdp_z, 'regional spread(gdp)': gdp_range})

In [77]:
# group_keys=False keeps the original Country index on frames returned by
# .apply(). Without it, pandas >= 2.0 prepends the region key as an extra
# index level, and label lookups by country name alone then raise KeyError.
regional = gapminder_2010.groupby('region', group_keys=False)

In [80]:
# Run disparity once per region and combine the per-region frames into one.
regional_disparity = regional.apply(disparity)

In [81]:
# Look up three countries by index label; each row is reported relative to
# that country's own region.
regional_disparity.loc[['United States','United Kingdom','China']]

Unnamed: 0_level_0,z(gdp),regional spread(gdp)
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
United States,3.013374,47855.0
United Kingdom,0.572873,89037.0
China,-0.432756,96993.0


### Grouping and filtering with .apply()
- Group titanic by 'sex'
- Apply c_deck_survival function on the by_sex groupby object

In [84]:
def c_deck_survival(gr):
    """Return the mean of 'survived' over rows whose 'cabin' starts with 'C'.

    Parameters
    ----------
    gr : DataFrame
        Rows of one group; must contain 'cabin' and 'survived' columns.

    Returns
    -------
    float
        Mean of 'survived' for the selected rows; NaN when no row in the
        group has a cabin starting with 'C'.
    """
    # na=False makes rows with a missing cabin evaluate to False directly, so
    # the mask is a proper boolean Series with no fillna() step on an
    # object-dtype result.
    c_passengers = gr['cabin'].str.startswith('C', na=False)
    return gr.loc[c_passengers, 'survived'].mean()

In [86]:
# apply() calls c_deck_survival on each sex group; because the function
# returns a single number, the result is a Series with one value per sex.
by_sex = titanic.groupby('sex')
c_surv_by_sex = by_sex.apply(c_deck_survival)
c_surv_by_sex

sex
female    0.913043
male      0.312500
dtype: float64

In [90]:
# Same measure without grouping: mean of 'survived' over all passengers whose
# cabin starts with 'C'. na=False counts a missing cabin as "no match", giving
# a boolean mask directly instead of patching NaN with fillna().
titanic.loc[titanic['cabin'].str.startswith('C', na=False), 'survived'].mean()

0.6063829787234043

### Grouping and filtering with .filter()
- Group sales by 'Company'
- Compute and print the sum of the 'Units'
- Call `.filter()` with `lambda g:g['Units'].sum() > 35`

In [95]:
# Total units sold per company; these totals are what the group filter in the
# next step compares against its threshold.
by_company = sales.groupby('Company')
sum_by_company = by_company['Units'].sum()
sum_by_company

Company
Acme Coporation    34
Hooli              30
Initech            30
Mediacore          45
Streeplex          36
Name: Units, dtype: int64

In [98]:
# filter() evaluates the lambda once per company and keeps ALL rows of the
# companies for which it returns True (total Units strictly above 35); rows
# of the other companies are dropped.
filter_company = by_company.filter(lambda g: g['Units'].sum()>35)
filter_company

Unnamed: 0_level_0,Company,Product,Units
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-02-02 21:00:00,Mediacore,Hardware,9
2015-02-04 15:30:00,Streeplex,Software,13
2015-02-09 09:00:00,Streeplex,Service,19
2015-02-09 13:00:00,Mediacore,Software,7
2015-02-19 11:00:00,Mediacore,Hardware,16
2015-02-19 16:00:00,Mediacore,Service,10
2015-02-21 05:00:00,Mediacore,Software,3
2015-02-26 09:00:00,Streeplex,Service,4


### Filtering and grouping with .map()
- Create a Boolean Series of titanic['age'] < 10 and call .map with {True:'under 10', False:'over 10'}
- Group titanic by the under10 Series and then compute and print the mean of the 'survived' column
- Group titanic by the under10 Series as well as the 'pclass' column and then compute and print the mean of the 'survived' column

In [114]:
# Label each passenger by whether age < 10. Anything that is not strictly
# below 10 maps to 'over 10': that includes an age of exactly 10, and any age
# that is still NaN (a NaN comparison evaluates to False).
under_10 = (titanic['age']<10).map({True:'under 10', False:'over 10'})
under_10.head()

0     over 10
1    under 10
2    under 10
3     over 10
4     over 10
Name: age, dtype: object

In [115]:
# Group by the label Series built above (matched to titanic by index) and
# average 'survived' within each label.
survived_mean_1 = titanic.groupby(under_10)['survived'].mean()
survived_mean_1

age
over 10     0.366748
under 10    0.609756
Name: survived, dtype: float64

In [116]:
# A groupby key list may mix an external Series with a column name; the
# result is indexed by (age label, pclass).
survived_mean_2 = titanic.groupby([under_10, 'pclass'])['survived'].mean()
survived_mean_2

age       pclass
over 10   1         0.617555
          2         0.380392
          3         0.238897
under 10  1         0.750000
          2         1.000000
          3         0.446429
Name: survived, dtype: float64