# Data Aggregation using Pandas

In [2]:
import pandas as pd
# Load the questionnaire CSV into `data`, the DataFrame summarised in the cells below.
questionnaire_csv = "national-health-and-nutrition-examination-survey/questionnaire.csv"
data = pd.read_csv(questionnaire_csv)

### Using Pandas functions to summarise data in a DataFrame

In [3]:
# Per-column summary statistics: count, mean, std, min, quartiles (25%/50%/75%) and max.
# `count` is the number of non-NaN entries, so it differs from column to column.
data.describe()

Unnamed: 0,SEQN,ACD011A,ACD011B,ACD011C,ACD040,ACD110,ALQ101,ALQ110,ALQ120Q,ALQ120U,...,WHD080U,WHD080L,WHD110,WHD120,WHD130,WHD140,WHQ150,WHQ030M,WHQ500,WHQ520
count,10175.0,5759.0,16.0,171.0,2374.0,1007.0,5421.0,1631.0,4479.0,3593.0,...,15.0,29.0,4179.0,5024.0,2765.0,6103.0,6020.0,1478.0,1478.0,1478.0
mean,78644.0,1.0,8.0,9.0,3.101095,2.956306,1.311197,1.594727,4.70931,1.921514,...,35.0,40.0,412.158411,568.528463,377.322966,315.623464,571.309635,2.583221,2.299729,1.746955
std,2937.413829,0.0,0.0,0.0,1.511821,1.733794,0.545023,0.615303,34.428362,0.853701,...,0.0,0.0,1508.005974,1977.085854,1726.34474,1077.06875,7269.067701,0.784612,1.213281,0.70943
min,73557.0,1.0,8.0,9.0,1.0,1.0,1.0,1.0,0.0,1.0,...,35.0,40.0,75.0,55.0,50.0,85.0,10.0,1.0,1.0,1.0
25%,76100.5,1.0,8.0,9.0,2.0,1.0,1.0,1.0,1.0,1.0,...,35.0,40.0,140.0,125.0,63.0,155.0,25.0,3.0,1.0,1.0
50%,78644.0,1.0,8.0,9.0,3.0,3.0,1.0,2.0,2.0,2.0,...,35.0,40.0,165.0,150.0,66.0,185.0,38.0,3.0,2.0,2.0
75%,81187.5,1.0,8.0,9.0,4.0,5.0,2.0,2.0,4.0,3.0,...,35.0,40.0,198.0,180.0,70.0,224.0,53.0,3.0,3.0,2.0
max,83731.0,1.0,8.0,9.0,9.0,5.0,9.0,9.0,999.0,3.0,...,35.0,40.0,9999.0,9999.0,9999.0,9999.0,99999.0,9.0,9.0,9.0


In [6]:
# .describe() is also available on a single column (a Series), giving the same
# statistics for just that variable.
whd120 = data['WHD120']
whd120.describe()

count    5024.000000
mean      568.528463
std      1977.085854
min        55.000000
25%       125.000000
50%       150.000000
75%       180.000000
max      9999.000000
Name: WHD120, dtype: float64

In [7]:
# Display the raw column values; NaN marks missing entries, and pandas abbreviates
# the middle of a long Series with `...`.
data['WHD120']

0         200.0
1         250.0
2         190.0
3           NaN
4         135.0
5         195.0
6           NaN
7         127.0
8         185.0
9         120.0
10        160.0
11          NaN
12          NaN
13          NaN
14        180.0
15          NaN
16          NaN
17        123.0
18          NaN
19          NaN
20        143.0
21          NaN
22          NaN
23        190.0
24        130.0
25          NaN
26          NaN
27          NaN
28        240.0
29          NaN
          ...  
10145     113.0
10146       NaN
10147       NaN
10148     125.0
10149       NaN
10150       NaN
10151     230.0
10152       NaN
10153       NaN
10154     140.0
10155     165.0
10156     120.0
10157       NaN
10158     130.0
10159       NaN
10160    9999.0
10161     120.0
10162       NaN
10163     198.0
10164     163.0
10165       NaN
10166     200.0
10167     140.0
10168       NaN
10169     160.0
10170       NaN
10171       NaN
10172     135.0
10173       NaN
10174       NaN
Name: WHD120, Length: 10175, dtype: float64

In [10]:
# Individual aggregations can be called directly on a column:
# the smallest value, then the number of non-NaN entries.
whd120 = data['WHD120']
for statistic in (whd120.min(), whd120.count()):
    print(statistic)

55.0
5024


### Dealing with missing values

In [11]:
# Count the NaN values in each column of the DataFrame (one total per column).
data.isna().sum()

SEQN           0
ACD011A     4416
ACD011B    10159
ACD011C    10004
ACD040      7801
ACD110      9168
ALQ101      4754
ALQ110      8544
ALQ120Q     5696
ALQ120U     6582
ALQ130      6579
ALQ141Q     6580
ALQ141U     8711
ALQ151      5698
ALQ160      8309
BPQ020      3711
BPQ030      8001
BPD035      8011
BPQ040A     8001
BPQ050A     8360
BPQ056      3711
BPD058      8596
BPQ059      3711
BPQ080      3711
BPQ060      5748
BPQ070      5555
BPQ090D     5555
BPQ100D     8725
CBD070       123
CBD090       132
           ...  
WHQ070      4501
WHD080A     8482
WHD080B     9292
WHD080C     9373
WHD080D     8345
WHD080E     9776
WHD080F     9942
WHD080G    10060
WHD080H    10061
WHD080I    10091
WHD080J     9975
WHD080K    10149
WHD080M     9142
WHD080N    10000
WHD080O     9486
WHD080P    10156
WHD080Q     9054
WHD080R     9153
WHD080S     9232
WHD080T     9179
WHD080U    10160
WHD080L    10146
WHD110      5996
WHD120      5151
WHD130      7410
WHD140      4072
WHQ150      4155
WHQ030M     8697

In [13]:
# Number of NaN entries in a single column.
whd120_is_missing = data['WHD120'].isna()
whd120_is_missing.sum()

5151

Possible actions for missing values:
    * Remove entire rows that contain NaN
    * Replace NaN with a value of our choice
    * Replace specific values with NaN

### Completely remove rows with NaNs

In [15]:
# Work on a fresh copy of the file so that `data` is left untouched.
data1 = pd.read_csv("national-health-and-nutrition-examination-survey/questionnaire.csv")
print(data1.shape)
# dropna() removes every row that has at least one NaN in any column.
# Compare the two printed shapes to see how many rows survive.
data1 = data1.dropna()
print(data1.shape)

(10175, 953)
(0, 953)


In [17]:
# Re-check `data`: the rows were dropped from the separately loaded `data1`,
# so the per-column NaN counts here are unchanged.
data.isna().sum()

SEQN           0
ACD011A     4416
ACD011B    10159
ACD011C    10004
ACD040      7801
ACD110      9168
ALQ101      4754
ALQ110      8544
ALQ120Q     5696
ALQ120U     6582
ALQ130      6579
ALQ141Q     6580
ALQ141U     8711
ALQ151      5698
ALQ160      8309
BPQ020      3711
BPQ030      8001
BPD035      8011
BPQ040A     8001
BPQ050A     8360
BPQ056      3711
BPD058      8596
BPQ059      3711
BPQ080      3711
BPQ060      5748
BPQ070      5555
BPQ090D     5555
BPQ100D     8725
CBD070       123
CBD090       132
           ...  
WHQ070      4501
WHD080A     8482
WHD080B     9292
WHD080C     9373
WHD080D     8345
WHD080E     9776
WHD080F     9942
WHD080G    10060
WHD080H    10061
WHD080I    10091
WHD080J     9975
WHD080K    10149
WHD080M     9142
WHD080N    10000
WHD080O     9486
WHD080P    10156
WHD080Q     9054
WHD080R     9153
WHD080S     9232
WHD080T     9179
WHD080U    10160
WHD080L    10146
WHD110      5996
WHD120      5151
WHD130      7410
WHD140      4072
WHQ150      4155
WHQ030M     8697

In [24]:
# Drop only the rows where one specific column is missing, instead of rows with
# a NaN anywhere.
data2 = pd.read_csv("national-health-and-nutrition-examination-survey/questionnaire.csv")
print(data2.shape)
has_whd120 = data2['WHD120'].notna()  # True where WHD120 holds a value
data2 = data2[has_whd120]
print(data2.shape)

(10175, 953)
(5024, 953)


### Replace NaN with a value of our choice

In [33]:
# Summarise OSQ140Q before filling in any missing values.
# A notebook cell only displays its last expression, so the summary must be that
# expression; a trailing bare `data['OSQ140Q']` would discard the summary and show
# the raw column instead.
data['OSQ140Q'].describe()

count    220.000000
mean      63.422727
std      235.065829
min        1.000000
25%        1.000000
50%        3.000000
75%        6.000000
max      999.000000
Name: OSQ140Q, dtype: float64

In [37]:
# Replace the missing OSQ140Q entries with 0, then summarise the column again.
# The filled column is assigned back rather than calling fillna(..., inplace=True)
# on data['OSQ140Q']: an in-place call on a selected column is chained assignment,
# which raises a FutureWarning in pandas 2.x and does not update `data` once
# Copy-on-Write is enabled.
data['OSQ140Q'] = data['OSQ140Q'].fillna(0)
# `count` now equals the number of rows, and the added zeros pull the mean and the
# quartiles down towards 0.
data['OSQ140Q'].describe()

count    10175.000000
mean         1.371302
std         35.700267
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max        999.000000
Name: OSQ140Q, dtype: float64

### Replace specific values with NaN

In [41]:
import numpy as np

# Load a fresh copy so the replacement below does not affect `data`.
data3 = pd.read_csv("national-health-and-nutrition-examination-survey/questionnaire.csv")

# Turn every 0 in ACD011A into NaN.
# - `np.nan` is the supported spelling; the `np.NaN` alias was removed in NumPy 2.0.
# - The result is assigned back instead of using replace(..., inplace=True) on the
#   selected column, which is chained assignment and does not update `data3` under
#   pandas Copy-on-Write.
data3['ACD011A'] = data3['ACD011A'].replace(0, np.nan)

# ACD011A has a minimum of 1.0 in the summary, i.e. it holds no zeros, so these
# statistics match the ones computed before the replacement.
data3.describe()

Unnamed: 0,SEQN,ACD011A,ACD011B,ACD011C,ACD040,ACD110,ALQ101,ALQ110,ALQ120Q,ALQ120U,...,WHD080U,WHD080L,WHD110,WHD120,WHD130,WHD140,WHQ150,WHQ030M,WHQ500,WHQ520
count,10175.0,5759.0,16.0,171.0,2374.0,1007.0,5421.0,1631.0,4479.0,3593.0,...,15.0,29.0,4179.0,5024.0,2765.0,6103.0,6020.0,1478.0,1478.0,1478.0
mean,78644.0,1.0,8.0,9.0,3.101095,2.956306,1.311197,1.594727,4.70931,1.921514,...,35.0,40.0,412.158411,568.528463,377.322966,315.623464,571.309635,2.583221,2.299729,1.746955
std,2937.413829,0.0,0.0,0.0,1.511821,1.733794,0.545023,0.615303,34.428362,0.853701,...,0.0,0.0,1508.005974,1977.085854,1726.34474,1077.06875,7269.067701,0.784612,1.213281,0.70943
min,73557.0,1.0,8.0,9.0,1.0,1.0,1.0,1.0,0.0,1.0,...,35.0,40.0,75.0,55.0,50.0,85.0,10.0,1.0,1.0,1.0
25%,76100.5,1.0,8.0,9.0,2.0,1.0,1.0,1.0,1.0,1.0,...,35.0,40.0,140.0,125.0,63.0,155.0,25.0,3.0,1.0,1.0
50%,78644.0,1.0,8.0,9.0,3.0,3.0,1.0,2.0,2.0,2.0,...,35.0,40.0,165.0,150.0,66.0,185.0,38.0,3.0,2.0,2.0
75%,81187.5,1.0,8.0,9.0,4.0,5.0,2.0,2.0,4.0,3.0,...,35.0,40.0,198.0,180.0,70.0,224.0,53.0,3.0,3.0,2.0
max,83731.0,1.0,8.0,9.0,9.0,5.0,9.0,9.0,999.0,3.0,...,35.0,40.0,9999.0,9999.0,9999.0,9999.0,99999.0,9.0,9.0,9.0


### Categorical variables

In [51]:
# List the distinct values that occur in column 'BPQ090D' (NaN is reported as a value too).
data3['BPQ090D'].unique()

array([ 1., nan,  2.,  9.])

In [56]:
# Group the rows by each (BPQ090D, BPQ070) combination and compute summary
# statistics for every remaining column within each group.
group_keys = ['BPQ090D', 'BPQ070']
grouped_data = data3.groupby(group_keys)
grouped_data.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,SEQN,SEQN,SEQN,SEQN,SEQN,SEQN,SEQN,SEQN,ACD011A,ACD011A,...,WHQ500,WHQ500,WHQ520,WHQ520,WHQ520,WHQ520,WHQ520,WHQ520,WHQ520,WHQ520
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
BPQ090D,BPQ070,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1.0,1.0,1202.0,78672.460067,2977.649454,73557.0,76134.5,78715.0,81341.25,83723.0,860.0,1.0,...,,,0.0,,,,,,,
1.0,2.0,149.0,78670.630872,2935.447782,73562.0,76147.0,78591.0,81282.0,83590.0,101.0,1.0,...,,,0.0,,,,,,,
1.0,3.0,55.0,78692.290909,2906.4833,73858.0,76024.5,78337.0,80727.0,83676.0,32.0,1.0,...,,,0.0,,,,,,,
1.0,4.0,35.0,78141.657143,2388.444817,73779.0,76863.0,78268.0,79623.0,82709.0,21.0,1.0,...,,,0.0,,,,,,,
1.0,9.0,9.0,79816.222222,3483.282854,73693.0,76586.0,81143.0,82771.0,82967.0,6.0,1.0,...,,,0.0,,,,,,,
2.0,1.0,1793.0,78679.393196,2876.497496,73561.0,76201.0,78852.0,81103.0,83729.0,1167.0,1.0,...,,,0.0,,,,,,,
2.0,2.0,666.0,78760.90991,2929.527756,73564.0,76265.25,78856.0,81290.75,83702.0,437.0,1.0,...,,,0.0,,,,,,,
2.0,3.0,480.0,78658.1375,2932.49036,73565.0,76045.25,78687.5,81083.75,83727.0,298.0,1.0,...,,,0.0,,,,,,,
2.0,4.0,204.0,78724.568627,2883.295361,73567.0,76174.5,78526.5,81168.25,83664.0,128.0,1.0,...,,,0.0,,,,,,,
2.0,9.0,20.0,80283.1,3159.02803,73657.0,78352.5,80682.5,82873.5,83689.0,12.0,1.0,...,,,0.0,,,,,,,


In [57]:
# For each (BPQ090D, BPQ070) group, count the non-NaN ACD011A entries.
acd011a_by_group = data3.groupby(['BPQ090D', 'BPQ070'])['ACD011A']
acd011a_by_group.count()

BPQ090D  BPQ070
1.0      1.0        860
         2.0        101
         3.0         32
         4.0         21
         9.0          6
2.0      1.0       1167
         2.0        437
         3.0        298
         4.0        128
         9.0         12
9.0      1.0          2
         2.0          2
         4.0          0
         9.0          1
Name: ACD011A, dtype: int64

0        1.0
1        1.0
2        1.0
3        1.0
4        1.0
5        NaN
6        NaN
7        1.0
8        NaN
9        1.0
10       1.0
11       1.0
12       NaN
13       1.0
14       1.0
15       1.0
16       1.0
17       NaN
18       NaN
19       1.0
20       NaN
21       NaN
22       1.0
23       1.0
24       NaN
25       1.0
26       1.0
27       1.0
28       NaN
29       NaN
        ... 
10145    1.0
10146    NaN
10147    1.0
10148    NaN
10149    1.0
10150    NaN
10151    1.0
10152    1.0
10153    NaN
10154    1.0
10155    1.0
10156    NaN
10157    1.0
10158    1.0
10159    1.0
10160    NaN
10161    1.0
10162    NaN
10163    1.0
10164    1.0
10165    NaN
10166    NaN
10167    1.0
10168    NaN
10169    NaN
10170    NaN
10171    NaN
10172    1.0
10173    NaN
10174    NaN
Name: ACD011A, Length: 10175, dtype: float64