# Lab practice: Essential functionality in Pandas

In [1]:
import pandas as pd
import numpy as np

## Part I: Creating dataframes from generated data

### Problem 1: Creating a dataframe from a sequence

In [4]:
# SOLUTION

# Lay the integers 0..99 out row by row in a 10x10 grid and label the
# columns col_1 .. col_10; the row index is left as the default 0..9.
column_labels = [f'col_{i}' for i in range(1, 11)]
df_seq = pd.DataFrame(np.arange(100).reshape(10, 10), columns=column_labels)
del column_labels  # keep the notebook namespace unchanged

df_seq

Unnamed: 0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10
0,0,1,2,3,4,5,6,7,8,9
1,10,11,12,13,14,15,16,17,18,19
2,20,21,22,23,24,25,26,27,28,29
3,30,31,32,33,34,35,36,37,38,39
4,40,41,42,43,44,45,46,47,48,49
5,50,51,52,53,54,55,56,57,58,59
6,60,61,62,63,64,65,66,67,68,69
7,70,71,72,73,74,75,76,77,78,79
8,80,81,82,83,84,85,86,87,88,89
9,90,91,92,93,94,95,96,97,98,99


### Problem 2: Creating a dataframe from random integers 

In [7]:
# SOLUTION

# Seed a dedicated generator so the "random" frame -- and every result derived
# from it in later cells -- is identical after Restart & Run All.
rng = np.random.default_rng(42)

# 5 rows (a-e) x 10 columns (col_1 .. col_10) of integers drawn from 1..100;
# `high` is exclusive, hence 101.
df_rand = pd.DataFrame(
    rng.integers(low=1, high=101, size=(5, 10)),
    columns=[f'col_{i}' for i in range(1, 11)],
    index=list('abcde'),
)

df_rand

Unnamed: 0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10
a,21,37,4,21,70,24,89,79,66,92
b,93,99,9,19,35,25,29,86,86,16
c,64,50,27,11,99,18,33,11,79,5
d,66,70,66,80,5,25,14,89,88,98
e,16,53,66,48,26,82,5,59,26,99


### Problem 3: Dropping a column

Create a new DataFrame by dropping the column *col_10* from the random integers DataFrame.

In [9]:
# SOLUTION

# drop() returns a new frame; df_rand itself still has col_10 afterwards.
df_rand.drop(columns='col_10')

Unnamed: 0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9
a,21,37,4,21,70,24,89,79,66
b,93,99,9,19,35,25,29,86,86
c,64,50,27,11,99,18,33,11,79
d,66,70,66,80,5,25,14,89,88
e,16,53,66,48,26,82,5,59,26


### Problems 4-5: Filtering  

In [12]:
# SOLUTION

# Boolean mask on col_1 selects the matching rows; all columns are kept.
df_rand.loc[df_rand['col_1'].gt(50)]

Unnamed: 0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10
b,93,99,9,19,35,25,29,86,86,16
c,64,50,27,11,99,18,33,11,79,5
d,66,70,66,80,5,25,14,89,88,98


Retrieve the last 5 columns of every row whose 'col_6' value is even.

In [15]:
# SOLUTION

# Show the unfiltered frame so the filtered results that follow can be
# checked against it by eye.
df_rand

Unnamed: 0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10
a,21,37,4,21,70,24,89,79,66,92
b,93,99,9,19,35,25,29,86,86,16
c,64,50,27,11,99,18,33,11,79,5
d,66,70,66,80,5,25,14,89,88,98
e,16,53,66,48,26,82,5,59,26,99


In [16]:
# SOLUTION 1

# Slice the last five columns first, then keep the rows whose col_6 is even.
# The mask is built from the full frame; it lines up with the slice because
# both share the same row index.
df_rand.iloc[:, -5:].loc[df_rand['col_6'].mod(2).eq(0)]

Unnamed: 0,col_6,col_7,col_8,col_9,col_10
a,24,89,79,66,92
c,18,33,11,79,5
e,82,5,59,26,99


In [17]:
# SOLUTION 2 (recommended)

# Filter the rows on an even col_6 first, then take the last five columns
# of what is left.
df_rand.loc[df_rand['col_6'].mod(2).eq(0)].iloc[:, -5:]

Unnamed: 0,col_6,col_7,col_8,col_9,col_10
a,24,89,79,66,92
c,18,33,11,79,5
e,82,5,59,26,99


## Part II: Working with tabular data

In [18]:
# Load the raw CSV (path is relative to the notebook's working directory)
# and preview the first five rows.
df_original = pd.read_csv(filepath_or_buffer='data/nyc_deaths.csv')
df_original.head(n=5)

Unnamed: 0,Year,Cause of Death,Sex,Ethnicity,Count,Death Rate,Age Adjusted Death Rate
0,2010,Influenza (Flu) and Pneumonia (J09-J18),F,Hispanic,228,18.7,23.1
1,2008,"Accidents Except Drug Posioning (V01-X39, X43,...",F,Hispanic,68,5.8,6.6
2,2013,"Accidents Except Drug Posioning (V01-X39, X43,...",M,White Non-Hispanic,271,20.1,17.9
3,2010,Cerebrovascular Disease (Stroke: I60-I69),M,Hispanic,140,12.3,21.4
4,2009,"Assault (Homicide: Y87.1, X85-Y09)",M,Black Non-Hispanic,255,30.0,30.0


Number of rows and attributes:

In [19]:
# (number of rows, number of columns) of the frame as loaded from the CSV
df_original.shape

(1094, 7)

### Problem 6: Data cleaning 

In [22]:
# SOLUTION

# Remove the two rate columns. Because this cell rebinds df_original, running
# it a second time would look for columns that are already gone and raise a
# KeyError; errors='ignore' makes the cell safe to re-run.
df_original = df_original.drop(
    columns=['Death Rate', 'Age Adjusted Death Rate'],
    errors='ignore',
)
df_original

Unnamed: 0,Year,Cause of Death,Sex,Ethnicity,Count
0,2010,Influenza (Flu) and Pneumonia (J09-J18),F,Hispanic,228
1,2008,"Accidents Except Drug Posioning (V01-X39, X43,...",F,Hispanic,68
2,2013,"Accidents Except Drug Posioning (V01-X39, X43,...",M,White Non-Hispanic,271
3,2010,Cerebrovascular Disease (Stroke: I60-I69),M,Hispanic,140
4,2009,"Assault (Homicide: Y87.1, X85-Y09)",M,Black Non-Hispanic,255
...,...,...,...,...,...
1089,2012,Influenza (Flu) and Pneumonia (J09-J18),F,Not Stated/Unknown,6
1090,2014,"Accidents Except Drug Posioning (V01-X39, X43,...",F,White Non-Hispanic,169
1091,2009,Malignant Neoplasms (Cancer: C00-C97),M,White Non-Hispanic,3236
1092,2009,"Intentional Self-Harm (Suicide: X60-X84, Y87.0)",M,White Non-Hispanic,191


In [23]:
# SOLUTION

# Keep only the rows whose Count is not the "." placeholder.
df_original = df_original.loc[df_original['Count'].ne(".")]
df_original

Unnamed: 0,Year,Cause of Death,Sex,Ethnicity,Count
0,2010,Influenza (Flu) and Pneumonia (J09-J18),F,Hispanic,228
1,2008,"Accidents Except Drug Posioning (V01-X39, X43,...",F,Hispanic,68
2,2013,"Accidents Except Drug Posioning (V01-X39, X43,...",M,White Non-Hispanic,271
3,2010,Cerebrovascular Disease (Stroke: I60-I69),M,Hispanic,140
4,2009,"Assault (Homicide: Y87.1, X85-Y09)",M,Black Non-Hispanic,255
...,...,...,...,...,...
1089,2012,Influenza (Flu) and Pneumonia (J09-J18),F,Not Stated/Unknown,6
1090,2014,"Accidents Except Drug Posioning (V01-X39, X43,...",F,White Non-Hispanic,169
1091,2009,Malignant Neoplasms (Cancer: C00-C97),M,White Non-Hispanic,3236
1092,2009,"Intentional Self-Harm (Suicide: X60-X84, Y87.0)",M,White Non-Hispanic,191


In [24]:
# SOLUTION

# Work on an independent copy so df_original keeps its unconverted Count column.
df = df_original.copy()

# Assign the whole column instead of writing through df.loc[:, 'Count'].
# On pandas >= 2.0 the .loc[:, col] form sets the values in place and keeps
# the column's existing dtype, so Count would stay non-numeric and be left
# out of numeric summaries such as describe().
df['Count'] = pd.to_numeric(df['Count'])

In [25]:
# SOLUTION

# Display the cleaned frame produced by the conversion step.
df

Unnamed: 0,Year,Cause of Death,Sex,Ethnicity,Count
0,2010,Influenza (Flu) and Pneumonia (J09-J18),F,Hispanic,228
1,2008,"Accidents Except Drug Posioning (V01-X39, X43,...",F,Hispanic,68
2,2013,"Accidents Except Drug Posioning (V01-X39, X43,...",M,White Non-Hispanic,271
3,2010,Cerebrovascular Disease (Stroke: I60-I69),M,Hispanic,140
4,2009,"Assault (Homicide: Y87.1, X85-Y09)",M,Black Non-Hispanic,255
...,...,...,...,...,...
1089,2012,Influenza (Flu) and Pneumonia (J09-J18),F,Not Stated/Unknown,6
1090,2014,"Accidents Except Drug Posioning (V01-X39, X43,...",F,White Non-Hispanic,169
1091,2009,Malignant Neoplasms (Cancer: C00-C97),M,White Non-Hispanic,3236
1092,2009,"Intentional Self-Harm (Suicide: X60-X84, Y87.0)",M,White Non-Hispanic,191


In [26]:
# SOLUTION

# Summary statistics; describe() reports on the numeric columns by default.
df.describe()

Unnamed: 0,Year,Count
count,956.0,956.0
mean,2010.59205,444.558577
std,2.309303,880.107843
min,2007.0,5.0
25%,2009.0,36.0
50%,2011.0,148.5
75%,2013.0,307.25
max,2014.0,7050.0


### Problem 7: How many male records and how many female records are there in the data?

In [28]:
# SOLUTION

# Number of rows recorded with Sex == 'M'.
len(df.loc[df['Sex'] == 'M'])

493

In [29]:
# SOLUTION

# Number of rows recorded with Sex == 'F'.
len(df.loc[df['Sex'] == 'F'])

463

### Problem 8: What ethnicities are included in the data for females?

In [31]:
# SOLUTION

# Select the Ethnicity values of the female rows in a single .loc call, then
# reduce them to the distinct values in order of first appearance.
df.loc[df['Sex'] == 'F', 'Ethnicity'].unique()

array(['Hispanic', 'Asian and Pacific Islander', 'White Non-Hispanic',
       'Black Non-Hispanic', 'Not Stated/Unknown',
       'Other Race/ Ethnicity'], dtype=object)

In [34]:
# SOLUTION

# Total deaths per cause. Only 'Count' is aggregated: summing the whole frame
# would also add up 'Year' (a meaningless total) and, on pandas >= 2.0 where
# numeric_only defaults to False, concatenate the string columns.
cause_of_death = df.groupby("Cause of Death")[["Count"]].sum()

# Three causes with the largest totals, largest first.
cause_of_death.sort_values(by="Count", ascending=False).head(3)

Unnamed: 0_level_0,Count
Cause of Death,Unnamed: 1_level_1
"Diseases of Heart (I00-I09, I11, I13, I20-I51)",147551
Malignant Neoplasms (Cancer: C00-C97),106367
All Other Causes,77999


### Problem 10: Are there differences in the leading causes of death among males and females?

In [38]:
# SOLUTION

# Total male deaths per cause. Only 'Count' is aggregated, so 'Year' is not
# summed and (on pandas >= 2.0) the string columns are not concatenated.
m_df = df[df['Sex'] == 'M'].groupby("Cause of Death")[["Count"]].sum()

# Leave the 'All Other Causes' row out of the ranking.
m_df = m_df.drop('All Other Causes')

# Ten causes with the largest male totals, largest first.
m_leading = m_df.sort_values(by="Count", ascending=False).head(10)
m_leading

Unnamed: 0_level_0,Count
Cause of Death,Unnamed: 1_level_1
"Diseases of Heart (I00-I09, I11, I13, I20-I51)",68799
Malignant Neoplasms (Cancer: C00-C97),52490
Influenza (Flu) and Pneumonia (J09-J18),8898
Diabetes Mellitus (E10-E14),6569
Chronic Lower Respiratory Diseases (J40-J47),5995
Cerebrovascular Disease (Stroke: I60-I69),5360
"Accidents Except Drug Posioning (V01-X39, X43, X45-X59, Y85-Y86)",5191
"Mental and Behavioral Disorders due to Accidental Poisoning and Other Psychoactive Substance Use (F11-F16, F18-F19, X40-X42, X44)",3760
Human Immunodeficiency Virus Disease (HIV: B20-B24),3633
"Essential Hypertension and Renal Diseases (I10, I12)",2645


In [40]:
# SOLUTION

# Total female deaths per cause. Only 'Count' is aggregated, so 'Year' is not
# summed and (on pandas >= 2.0) the string columns are not concatenated.
f_df = df[df['Sex'] == 'F'].groupby("Cause of Death")[["Count"]].sum()

# Leave the 'All Other Causes' row out of the ranking.
f_df = f_df.drop('All Other Causes')

# Ten causes with the largest female totals, largest first.
f_leading = f_df.sort_values(by="Count", ascending=False).head(10)
f_leading

Unnamed: 0_level_0,Count
Cause of Death,Unnamed: 1_level_1
"Diseases of Heart (I00-I09, I11, I13, I20-I51)",78752
Malignant Neoplasms (Cancer: C00-C97),53877
Influenza (Flu) and Pneumonia (J09-J18),9780
Cerebrovascular Disease (Stroke: I60-I69),7581
Diabetes Mellitus (E10-E14),7225
Chronic Lower Respiratory Diseases (J40-J47),7219
"Essential Hypertension and Renal Diseases (I10, I12)",4310
Alzheimer's Disease (G30),2980
"Accidents Except Drug Posioning (V01-X39, X43, X45-X59, Y85-Y86)",2276
Human Immunodeficiency Virus Disease (HIV: B20-B24),1803


In [41]:
# SOLUTION

# Outer-join the two top-10 tables on the cause so that causes present in only
# one ranking are kept; the missing side shows as NaN.
m_leading.merge(
    f_leading,
    how='outer',
    on="Cause of Death",
    suffixes=('_male', '_female'),
)

Unnamed: 0_level_0,Count_male,Count_female
Cause of Death,Unnamed: 1_level_1,Unnamed: 2_level_1
"Diseases of Heart (I00-I09, I11, I13, I20-I51)",68799.0,78752.0
Malignant Neoplasms (Cancer: C00-C97),52490.0,53877.0
Influenza (Flu) and Pneumonia (J09-J18),8898.0,9780.0
Diabetes Mellitus (E10-E14),6569.0,7225.0
Chronic Lower Respiratory Diseases (J40-J47),5995.0,7219.0
Cerebrovascular Disease (Stroke: I60-I69),5360.0,7581.0
"Accidents Except Drug Posioning (V01-X39, X43, X45-X59, Y85-Y86)",5191.0,2276.0
"Mental and Behavioral Disorders due to Accidental Poisoning and Other Psychoactive Substance Use (F11-F16, F18-F19, X40-X42, X44)",3760.0,
Human Immunodeficiency Virus Disease (HIV: B20-B24),3633.0,1803.0
"Essential Hypertension and Renal Diseases (I10, I12)",2645.0,4310.0
