In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import requests
import numpy as np

In [3]:
# Load the student records; the 'ID' column becomes the row index.
df = pd.read_csv('mock_student_data.csv', index_col='ID')

The first task is to load the file and generate summary statistics for each field as well as probability distributions or histograms. The summary statistics should include mean, median, mode, standard deviation, as well as the number of missing values for each field.


In [58]:
# Numeric columns whose missing values are imputed later in the notebook.
cols = [
    'Age',
    'GPA',
    'Days_missed',
]

In [59]:
# Count, mean, standard deviation, min/max and quartiles of the numeric columns
# (count excludes missing values, so it differs per column).
df.describe()

Unnamed: 0,Age,GPA,Days_missed
count,771.0,779.0,808.0
mean,16.996109,2.988447,18.011139
std,1.458067,0.818249,9.629371
min,15.0,2.0,2.0
25%,16.0,2.0,9.0
50%,17.0,3.0,18.0
75%,18.0,4.0,27.0
max,19.0,4.0,34.0


In [4]:
# Median of each numeric column. numeric_only=True is required because the
# frame also holds text columns (names, state, gender, ...), which newer
# pandas versions no longer drop silently and would reject with a TypeError.
df.median(numeric_only=True)

Age            17
GPA             3
Days_missed    18
dtype: float64

In [60]:
# Most frequent value(s) of every column. A column with several tied modes
# (Days_missed in the output below) produces extra rows; the remaining
# columns are padded with NaN in those rows.
df.mode()

Unnamed: 0,First_name,Last_name,State,Gender,Age,GPA,Days_missed,Graduated
0,Amy,Ross,Texas,Female,15.0,2.0,6,Yes
1,,,,,,,14,
2,,,,,,,31,


In [61]:
# Number of missing values in each column.
df.isna().sum()

First_name       0
Last_name        0
State          116
Gender         226
Age            229
GPA            221
Days_missed    192
Graduated        0
dtype: int64

In [62]:
# Draw one histogram per numeric column, then write the resulting figure to disk.
df.hist()
fig = plt.gcf()  # the figure that df.hist() just created
fig.savefig('histograms')

You will notice that many students are missing a gender value. Your task is to infer each student's gender from their first name. Please use the API at www.genderize.io to infer the gender of each student and generate a new data file.

In [63]:
def genderize(name, timeout=10):
    '''
    Call the Genderize.io API for one or more first names.

    Parameters
    ----------
    name : str or list of str
        Name(s) to look up; sent as the ``name[]`` query parameter.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Without a timeout a stalled connection would block the notebook
        indefinitely.

    Returns
    -------
    The decoded JSON body of the response. The caller in this notebook
    iterates over it as a sequence of records with 'name' and 'gender' keys.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status (4xx/5xx), instead of
        handing an error payload back to the caller as if it were results.
    '''
    args = {'name[]': name}
    r = requests.get('https://api.genderize.io/', params=args, timeout=timeout)
    r.raise_for_status()
    return r.json()

In [None]:
# Get a list of the unique names from the dataset and create a dictionary
# mapping names to predicted gender using the Genderize API. Names are sent
# 10 at a time since that is the max allowed by the API.
names = list(df['First_name'].unique())
name_dict = {}

# Step through the list in fixed-size slices. Slicing never runs past the end
# of the list (the final batch is simply shorter), and the loop is bounded by
# the number of names rather than by how many keys the API happens to return.
for start in range(0, len(names), 10):
    to_try = names[start:start + 10]
    response = genderize(to_try)
    for r in response:
        name_dict[r['name']] = r['gender']
    # Guarantee an entry for every requested name, even if the API did not
    # echo it back under exactly the same key; such names map to None.
    for requested in to_try:
        name_dict.setdefault(requested, None)


In [51]:
name_dict

{'Aaron': 'male',
 'Adam': 'male',
 'Alan': 'male',
 'Albert': 'male',
 'Alice': 'female',
 'Amanda': 'female',
 'Amy': 'female',
 'Andrea': 'female',
 'Andrew': 'male',
 'Angela': 'female',
 'Ann': 'female',
 'Anna': 'female',
 'Anne': 'female',
 'Annie': 'female',
 'Anthony': 'male',
 'Antonio': 'male',
 'Arthur': 'male',
 'Ashley': 'female',
 'Barbara': 'female',
 'Benjamin': 'male',
 'Betty': 'female',
 'Beverly': 'female',
 'Billy': 'male',
 'Bobby': 'male',
 'Bonnie': 'female',
 'Brandon': 'male',
 'Brenda': 'female',
 'Brian': 'male',
 'Bruce': 'male',
 'Carl': 'male',
 'Carlos': 'male',
 'Carol': 'female',
 'Carolyn': 'female',
 'Catherine': 'female',
 'Charles': 'male',
 'Cheryl': 'female',
 'Chris': 'male',
 'Christina': 'female',
 'Christine': 'female',
 'Christopher': 'male',
 'Clarence': 'male',
 'Craig': 'male',
 'Cynthia': 'female',
 'Daniel': 'male',
 'David': 'male',
 'Deborah': 'female',
 'Debra': 'female',
 'Denise': 'female',
 'Dennis': 'male',
 'Diana': 'female',
 

In [41]:
def get_gender(x):
    '''
    Look up the predicted gender for first name ``x`` in ``name_dict``.

    Raises KeyError if ``x`` has no entry in ``name_dict``.
    '''
    predicted = name_dict[x]
    return predicted

In [53]:
# Fill missing gender with the gender predicted from the student's first name.
df_genderized = df.copy()

# The API labels are lowercase ('male' / 'female') while the dataset uses
# capitalised labels ('Female'), so normalise the case before filling to
# avoid ending up with two spellings of the same category.
predicted_gender = df_genderized['First_name'].apply(get_gender).str.capitalize()

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the in-place form acts on a temporary object and is not
# guaranteed to modify df_genderized under pandas' Copy-on-Write behaviour.
df_genderized['Gender'] = df_genderized['Gender'].fillna(predicted_gender)

In [54]:
# Write the gender-completed data to a new CSV file (the file name has no extension).
df_genderized.to_csv('genderized_data')

You will also notice that some of the other attributes are missing. Your task is to fill in the missing values for Age, GPA, and Days_missed using the following approaches:
- Fill in missing values with the mean of the values for that attribute.
- Fill in missing values with a class-conditional mean (where the class is whether they graduated or not).
- Is there a better, more appropriate method for filling in the missing values? If yes, describe and implement it.

In [83]:
# Approach A: replace every missing Age / GPA / Days_missed value with the
# overall mean of its column.
df_a = df.copy()

# fillna with a Series of column means fills each column with its own mean.
# The result is assigned back rather than using fillna(inplace=True) on a
# column selection, which operates on a temporary object and is not
# guaranteed to update df_a under pandas' Copy-on-Write behaviour.
df_a[cols] = df_a[cols].fillna(df_a[cols].mean())

df_a.to_csv('approach_a')

In [91]:
# Approach B: replace every missing Age / GPA / Days_missed value with the
# mean of that column among students with the same 'Graduated' value.
df_b = df.copy()

# transform("mean") returns a frame aligned row-for-row with df_b in which
# each cell holds the mean of its column within the row's Graduated group.
class_means = df_b.groupby("Graduated")[cols].transform("mean")

# Assign the filled columns back rather than using fillna(inplace=True) on a
# column selection, which operates on a temporary object and is not
# guaranteed to update df_b under pandas' Copy-on-Write behaviour.
df_b[cols] = df_b[cols].fillna(class_means)

df_b.to_csv('approach_b')

An alternative way to fill in the missing values would be to use linear regression. We could train a model on the complete observations and then use it to predict the missing values of the incomplete ones. However, this is problematic when an observation is missing several values at once, because the predictors the model needs are then themselves missing. In that case, we could use the class-conditional mean as a fallback method.
