In [1]:
#import libraries

import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from IPython.display import display
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from collections import Counter
import matplotlib.pyplot as plt
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import ClusterCentroids, RandomUnderSampler
from sklearn.metrics import recall_score
%matplotlib inline

In [2]:
# Load the training and test splits from the local data directory
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv')
)

In [3]:
# Numeric feature columns: these are the columns scale_numeric_columns
# rescales and group_low_frequency_categories leaves out of the grouping.
continuous_variables = [
    'age',
    'wage_per_hour',
    'capital_gains',
    'capital_losses',
    'dividend_from_Stocks',
    'weeks_worked_in_year',
]

# Define functions 
def summary(df):
    """
    Prints the row count, column count and column names of a frame,
    then displays its first rows

    Arguments: df - pd.DataFrame

    Returns: None

    """

    column_names = df.columns.values
    report_lines = [
        "number of rows: %s" % len(df.index),
        "number of columns: %s" % len(column_names),
        "List of columns %s" % column_names,
    ]
    # One print call, one line per entry: same stdout as three prints
    print("\n".join(report_lines))
    display(df.head())
    
def convert_income_level(df):
    """
    Converts the income_level column to a 0/1 label

    Each value is cast to a string and trimmed of surrounding whitespace;
    values equal to '-50000' become 0 and every other value becomes 1.
    Missing values therefore also map to 1, because they never equal
    '-50000' once cast to a string.

    Arguments: df - pd.DataFrame containing an 'income_level' column

    Returns: x - pd.DataFrame (a modified copy; df itself is not changed)

    Raises: KeyError if df has no 'income_level' column

    """

    # Copy first: a plain `x = df` would only alias the caller's frame and
    # the assignment below would silently rewrite it
    x = df.copy()

    # Strip so that raw labels such as ' -50000' are still recognised when
    # this runs before any string clean-up
    labels = x['income_level'].astype(str).str.strip()
    x['income_level'] = np.where(labels == '-50000', 0, 1)

    return x

def remove_columns_with_null_values(df,threshold):
    """
    Drops sparsely populated columns and fills the remaining gaps

    A column is kept only when at least `threshold` (a fraction between
    0 and 1) of its rows are non-null, e.g. 0.95 keeps the columns that
    are at least 95% populated. Missing values left in the surviving
    columns are replaced with the string 'null_value'.

    Arguments: df:pd.DataFrame, threshold:float

    Returns: x:pd.DataFrame (a new frame; df is not modified)

    """

    # dropna takes the required number of non-null values as a count.
    # Rounding the fractional product up keeps exactly the same columns as
    # comparing the (integer) non-null count against the raw product.
    min_non_null = int(np.ceil(threshold * len(df)))
    x = df.dropna(thresh=min_non_null, axis=1)

    # fillna targets missing values directly; no regex matching is involved
    x = x.fillna('null_value')

    return x

def group_low_frequency_categories(df,threshold,continuous_columns=None):
    """
    Groups low frequency categories into a single 'Other' value

    Every column not listed as continuous is treated as categorical.
    Within each such column a value is replaced by 'Other' when it occurs
    in fewer than threshold * (number of non-null rows of that column)
    rows. Missing values are left as they are.

    NOTE: integer-coded columns, and the target column if it is present,
    are processed like any other non-continuous column.

    Arguments: df:pd.DataFrame,
               threshold:float (fraction between 0 and 1),
               continuous_columns: optional iterable of column names to
               leave untouched; defaults to the module-level
               continuous_variables list

    Returns: x:pd.DataFrame (a new frame; df is not modified)

    """

    if continuous_columns is None:
        continuous_columns = continuous_variables
    skipped = set(continuous_columns)

    # Copy first: a plain `x = df` would only alias the caller's frame
    x = df.copy()

    for col in x.columns:
        if col in skipped:
            continue
        series = x[col]
        counts = series.value_counts()
        cutoff = series.count() * threshold
        # Look up each row's own value frequency; missing values get NaN,
        # which compares False and so is never grouped
        rare = series.map(counts) < cutoff
        # Only touch columns that really change, so untouched columns keep
        # their original dtype
        if rare.any():
            x[col] = series.mask(rare, 'Other')

    return x

def clean_string_values(df):
    """
    Strips surrounding whitespace from, and lower-cases, every string
    value held in an object-dtype column

    Non-string entries in those columns (numbers, missing values) are
    passed through unchanged. Whitespace inside a value is not altered.

    Arguments: df - pd.DataFrame

    Returns: x - pd.DataFrame (a modified copy; df itself is not changed)

    """

    def _normalise(value):
        # Only real strings are cleaned; going through the .str accessor
        # instead would turn every non-string entry into NaN
        if isinstance(value, str):
            return value.strip().lower()
        return value

    # Copy first: a plain `x = df` would only alias the caller's frame
    x = df.copy()

    for col in x.columns.values:
        if x[col].dtypes == 'object':
            x[col] = x[col].map(_normalise)

    return x
        

def scale_numeric_columns(df,continuous_columns=None):
    """
    Min-max scales the continuous columns (used to prepare data for PCA)

    The scaler is fitted on the frame that is passed in, so the scaled
    values depend on the rows present in df. All other columns are
    returned unchanged.

    Arguments: df - pd.DataFrame,
               continuous_columns - optional iterable of column names to
               scale; defaults to the module-level continuous_variables

    Returns: x - pd.DataFrame (a modified copy; df itself is not changed)

    Raises: KeyError if a requested column is missing from df

    """

    if continuous_columns is None:
        continuous_columns = continuous_variables
    columns = list(continuous_columns)

    # Copy first: a plain `x = df` would only alias the caller's frame
    x = df.copy()

    # Nothing to fit on an empty frame or an empty column list
    if x.empty or not columns:
        return x

    scaled = MinMaxScaler().fit_transform(x[columns])

    # Assign the raw arrays positionally. Routing them through a new
    # DataFrame with a default 0..n-1 index would make pandas align on
    # index labels and produce NaN whenever df carries any other index.
    for position, col in enumerate(columns):
        x[col] = scaled[:, position]

    return x

def encode_categorical_columns(df):
    """
    Encodes all categorical (object-dtype) columns as integer codes

    Each object-dtype column is cast to strings and replaced by the codes
    a LabelEncoder assigns to it. Columns of any other dtype are left
    unchanged. The fitted encoders are not kept, so the codes cannot be
    mapped back to the original values afterwards.

    Arguments: df - pd.DataFrame

    Returns: x - pd.DataFrame (a modified copy; df itself is not changed)

    """

    # Copy first: a plain `x = df` would only alias the caller's frame
    x = df.copy()

    for col in x.columns.values:
        if x[col].dtypes == 'object':
            # Cast to str so mixed-type columns (e.g. ints next to 'Other')
            # can be encoded; a fresh encoder is fitted per column
            as_text = x[col].values.astype(str)
            x[col] = LabelEncoder().fit_transform(as_text)

    return x

def list_unique_values_for_columns(df):
    """
    Prints, for every object-dtype column, its name and its distinct values

    Arguments: df - pd.DataFrame

    Returns: None

    """

    # Pick out the object-dtype columns first, then report on each one
    object_columns = [
        name for name in df.columns.values
        if df[name].dtypes == 'object'
    ]

    for name in object_columns:
        print("column: %s" % name)
        print("list of unique values: %s" % df[name].unique())


In [4]:
# Stack train and test into one frame so that low-frequency categories are
# grouped on the combined data; grouping each split on its own could fold a
# category into 'Other' in one split but not in the other (see output below).
# ignore_index gives the stacked frame a fresh 0..n-1 index.
group = pd.concat([df_train, df_test], axis=0, ignore_index=True)
list_unique_values_for_columns(group)

column: class_of_worker
list of unique values: ['Not in universe' 'Self-employed-not incorporated' 'Private'
 'Local government' 'Federal government' 'Self-employed-incorporated'
 'State government' 'Never worked' 'Without pay' ' Private'
 ' Self-employed-not incorporated' ' Not in universe' ' Local government'
 ' State government' ' Never worked' ' Self-employed-incorporated'
 ' Federal government' ' Without pay']
column: education
list of unique values: ['High school graduate' 'Some college but no degree' '10th grade'
 'Children' 'Bachelors degree(BA AB BS)'
 'Masters degree(MA MS MEng MEd MSW MBA)' 'Less than 1st grade'
 'Associates degree-academic program' '7th and 8th grade'
 '12th grade no diploma' 'Associates degree-occup /vocational'
 'Prof school degree (MD DDS DVM LLB JD)' '5th or 6th grade' '11th grade'
 'Doctorate degree(PhD EdD)' '9th grade' '1st 2nd 3rd or 4th grade'
 ' 1st 2nd 3rd or 4th grade' ' Associates degree-occup /vocational'
 ' Children' ' High school graduate'
 

In [5]:
# Preview the first rows of the training split as loaded from disk
df_train.head()

Unnamed: 0,age,class_of_worker,industry_code,occupation_code,education,wage_per_hour,enrolled_in_edu_inst_lastwk,marital_status,major_industry_code,major_occupation_code,...,country_father,country_mother,country_self,citizenship,business_or_self_employed,fill_questionnaire_veteran_admin,veterans_benefits,weeks_worked_in_year,year,income_level
0,73,Not in universe,0,0,High school graduate,0,Not in universe,Widowed,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,0,95,-50000
1,58,Self-employed-not incorporated,4,34,Some college but no degree,0,Not in universe,Divorced,Construction,Precision production craft & repair,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,94,-50000
2,18,Not in universe,0,0,10th grade,0,High school,Never married,Not in universe or children,Not in universe,...,Vietnam,Vietnam,Vietnam,Foreign born- Not a citizen of U S,0,Not in universe,2,0,95,-50000
3,9,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,-50000
4,10,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,-50000


In [6]:
# Cleaning steps applied, in order, to the combined train+test frame:
# binarise the target, normalise string values, min-max scale the continuous
# columns, drop columns that are less than 95% populated (filling remaining
# gaps), then fold categories seen in under 5% of rows into 'Other'.
# NOTE(review): scaling and grouping are computed over the test rows too.
group = (
    group
    .pipe(convert_income_level)
    .pipe(clean_string_values)
    .pipe(scale_numeric_columns)
    .pipe(remove_columns_with_null_values, 0.95)
    .pipe(group_low_frequency_categories, 0.05)
)
list_unique_values_for_columns(group)

column: class_of_worker
list of unique values: ['not in universe' 'Other' 'private']
column: industry_code
list of unique values: [0 'Other' 33]
column: occupation_code
list of unique values: [0 'Other']
column: education
list of unique values: ['high school graduate' 'some college but no degree' 'Other' 'children'
 'bachelors degree(ba ab bs)']
column: enrolled_in_edu_inst_lastwk
list of unique values: ['not in universe' 'Other']
column: marital_status
list of unique values: ['widowed' 'divorced' 'never married' 'married-civilian spouse present'
 'Other']
column: major_industry_code
list of unique values: ['not in universe or children' 'Other' 'retail trade']
column: major_occupation_code
list of unique values: ['not in universe' 'precision production craft & repair'
 'professional specialty' 'executive admin and managerial' 'Other'
 'adm support including clerical' 'other service' 'sales']
column: race
list of unique values: ['white' 'Other' 'black']
column: hispanic_origin
list of u

In [7]:
# Preview the combined frame after cleaning, scaling and category grouping
group.head()

Unnamed: 0,age,class_of_worker,industry_code,occupation_code,education,wage_per_hour,enrolled_in_edu_inst_lastwk,marital_status,major_industry_code,major_occupation_code,...,country_father,country_mother,country_self,citizenship,business_or_self_employed,fill_questionnaire_veteran_admin,veterans_benefits,weeks_worked_in_year,year,income_level
0,0.811111,not in universe,0,0,high school graduate,0.0,not in universe,widowed,not in universe or children,not in universe,...,united-states,united-states,united-states,native- born in the united states,0,not in universe,2,0.0,95,0
1,0.644444,Other,Other,Other,some college but no degree,0.0,not in universe,divorced,Other,precision production craft & repair,...,united-states,united-states,united-states,native- born in the united states,0,not in universe,2,1.0,94,0
2,0.2,not in universe,0,0,Other,0.0,Other,never married,not in universe or children,not in universe,...,Other,Other,Other,foreign born- not a citizen of u s,0,not in universe,2,0.0,95,0
3,0.1,not in universe,0,0,children,0.0,not in universe,never married,not in universe or children,not in universe,...,united-states,united-states,united-states,native- born in the united states,0,not in universe,0,0.0,94,0
4,0.111111,not in universe,0,0,children,0.0,not in universe,never married,not in universe or children,not in universe,...,united-states,united-states,united-states,native- born in the united states,0,not in universe,0,0.0,94,0


In [8]:
# Class balance of the 0/1 target across train and test combined
print(Counter(group['income_level']))

Counter({0: 280717, 1: 18568})


In [9]:
# Replace every remaining object-dtype column with integer label codes
group = encode_categorical_columns(group)

In [10]:
# Preview the combined frame after label encoding
group.head()

Unnamed: 0,age,class_of_worker,industry_code,occupation_code,education,wage_per_hour,enrolled_in_edu_inst_lastwk,marital_status,major_industry_code,major_occupation_code,...,country_father,country_mother,country_self,citizenship,business_or_self_employed,fill_questionnaire_veteran_admin,veterans_benefits,weeks_worked_in_year,year,income_level
0,0.811111,1,0,0,3,0.0,1,4,1,3,...,2,1,1,2,0,1,1,0.0,95,0
1,0.644444,0,2,1,4,0.0,1,1,0,5,...,2,1,1,2,0,1,1,1.0,94,0
2,0.2,1,0,0,0,0.0,0,3,1,3,...,0,0,0,1,0,1,1,0.0,95,0
3,0.1,1,0,0,2,0.0,1,3,1,3,...,2,1,1,2,0,1,0,0.0,94,0
4,0.111111,1,0,0,2,0.0,1,3,1,3,...,2,1,1,2,0,1,0,0.0,94,0


In [11]:
# Undo the earlier stacking: the first len(df_train) rows of the combined
# frame are the training split, the rest are the test split
train_length = len(df_train.index)

# Both splits stay empty unless the combined frame has rows beyond the
# training rows
train, test = pd.DataFrame(), pd.DataFrame()
if len(group.index) > train_length:
    train, test = group[:train_length], group[train_length:]

for split in (train, test):
    print(len(split.index))

199523
99762


In [12]:
# Separate the feature matrix from the income_level target for each split
X_train = train.drop('income_level', axis=1)
y_train = train['income_level']
X_test = test.drop('income_level', axis=1)
y_test = test['income_level']

In [13]:
# Baseline model: random forest fitted on the training split as-is
# (no resampling of the imbalanced target)
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=10,
    n_jobs=-1,
)
rf.fit(X_train, y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False, random_state=10,
            verbose=0, warm_start=False)

In [14]:
# Baseline test-split metrics: the classifier's score (accuracy) first,
# then recall for the income_level == 1 class
print(rf.score(X_test,y_test))
print(recall_score(y_test,rf.predict(X_test)))

0.951203865199
0.415130940834


In [15]:
# Oversample the training split with SMOTE, refit the same forest on the
# resampled data, and re-score on the untouched test split.
# NOTE(review): the categorical features are label-encoded integers by this
# point and SMOTE receives them as plain numbers — confirm this is intended.
# NOTE(review): `ratio=` / `fit_sample` are the older imbalanced-learn
# spellings — TODO confirm they exist in the installed version.
X_resampled, y_resampled = SMOTE(random_state=0,ratio=1.0).fit_sample(X_train, y_train)

# Refitting reuses the rf object, so the baseline fit is no longer available
rf.fit(X_resampled,y_resampled)
print(rf.score(X_test,y_test))
print(recall_score(y_test,rf.predict(X_test)))

0.937511276839
0.541383769803


In [16]:
# Randomly undersample the training split, refit the same forest on the
# reduced sample, and re-score on the full test split
rus = RandomUnderSampler(random_state=0)
X_usampled,y_usampled = rus.fit_sample(X_train,y_train)
# Refitting reuses the rf object; later cells see this under-sampled fit
rf.fit(X_usampled,y_usampled)
print(rf.score(X_test,y_test))
print(recall_score(y_test,rf.predict(X_test)))

0.842204446583
0.887487875849


In [17]:
# Restrict the test split to its positive rows (income_level == 1) and
# separate features from target for that subset
imbalance_test = test.loc[test['income_level'] == 1]
print(len(imbalance_test.index))
X_itest = imbalance_test.drop('income_level', axis=1)
y_itest = imbalance_test['income_level']

6186


In [18]:
# Score the current rf on the positive-only test rows. Every true label in
# this subset is 1, so the score printed here matches the recall printed
# for the under-sampled model.
# NOTE(review): rf is whichever fit ran last — the under-sampled model when
# the cells are run top to bottom.
prediction = rf.predict(X_itest)
print(rf.score(X_itest,y_itest))
# NOTE(review): pred is not used by any cell visible here
pred = pd.DataFrame(prediction)
# Count of predicted labels: 1 = correctly recovered positives, 0 = misses
print(Counter(prediction))

0.887487875849
Counter({1: 5490, 0: 696})


In [19]:
# Grid-search the forest's min_samples_leaf / min_samples_split settings.
# NOTE(review): no `scoring` is passed, so the search presumably ranks
# candidates by the estimator's default score (accuracy) rather than recall,
# and it is fitted on the original imbalanced training split — confirm.
params = {
    'min_samples_leaf':range(1,3),
    'min_samples_split':range(2,10,2)
    
}

grid_search = GridSearchCV(rf,param_grid=params)
grid_search.fit(X_train,y_train)

# Best cross-validated score and the parameter combination that achieved it
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

Best score: 0.9534890714353733
Best parameters: {'min_samples_leaf': 2, 'min_samples_split': 6}
