In [40]:
#import libraries

import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from IPython.display import display
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from collections import Counter
import matplotlib.pyplot as plt
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.metrics import recall_score
%matplotlib inline

In [41]:
#load the training and the test data from the local ./data directory
# NOTE(review): the test-data preview printed further down in this notebook
# contains an 'Unnamed: 0' column, which suggests the CSVs were saved with
# their index; reading with index_col=0 may be intended - confirm.
df_train = pd.read_csv('./data/train.csv')
df_test  = pd.read_csv('./data/test.csv')

In [72]:
# Numeric feature columns. scale_numeric_columns min-max scales exactly these
# columns, and group_low_frequency_categories leaves them out of the
# rare-category grouping.
continuous_variables = ['age', 
                        'wage_per_hour',
                        'capital_gains',
                        'capital_losses',
                        'dividend_from_Stocks', 
                        'weeks_worked_in_year'
                       ]

# Define functions 
def summary(df):
    """
    Print a quick overview of a frame: the row count, the column count,
    the column names, followed by a rendered preview of the first rows.

    Arguments: df - pd.DataFrame

    Returns: None

    """

    column_names = df.columns.values
    n_rows, n_cols = len(df.index), len(column_names)

    print("number of rows: %s" % n_rows)
    print("number of columns: %s" % n_cols)
    print("List of columns %s" % column_names)

    # display() gives the rich notebook rendering of the preview
    preview = df.head()
    display(preview)
    
def convert_income_level(df):
    """
    Converts the income_level column to 0/1

    The label is compared as text: rows whose income_level reads
    '-50000' become 0, every other row becomes 1.

    The conversion is done on a copy, so the frame passed in keeps its
    original labels. (Converting in place made re-running a cell on the
    same frame turn every label into 1, because '0' and '1' are both
    different from '-50000'.)

    Arguments: df - pd.DataFrame with an 'income_level' column

    Returns: x - pd.DataFrame (a new frame)

    """

    # Work on a copy so the caller's frame is left untouched
    x = df.copy()
    labels = x['income_level'].astype(str)
    x['income_level'] = np.where(labels == '-50000', 0, 1)

    return x

def remove_columns_with_null_values(df,threshold):
    """
    Keeps only the columns that are sufficiently populated, then fills
    the gaps that remain.

    A column is kept when its number of non-missing entries is at least
    threshold * len(df). Every missing value left in the kept columns
    is replaced by the string 'null_value'.

    Arguments: df:pd.DataFrame, threshold:float

    Returns: pd.DataFrame (a new frame)

    """

    # minimum number of non-missing entries a column needs to be kept
    min_non_null = threshold * len(df)
    populated = df.dropna(thresh=min_non_null, axis=1)

    return populated.replace(np.nan, 'null_value', regex=True)

def group_low_frequency_categories(df,threshold,skip_columns=None):
    """
    Groups low frequency categories into a single 'Other' value

    For every column that is not skipped, a value is replaced by
    'Other' when it occurs in fewer than threshold * (number of
    non-missing entries of that column) rows. Missing values are not
    counted and are left as they are.

    The grouping is done on a copy; the frame passed in is not modified.

    Arguments: df:pd.DataFrame,
               threshold:float - minimum share of rows a value needs
                                 to be kept,
               skip_columns:iterable of column names to leave untouched
                            (defaults to continuous_variables)

    Returns: x:pd.DataFrame (a new frame)

    """

    if skip_columns is None:
        skip_columns = continuous_variables
    skipped = set(skip_columns)

    # Work on a copy so the caller's frame is left untouched
    x = df.copy()

    for col in x.columns:
        if col in skipped:
            continue

        column = x[col]
        counts = column.value_counts()            # missing values are excluded
        min_count = column.count() * threshold    # count() also ignores missing values
        rare_values = counts.index[counts < min_count]

        if len(rare_values) > 0:
            # One vectorised pass per column instead of one replace() call
            # per rare value; where() upcasts numeric columns to object so
            # they can hold the 'Other' string.
            x[col] = column.where(~column.isin(rare_values), 'Other')

    return x

def clean_string_values(df):
    """
    Strips leading/trailing whitespace from text values and converts
    them to lowercase

    Only text columns (object or string dtype) are processed. Entries
    in those columns that are not strings (numbers, missing values)
    are kept as they are. Spaces inside a value are not changed.

    The cleaning is done on a copy; the frame passed in is not modified.

    Arguments: df - pd.DataFrame

    Returns: x - pd.DataFrame (a new frame)

    """

    def _normalise(value):
        # Only real strings are cleaned; anything else passes through
        return value.strip().lower() if isinstance(value, str) else value

    # Work on a copy so the caller's frame is left untouched
    x = df.copy()

    for col in x.columns:
        dtype = x[col].dtype
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            x[col] = x[col].map(_normalise)

    return x
        

def scale_numeric_columns(df, columns=None):
    """
    Used to scale numeric columns for PCA

    Each selected column is min-max scaled with MinMaxScaler, fitted on
    the frame itself. The scaled values are written back by position,
    so the result is correct whatever index the frame carries
    (aligning on a fresh 0..n-1 index produced NaN for frames whose
    index had been filtered or not reset).

    The scaling is done on a copy; the frame passed in is not modified.

    Arguments: df - pd.DataFrame,
               columns - iterable of column names to scale
                         (defaults to continuous_variables)

    Returns: x - pd.DataFrame (a new frame)

    """

    if columns is None:
        columns = continuous_variables
    columns = list(columns)

    # Work on a copy so the caller's frame is left untouched
    x = df.copy()
    if not columns:
        return x

    scaled = np.asarray(MinMaxScaler().fit_transform(x[columns]))

    # Assign plain arrays: positional, no index alignment involved
    for position, col in enumerate(columns):
        x[col] = scaled[:, position]

    return x

def encode_categorical_columns(df):
    """
    Encodes all cateogrical columns as numeric
    
    Arguments: df - pd.DataFrame
    
    Returns: x - pd.DataFrame
    
    """
    
    #Assign df to x
    x = df
    
    le = LabelEncoder()
    for col in x.columns.values:
        if x[col].dtypes == 'object':
            data = x[col]
            le.fit(data.values.astype(str))
            x[col] = le.transform(data.astype(str))
    
    return x     

def list_unique_values_for_columns(df):
    """
    Prints, for each object-dtype column, the column name followed by
    the distinct values found in it

    Arguments: df - pd.DataFrame

    Returns: None

    """

    # lazily pick out the object-dtype columns, in frame order
    object_columns = (
        name for name in df.columns.values if df[name].dtypes == 'object'
    )

    for name in object_columns:
        print("column: %s" % name)
        print("list of unique values: %s" % df[name].unique())
    
    


In [73]:
# Pool the train and test rows before grouping rare categories, so that a
# category cannot end up grouped in one split but kept in the other.
group = pd.concat([df_train, df_test], axis=0).reset_index(drop=True)

# Normalise the text values, then inspect the categories that are present
group = clean_string_values(group)
list_unique_values_for_columns(group)

column: class_of_worker
list of unique values: ['not in universe' 'self-employed-not incorporated' 'private'
 'local government' 'federal government' 'self-employed-incorporated'
 'state government' 'never worked' 'without pay']
column: education
list of unique values: ['high school graduate' 'some college but no degree' '10th grade'
 'children' 'bachelors degree(ba ab bs)'
 'masters degree(ma ms meng med msw mba)' 'less than 1st grade'
 'associates degree-academic program' '7th and 8th grade'
 '12th grade no diploma' 'associates degree-occup /vocational'
 'prof school degree (md dds dvm llb jd)' '5th or 6th grade' '11th grade'
 'doctorate degree(phd edd)' '9th grade' '1st 2nd 3rd or 4th grade']
column: enrolled_in_edu_inst_lastwk
list of unique values: ['not in universe' 'high school' 'college or university']
column: marital_status
list of unique values: ['widowed' 'divorced' 'never married' 'married-civilian spouse present'
 'separated' 'married-spouse absent' 'married-a f spouse pre

In [74]:
# Preprocess the combined frame step by step: 0/1 target, scaled numeric
# columns, sparse columns dropped (columns need 95% non-missing entries),
# and categories seen in fewer than 5% of rows grouped as 'Other'.
group = (
    group
    .pipe(convert_income_level)
    .pipe(scale_numeric_columns)
    .pipe(remove_columns_with_null_values, 0.95)
    .pipe(group_low_frequency_categories, 0.05)
)
list_unique_values_for_columns(group)

column: class_of_worker
list of unique values: ['not in universe' 'Other' 'private']
column: industry_code
list of unique values: [0 'Other' 33]
column: occupation_code
list of unique values: [0 'Other']
column: education
list of unique values: ['high school graduate' 'some college but no degree' 'Other' 'children'
 'bachelors degree(ba ab bs)']
column: enrolled_in_edu_inst_lastwk
list of unique values: ['not in universe' 'Other']
column: marital_status
list of unique values: ['widowed' 'divorced' 'never married' 'married-civilian spouse present'
 'Other']
column: major_industry_code
list of unique values: ['not in universe or children' 'Other' 'retail trade']
column: major_occupation_code
list of unique values: ['not in universe' 'precision production craft & repair'
 'professional specialty' 'executive admin and managerial' 'Other'
 'adm support including clerical' 'other service' 'sales']
column: race
list of unique values: ['white' 'Other' 'black']
column: hispanic_origin
list of u

In [None]:
# Label-encode the object-dtype columns of the combined frame
group = encode_categorical_columns(group)

In [55]:
# Distinct codes left in one encoded column, as a spot check
group['state_of_previous_residence'].unique()

array([1, 2, 0], dtype=int64)

In [22]:
#prepare the training data for oversampling (the resampling itself is done
#in a later cell)

#display imbalance of training data
print(Counter(df_train['income_level']))

# Preprocess a copy: the helper functions may modify the frame they are given,
# and converting the labels of df_train itself would make this cell give
# different results when it is run a second time.
train = (
    df_train.copy()
    .pipe(convert_income_level)
    .pipe(encode_categorical_columns)
    .pipe(scale_numeric_columns)
    .pipe(remove_columns_with_null_values, 0.95)
)

Counter({-50000: 187141, 50000: 12382})


In [23]:
# Class counts of the preprocessed training labels
print(Counter(train['income_level']))

Counter({0: 187141, 1: 12382})


In [24]:
# Separate the features from the target
X_train, y_train = train.drop(['income_level'], axis=1), train['income_level']

# Oversample with SMOTE. fit_resample is the supported imbalanced-learn method;
# the older fit_sample alias is no longer available in current releases.
# NOTE(review): the categorical columns are label-encoded integers at this
# point, so SMOTE interpolates between category codes - confirm this is
# intended (SMOTENC exists for mixed numeric/categorical data).
X_resampled, y_resampled = SMOTE(random_state=0).fit_resample(X_train, y_train)

Unnamed: 0,age,class_of_worker,industry_code,occupation_code,education,wage_per_hour,enrolled_in_edu_inst_lastwk,marital_status,major_industry_code,major_occupation_code,...,country_father,country_mother,country_self,citizenship,business_or_self_employed,fill_questionnaire_veteran_admin,veterans_benefits,weeks_worked_in_year,year,income_level
199523,38,Private,6,36,1st 2nd 3rd or 4th grade,0,Not in universe,Married-civilian spouse present,Manufacturing-durable goods,Machine operators assmblrs & inspctrs,...,Mexico,Mexico,Mexico,Foreign born- Not a citizen of U S,0,Not in universe,2,12,95,-50000
199524,44,Self-employed-not incorporated,37,12,Associates degree-occup /vocational,0,Not in universe,Married-civilian spouse present,Business and repair services,Professional specialty,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,26,95,-50000
199525,2,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,95,-50000
199526,35,Private,29,3,High school graduate,0,Not in universe,Divorced,Transportation,Executive admin and managerial,...,United-States,United-States,United-States,Native- Born in the United States,2,Not in universe,2,52,94,-50000
199527,49,Private,4,34,High school graduate,0,Not in universe,Divorced,Construction,Precision production craft & repair,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,50,95,-50000
199528,13,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,Germany,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,-50000
199529,1,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,Mexico,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,-50000
199530,61,Not in universe,0,0,High school graduate,0,Not in universe,Married-civilian spouse present,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,0,95,-50000
199531,38,Private,45,12,Masters degree(MA MS MEng MEd MSW MBA),0,Not in universe,Married-civilian spouse present,Other professional services,Professional specialty,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,94,-50000
199532,7,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,-50000
