In [2]:
# Toggle between running on Google Colab (data on a mounted Drive) and locally.
colab = True
# Local default: data paths are resolved relative to the working directory.
prefix = ''
if colab:
    import sys
    from google.colab import drive

    drive.mount('/content/drive', force_remount=True)
    # NOTE(review): both paths below are relative, so they only resolve when the
    # working directory is the parent of the mount point (/content) — confirm.
    prefix = 'drive/My Drive/Colab Notebooks/Income_prediction/'
    sys.path.append('drive/My Drive/Colab Notebooks/Income_prediction')

Mounted at /content/drive


In [0]:
import pandas as pd
import numpy as np
import time
from sklearn import model_selection
from sklearn import preprocessing
import seaborn as sns
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from scipy.stats import mode
from imblearn.over_sampling import SMOTE
import re


# Preprocessing

### Get raw data

In [0]:
def get_data(url):
    """Read the header-less census CSV at ``url`` into a DataFrame.

    The 42 column names listed below are applied positionally, and the
    literal token ' ?' (with its leading space) is parsed as a missing value.

    Parameters
    ----------
    url : path, URL or file-like object accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame with one column per name in the list below.
    """
    column_names = [
        'age', 'class of worker',
        'detailed industry recode', 'detailed occupation recode',
        'education', 'wage per hour',
        'enroll in edu inst last wk', 'marital status',
        'major industry code', 'major occupation code',
        'race', 'hispanic origin', 'sex',
        'member of a labor union', 'reason for unemployment',
        'full or part time employment stat',
        'capital gains', 'capital losses', 'dividends from stocks',
        'tax filer stat',
        'region of previous residence', 'state of previous residence',
        'detailed household and family stat',
        'detailed household summary in household',
        'instance weight',
        'migration code-change in msa', 'migration code-change in reg',
        'migration code-move within reg',
        'live in this house 1 year ago', 'migration prev res in sunbelt',
        'num persons worked for employer', 'family members under 18',
        'country of birth father', 'country of birth mother',
        'country of birth self', 'citizenship',
        'own business or self employed',
        'fill inc questionnaire for veterans admin', 'veterans benefits',
        'weeks worked in year', 'year',
        'class',
    ]
    return pd.read_csv(url, names=column_names, na_values=' ?')

In [0]:
# Build the data path from the Colab/local prefix and load the raw census file.
url = prefix+'census/census-income.data'
raw_data = get_data(url)

### EDA

In [0]:
# Remove the column display limit; this is a global pandas option, so it also
# affects every frame displayed in later cells.
pd.options.display.max_columns = None
# NOTE(review): 300 rows produces a very long output — a smaller preview may be enough.
raw_data.head(300)

In [0]:
# Print the frame summary (columns, dtypes, non-null counts) for the raw data.
raw_data.info()

In [0]:
# Non-null count of every column, grouped by the label column 'class'.
raw_data.groupby('class').count()

##### Imbalanced dataset => Need to decide whether we should go for re-sampling techniques or not (after performance evaluation)

### Outlier detection 

In [0]:
# Summary statistics of the raw data, used as a first look for outliers.
raw_data.describe()

In [0]:
# Apply seaborn's "whitegrid" style, then draw a vertical box plot of 'age'.
sns.set(style="whitegrid")
ax = sns.boxplot(y=raw_data['age'])

In [0]:
# Box plots of every non-object column in one wide figure.
raw_data.select_dtypes(exclude = 'O').plot(kind = 'box', figsize = (50,8))

### Find missing values

In [0]:
# Names of the columns that contain at least one missing value, in column order.
contain_null = np.array(raw_data.columns[raw_data.isnull().any().values])

There are 8 features having missing values: 
- state of previous residence
- migration code-change in msa 
- migration code-change in reg 
- migration code-move within reg
- migration prev res in sunbelt 
- country of birth father
- country of birth mother
- country of birth self


### Missing values imputation

In [0]:
# Convert every object-dtype column to pandas' 'category' dtype in one pass.
raw_data = raw_data.astype(
    {name: 'category' for name in raw_data.select_dtypes('O').columns}
)

In [0]:
def OnehotEncode(data, categorical_columns):
    """Return a copy of ``data`` with ``categorical_columns`` one-hot encoded.

    The listed columns are removed and replaced by the indicator columns that
    ``pandas.get_dummies`` produces for them; all other columns are kept as-is
    and come first in the result. ``data`` itself is not modified.
    """
    indicator_part = pd.get_dummies(data[categorical_columns])
    untouched_part = data.drop(columns=categorical_columns)
    return pd.concat([untouched_part, indicator_part], axis=1, join='inner')

In [0]:
def ImputeVoteClassifier(OnehotEncode, data, target_name, random_state=None):
    """Predict the missing values of one column by majority vote of three tree models.

    Rows where ``target_name`` is present form the training set; rows where it
    is null form the set to predict. All other columns are used as features,
    with 'category'-dtype columns one-hot encoded through ``OnehotEncode``.
    An ExtraTreeClassifier, a DecisionTreeClassifier and a
    RandomForestClassifier are fitted and their predictions combined
    element-wise by majority vote.

    Parameters
    ----------
    OnehotEncode : callable(data, categorical_columns) -> DataFrame
        Encoder applied separately to the training and prediction features.
    data : pandas.DataFrame
        Frame containing ``target_name`` and the feature columns. Not modified.
    target_name : str
        Column whose null entries should be predicted.
    random_state : int or None, optional
        Seed forwarded to the three classifiers; ``None`` keeps them unseeded.

    Returns
    -------
    numpy.ndarray
        One predicted value per row with a null ``target_name``, in row order.
        Empty when the column has no missing values.
    """
    print('*'*100+'\n')
    print('Start imputing missing values for feature: {} \n'.format(target_name))

    missing_mask = data[target_name].isnull()
    if not missing_mask.any():
        # Nothing to impute: avoid fitting models and predicting on an empty frame.
        print('No missing values found for feature: {}'.format(target_name))
        print('\n'+'*'*100)
        return np.array([])

    # Training set
    print('Generating training set...')
    train_target = data.loc[~missing_mask, target_name]
    train_data = data.loc[~missing_mask].drop(columns=[target_name])
    encoded_train = OnehotEncode(train_data, train_data.select_dtypes('category').columns)
    print('Done generating training set \n')

    # Testing set (rows whose target is missing)
    print('Generating testing set...')
    test_data = data.loc[missing_mask].drop(columns=[target_name])
    encoded_test = OnehotEncode(test_data, test_data.select_dtypes('category').columns)
    # The two frames are encoded independently; force identical feature columns
    # so the fitted models always see the layout they were trained on.
    encoded_test = encoded_test.reindex(columns=encoded_train.columns, fill_value=0)
    print('Done generating testing set \n')

    # Fit data into base classifiers (log message kept per classifier).
    base_classifiers = (
        ('Fitting data into {}...', ExtraTreeClassifier(random_state=random_state)),
        ('Fitting data into {}...', DecisionTreeClassifier(random_state=random_state)),
        ('Start fitting data into {}...', RandomForestClassifier(random_state=random_state)),
    )
    predictions = []
    for message, classifier in base_classifiers:
        print(message.format(classifier.__class__.__name__))
        classifier.fit(encoded_train, train_target)
        predictions.append(classifier.predict(encoded_test))

    # Finalize data
    print('Voting final predictions...')
    final_pred = _majority_vote(*predictions)
    print('Done voting and dump final predictions into feature: {}'.format(target_name))
    print('\n'+'*'*100)
    return final_pred


def _majority_vote(first, second, third):
    """Element-wise majority of three equally long prediction arrays.

    Works for string as well as numeric labels. When all three votes for a
    position differ, the smallest value wins, matching the smallest-value
    tie-break of ``scipy.stats.mode``.
    """
    votes = np.sort(np.stack([first, second, third]), axis=0)
    # Among three sorted values, any value occurring at least twice is the middle one.
    has_majority = (votes[0] == votes[1]) | (votes[1] == votes[2])
    return np.where(has_majority, votes[1], votes[0])


In [73]:
# Impute each incomplete column in turn. raw_data is updated in place, so a
# column filled in an earlier iteration is passed (already imputed) as a
# feature when the later columns are predicted.
for f in contain_null:
    raw_data.loc[(raw_data[f].isnull()),f] = ImputeVoteClassifier(OnehotEncode, raw_data, f)

****************************************************************************************************

Start imputing missing values for feature: migration code-change in msa 

Generating training set...
Done generating training set 

Generating testing set...
Done generating testing set 

Fitting data into ExtraTreeClassifier...
Fitting data into DecisionTreeClassifier...
Start fitting data into RandomForestClassifier...
Voting final predictions...
Done voting and dump final predictions into feature: migration code-change in msa

****************************************************************************************************
****************************************************************************************************

Start imputing missing values for feature: migration code-change in reg 

Generating training set...
Done generating training set 

Generating testing set...
Done generating testing set 

Fitting data into ExtraTreeClassifier...
Fitting data into DecisionTreeCl

### Model development

In [0]:
# Features are every column except the label, which is the last column ('class').
# The slice starts at column 0: starting at 1 would silently drop 'age'.
# NOTE(review): 'instance weight' is a sampling weight in this dataset —
# confirm whether it should be used as a model feature.
X, y = raw_data.iloc[:, :-1], raw_data.iloc[:, -1]
X = OnehotEncode(X, X.select_dtypes('category').columns)
# Replace '[', ']' and '<' in column names with '_' (presumably to satisfy
# XGBoost's feature-name restrictions). Non-string names are left untouched.
regex = re.compile(r"[\[\]<]")
X.columns = [regex.sub("_", col) if isinstance(col, str) else col for col in X.columns.values]

In [0]:
def split_data(X, y, seed, re=False, test_size=0.3):
    """Split features and target into train/test parts, optionally oversampling the train part.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : array-like
        Target values aligned with ``X``.
    seed : int
        Random state used for both the split and SMOTE.
    re : bool, optional
        When true, the training part (only) is resampled with SMOTE; the test
        part is never resampled.
    test_size : float, optional
        Fraction of rows held out for testing (default 0.3).

    Returns
    -------
    (X_train, y_train, X_test, y_test)
    """
    X_train_o, X_test, y_train_o, y_test = model_selection.train_test_split(
        X, y, test_size=test_size, random_state=seed
    )
    if not re:
        return X_train_o, y_train_o, X_test, y_test

    # fit_resample fits the sampler itself, so no separate fit() call is needed.
    X_resampled, y_resampled = SMOTE(random_state=seed).fit_resample(X_train_o, y_train_o)
    # Restore the pandas containers (and names) in case the sampler returned arrays.
    X_train = pd.DataFrame(X_resampled, columns=X_train_o.columns)
    y_train = pd.Series(y_resampled, name=getattr(y_train_o, 'name', None))
    return X_train, y_train, X_test, y_test
# re=False: train on the split as-is, without SMOTE resampling.
X_train, y_train, X_test, y_test = split_data(X, y, seed=1003, re=False)


In [0]:
# Gradient-boosted tree classifier for the binary income label.
# - `random_state` / `verbosity` replace the deprecated `seed` / `silent` spellings.
# - The former `class_weight='d'` and `verbose=2` arguments were dropped: they are
#   not XGBClassifier parameters, so they had no effect on training.
# NOTE(review): if class imbalance should be handled in the model, XGBoost's
# parameter for that is `scale_pos_weight` — not set here to keep behaviour unchanged.
model = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=1024,
    learning_rate=0.05,
    max_depth=4,
    min_child_weight=15,
    gamma=0.8,
    subsample=0.8,
    colsample_bytree=0.4,
    reg_alpha=10,
    reg_lambda=40,
    random_state=50,
    verbosity=0,
)

In [0]:
# Evaluate on both the training and the test split while fitting.
eval_set = [(X_train, y_train), (X_test, y_test)]
# Only AUC is active; the other metrics are kept commented out as alternatives.
eval_metric = [
            #    "logloss",
               "auc"
#                "aucpr",
#                "error",
              ]
# NOTE(review): per the XGBoost docs, early stopping monitors the last entry of
# eval_set — here the test split — so the test score is used for model selection;
# consider a separate validation split. Also confirm that the installed xgboost
# version still accepts eval_metric / early_stopping_rounds in fit().
model.fit(X_train, y_train, eval_metric=eval_metric, eval_set=eval_set, verbose=True,early_stopping_rounds=15)