In [165]:
import pandas as pd
import numpy as np
import copy, time
import warnings
warnings.filterwarnings('ignore')
from IPython.display import display
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, train_test_split, KFold, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from mlxtend.regressor import StackingRegressor
import matplotlib.pyplot as plt

# Read data
data_path = 'data/'
df_train = pd.read_csv(data_path + 'train_data.csv')
df_test = pd.read_csv(data_path + 'test_features.csv')

# Lift pandas' display truncation so wide frames show every column.
# pd.option_context() only takes effect inside a `with` block; called bare it
# just builds a context manager that is never entered, so the options were
# never applied. pd.set_option() changes them for the rest of the session.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

<pandas.core.config.option_context at 0x1680c483e80>

In [166]:
# Encode the boolean 'poi' target as integers (True -> 1, False -> 0).
# astype(int) gives a clean integer Series in one step; overwriting a boolean
# Series through masks relies on pandas' implicit up-casting, which depending
# on the pandas version can leave an object-dtype target.
train_yf = df_train['poi'].astype(int)

# process the data
train_y = df_train['poi']   # untouched label column
ids = df_test['name']       # test-set names, written out with the predictions
df_train = df_train.drop(['name', 'poi'], axis=1)
df_test = df_test.drop(['name'], axis=1)
# Stack train on top of test so both go through the same preprocessing;
# the training rows come first.
df_ori = pd.concat([df_train, df_test])
df_ori.head()

Unnamed: 0,bonus,deferral_payments,deferred_income,director_fees,email_address,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,loan_advances,long_term_incentive,other,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
0,1750000.0,,-3504386.0,,ken.rice@enron.com,19794175.0,46950.0,18.0,42.0,4.0,,1617011.0,174839.0,2748364.0,,420636.0,864.0,905.0,505050.0,22542539.0
1,5600000.0,,,,jeff.skilling@enron.com,19250000.0,29336.0,108.0,88.0,30.0,,1920000.0,22122.0,6843672.0,,1111258.0,2042.0,3627.0,8682716.0,26093672.0
2,200000.0,,-4167.0,,rex.shelby@enron.com,1624396.0,22884.0,39.0,13.0,14.0,,,1573324.0,869220.0,,211844.0,91.0,225.0,2003885.0,2493616.0
3,800000.0,,,,michael.kopper@enron.com,,118134.0,,,,,602671.0,907502.0,985032.0,,224305.0,,,2652612.0,985032.0
4,1250000.0,,-262500.0,,christopher.calger@enron.com,,35818.0,144.0,199.0,25.0,,375304.0,486.0,126027.0,,240189.0,2188.0,2598.0,1639297.0,126027.0


In [167]:
# Columns that have too many NaN values to be useful
sparse_columns = ['deferral_payments', 'director_fees', 'loan_advances', 'restricted_stock_deferred']
# The e-mail address is unique to each person, so it is an identifier rather than a feature
identifier_columns = ['email_address']

df_useful_features = df_ori.drop(columns=sparse_columns + identifier_columns)

df_useful_features.head()

Unnamed: 0,bonus,deferred_income,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,long_term_incentive,other,restricted_stock,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
0,1750000.0,-3504386.0,19794175.0,46950.0,18.0,42.0,4.0,1617011.0,174839.0,2748364.0,420636.0,864.0,905.0,505050.0,22542539.0
1,5600000.0,,19250000.0,29336.0,108.0,88.0,30.0,1920000.0,22122.0,6843672.0,1111258.0,2042.0,3627.0,8682716.0,26093672.0
2,200000.0,-4167.0,1624396.0,22884.0,39.0,13.0,14.0,,1573324.0,869220.0,211844.0,91.0,225.0,2003885.0,2493616.0
3,800000.0,,,118134.0,,,,602671.0,907502.0,985032.0,224305.0,,,2652612.0,985032.0
4,1250000.0,-262500.0,,35818.0,144.0,199.0,25.0,375304.0,486.0,126027.0,240189.0,2188.0,2598.0,1639297.0,126027.0


In [168]:
def fill_na(data, column_name, fill_method, default_fill_value):
    """Fill the missing values of one column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute. The column is replaced on this very object (the
        frame is modified in place) and the same object is returned.
    column_name : str
        Name of the column to impute.
    fill_method : str
        'mean', 'median' or 'mode' fills with that statistic of the non-null
        values; any other value fills with ``default_fill_value``.
    default_fill_value : scalar
        Fill value used when ``fill_method`` is not one of the statistics
        above, or when the statistic is undefined because the column has no
        non-null value.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, with the column's missing values filled.
    """
    column = data[column_name]

    # mean/median/mode skip NaN by default, so no explicit non-null mask is needed.
    if fill_method == 'mean':
        fill_value = column.mean()
    elif fill_method == 'median':
        fill_value = column.median()
    elif fill_method == 'mode':
        # Series.mode() returns a Series (ties are possible) -- take the first
        # entry so that a scalar, not a Series, is used as the fill value.
        modes = column.mode()
        fill_value = modes.iloc[0] if not modes.empty else default_fill_value
    else:
        fill_value = default_fill_value

    # An all-null column has no statistic; fall back to the default.
    if pd.isna(fill_value):
        fill_value = default_fill_value

    # Assign the whole column back instead of the chained
    # data[column_name][mask] = value, which may write to a temporary copy.
    if not pd.isna(fill_value):
        data[column_name] = column.fillna(fill_value)
    return data

# Work on a copy and fill the missing values of every listed column with the column mean
df = df_useful_features.copy()
columns_fill_with_mean = [
    'bonus', 'deferred_income', 'exercised_stock_options', 'expenses',
    'from_messages', 'from_poi_to_this_person', 'from_this_person_to_poi',
    'long_term_incentive', 'other', 'restricted_stock', 'salary',
    'shared_receipt_with_poi', 'to_messages', 'total_payments',
    'total_stock_value',
]
for column_name in columns_fill_with_mean:
    df = fill_na(data=df, column_name=column_name, fill_method='mean', default_fill_value=10000)

df.head()



Unnamed: 0,bonus,deferred_income,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,long_term_incentive,other,restricted_stock,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
0,1750000.0,-3504386.0,19794180.0,46950.0,18.0,42.0,4.0,1617011.0,174839.0,2748364.0,420636.0,864.0,905.0,505050.0,22542539.0
1,5600000.0,-1140475.0,19250000.0,29336.0,108.0,88.0,30.0,1920000.0,22122.0,6843672.0,1111258.0,2042.0,3627.0,8682716.0,26093672.0
2,200000.0,-4167.0,1624396.0,22884.0,39.0,13.0,14.0,1470361.0,1573324.0,869220.0,211844.0,91.0,225.0,2003885.0,2493616.0
3,800000.0,-1140475.0,5987054.0,118134.0,608.790698,64.895349,41.232558,602671.0,907502.0,985032.0,224305.0,1176.465116,2073.860465,2652612.0,985032.0
4,1250000.0,-262500.0,5987054.0,35818.0,144.0,199.0,25.0,375304.0,486.0,126027.0,240189.0,2188.0,2598.0,1639297.0,126027.0


In [169]:
def log_value(x):
    """Return the natural log of the magnitude of ``x``.

    Positive inputs give ``np.log(x)`` and negative inputs ``np.log(-x)``, so
    the sign is discarded. Zero -- and any value that compares neither above
    nor below zero, such as NaN -- gives 0.
    """
    if not (x > 0 or x < 0):
        return 0
    return np.log(x if x > 0 else -x)

# Apply log_value element-wise to each of the listed columns
columns_to_be_logged = [
    'bonus', 'deferred_income', 'exercised_stock_options', 'expenses',
    'long_term_incentive', 'other', 'restricted_stock', 'total_payments',
    'total_stock_value', 'salary',
]
for column_name in columns_to_be_logged:
    logged_column = df[column_name].map(log_value)
    df[column_name] = logged_column

df.head()

Unnamed: 0,bonus,deferred_income,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,long_term_incentive,other,restricted_stock,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
0,14.375126,15.069526,16.800898,10.756838,18.0,42.0,4.0,14.29609,12.071621,14.826516,12.949523,864.0,905.0,13.132413,16.930915
1,15.538277,13.946956,16.773022,10.286571,108.0,88.0,30.0,14.467836,10.004328,15.738835,13.921003,2042.0,3627.0,15.976845,17.077203
2,12.206073,8.334952,14.300647,10.038193,39.0,13.0,14.0,14.201019,14.268701,13.675352,12.263605,91.0,225.0,14.510598,14.729244
3,13.592367,13.946956,15.60511,11.679575,608.790698,64.895349,41.232558,13.309127,13.718451,13.800429,12.320762,1176.465116,2073.860465,14.791055,13.800429
4,14.038654,12.478006,15.60511,10.486206,144.0,199.0,25.0,12.835492,6.186209,11.744251,12.389181,2188.0,2598.0,14.309778,11.744251


In [180]:
# Rescale every feature into [0, 1]. `df` is rebound to the scaler's output here.
scaler = MinMaxScaler()
df = scaler.fit_transform(df)

# Split back by position: the first len(train_y) rows are the training rows,
# everything after them is the test set.
train_num = len(train_y)
train_x, test_x = df[:train_num], df[train_num:]

In [181]:
# Random search over the gradient-boosting hyper-parameters
gb_n_estimators = list(range(5, 500, 5))
gb_max_features = list(range(5, df.shape[1]))
gb_max_depth = list(range(1, 30))
gb_param_search = dict(n_estimators=gb_n_estimators,
                       max_features=gb_max_features,
                       max_depth=gb_max_depth)

# Seed both the estimator and the sampler so that a re-run draws the same
# candidates and reports the same best_params_.
gb_rough = GradientBoostingClassifier(learning_rate=0.03, random_state=42)
# 'accuracy' replaces 'neg_mean_squared_error': on 0/1 class predictions the
# MSE equals the error rate (1 - accuracy), so candidates are ranked the same
# way, but the scorer now matches the classification task.
gb_rand_search = RandomizedSearchCV(gb_rough, gb_param_search, scoring='accuracy',
                                    random_state=42, n_jobs=-1, verbose=1)
gb_search_result = gb_rand_search.fit(train_x, train_yf)

gb_search_result.best_params_

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  30 | elapsed:    0.4s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.8s finished


{'n_estimators': 140, 'max_features': 6, 'max_depth': 26}

In [182]:
# Random search over the random-forest hyper-parameters
rt_n_estimators = list(range(5, 500, 5))
rt_min_samples_split = list(range(2, 20, 1))
rt_min_samples_leaf = list(range(1, 10, 1))
rt_max_depth = list(range(1, 30))
rt_param_search = dict(n_estimators=rt_n_estimators,
                       min_samples_split=rt_min_samples_split,
                       min_samples_leaf=rt_min_samples_leaf,
                       max_depth=rt_max_depth)

# Seed both the forest and the sampler so that a re-run draws the same
# candidates and reports the same best_params_.
rt_rough = RandomForestClassifier(max_features='sqrt', bootstrap=True, random_state=42)
# 'accuracy' replaces 'neg_mean_squared_error': on 0/1 class predictions the
# MSE equals the error rate (1 - accuracy), so candidates are ranked the same
# way, but the scorer now matches the classification task.
rt_rand_search = RandomizedSearchCV(rt_rough, rt_param_search, scoring='accuracy',
                                    random_state=42, n_jobs=-1, verbose=1)
rt_search_result = rt_rand_search.fit(train_x, train_yf)

rt_search_result.best_params_

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  15 out of  30 | elapsed:    1.1s remaining:    1.1s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    3.6s finished


{'n_estimators': 20,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_depth': 29}

In [183]:
# Use 3 models
gb_best = gb_search_result.best_params_
rt_best = rt_search_result.best_params_

lr = LogisticRegression(tol=0.001, penalty='l2', fit_intercept=True, C=1.0)

# subsample < 1 makes each boosting stage draw a random subset of rows, so the
# seed is pinned to make the fitted model (and its predictions) repeatable.
# NOTE(review): `tol` appears to matter only when n_iter_no_change is set,
# which it is not here -- confirm whether early stopping was intended.
gb = GradientBoostingClassifier(tol=100, subsample=0.75,
                                n_estimators=gb_best['n_estimators'],
                                max_features=gb_best['max_features'],
                                max_depth=gb_best['max_depth'],
                                learning_rate=0.03,
                                random_state=42)

# Bootstrapping and per-split feature sampling are random as well; pin the seed.
rt = RandomForestClassifier(n_estimators=rt_best['n_estimators'],
                            min_samples_split=rt_best['min_samples_split'],
                            min_samples_leaf=rt_best['min_samples_leaf'],
                            max_features='sqrt',
                            max_depth=rt_best['max_depth'],
                            bootstrap=True,
                            random_state=42)

In [174]:
def save_to_csv(data, filename):
    """Write predictions to ``filename`` as a two-column CSV.

    The 'name' column is taken from the notebook-level ``ids`` and the 'poi'
    column from ``data``; the frame's index is not written.
    """
    pd.DataFrame({'name': ids, 'poi': data}).to_csv(filename, index=False)

# Fit logistic regression, keep column 1 of predict_proba for every test row
# and write it out as a submission file.
lr_pred = lr.fit(train_x, train_yf).predict_proba(test_x)[:, 1]
save_to_csv(lr_pred, 'hw51_lr.csv')

lr_pred

array([0.61689061, 0.7892687 , 0.17414026, 0.07985916, 0.63609373,
       0.25234816, 0.84976003, 0.08803451, 0.15199535, 0.31928302,
       0.00423081, 0.04984163, 0.0918091 , 0.11870912, 0.03212065,
       0.03253397, 0.05460969, 0.08237625, 0.03562162, 0.01298197,
       0.04905691, 0.27732977, 0.12773352, 0.04927717, 0.03477264,
       0.56569547, 0.09377833, 0.01556615, 0.21675724, 0.34058206,
       0.08354346, 0.11940446, 0.30776651])

In [175]:
# Fit gradient boosting, keep column 1 of predict_proba for every test row
# and write it out as a submission file.
gb_pred = gb.fit(train_x, train_yf).predict_proba(test_x)[:, 1]
save_to_csv(gb_pred, 'hw51_gb.csv')

gb_pred

array([0.0941783 , 0.24285288, 0.11281269, 0.02960872, 0.08894003,
       0.1598192 , 0.46974242, 0.04022319, 0.05066628, 0.07907902,
       0.02976728, 0.02975781, 0.02975776, 0.02974791, 0.25477961,
       0.0297139 , 0.03189234, 0.06254215, 0.18544383, 0.029771  ,
       0.60017721, 0.02969976, 0.03283328, 0.0329136 , 0.02967993,
       0.22280957, 0.03190713, 0.02975523, 0.03364756, 0.08469447,
       0.02972444, 0.04475622, 0.11894848])

In [185]:
# Fit the random forest, keep column 1 of predict_proba for every test row
# and write it out as a submission file.
rt_pred = rt.fit(train_x, train_yf).predict_proba(test_x)[:, 1]
save_to_csv(rt_pred, 'hw51_rt.csv')

rt_pred

array([0.22912698, 0.17256591, 0.2790522 , 0.09801587, 0.2228491 ,
       0.15123834, 0.71892857, 0.07439394, 0.14503968, 0.23174603,
       0.        , 0.        , 0.04375   , 0.05196429, 0.44561966,
       0.06060606, 0.04109848, 0.12180556, 0.30148185, 0.        ,
       0.83071429, 0.07388889, 0.06140249, 0.09166667, 0.08052947,
       0.22943099, 0.11944444, 0.        , 0.11486111, 0.23190476,
       0.04333333, 0.11190476, 0.19503968])

In [177]:
from mlxtend.classifier import StackingClassifier

# Random search over hyper-parameters for a gradient-boosting meta-classifier
mt_n_estimators = list(range(5, 500, 5))
mt_max_features = list(range(5, df.shape[1]))
mt_max_depth = list(range(1, 30))
mt_param_search = dict(n_estimators=mt_n_estimators,
                       max_features=mt_max_features,
                       max_depth=mt_max_depth)
# Seeded so that a re-run reports the same best_params_.
mt_rough = GradientBoostingClassifier(learning_rate=0.03, random_state=42)
st_rough = StackingClassifier(classifiers=[lr, gb, rt],
                              use_probas=True,
                              meta_classifier=mt_rough)

# NOTE(review): the search is fit on `mt_rough` with the raw features, not on
# `st_rough`, so the parameters found are those of a standalone
# gradient-boosting model rather than of the stack's meta-classifier; it uses
# the same estimator settings, search ranges and data as the gradient-boosting
# search. Tuning the stack would need `st_rough` with
# 'meta_classifier__'-prefixed parameter names -- confirm intent. The
# parameter names are left as they are because later code reads them from
# best_params_.
# 'accuracy' replaces 'neg_mean_squared_error': on 0/1 class predictions the
# MSE equals the error rate (1 - accuracy), so candidates are ranked the same way.
mt_rand_search = RandomizedSearchCV(mt_rough, mt_param_search, scoring='accuracy',
                                    random_state=42, n_jobs=-1, verbose=1)
mt_search_result = mt_rand_search.fit(train_x, train_yf)

mt_search_result.best_params_



Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  30 | elapsed:    0.4s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    1.1s finished


{'n_estimators': 20, 'max_features': 9, 'max_depth': 7}

In [178]:
mt_best = mt_search_result.best_params_

# NOTE(review): `mt` is built from the tuned parameters but is not handed to
# the stack below, which uses `mt_lr` as its meta-classifier -- confirm intent.
mt = GradientBoostingClassifier(n_estimators=mt_best['n_estimators'],
                                max_features=mt_best['max_features'],
                                max_depth=mt_best['max_depth'],
                                learning_rate=0.03,
                                subsample=0.75,
                                tol=100)

# Logistic regression is stacked on the class probabilities of gb and rt.
mt_lr = LogisticRegression(C=1.0, penalty='l2', tol=0.001, fit_intercept=True)
st = StackingClassifier(classifiers=[gb, rt],
                        meta_classifier=mt_lr,
                        use_probas=True)

In [179]:
# Fit the stacked model, then keep column 1 of its predicted probabilities
# for every test row and write it out as a submission file.
st.fit(train_x, train_yf)
st_proba = st.predict_proba(test_x)
st_pred = st_proba[:, 1]
save_to_csv(st_pred, 'hw51_st.csv')

st_pred

array([0.07280259, 0.15732962, 0.09087303, 0.03103382, 0.10887683,
       0.09550203, 0.72150096, 0.03251503, 0.04291875, 0.0528368 ,
       0.03051211, 0.02741068, 0.02974864, 0.02836425, 0.21395755,
       0.0306062 , 0.03251271, 0.04461207, 0.14720438, 0.02740965,
       0.84701625, 0.03799772, 0.03250016, 0.03957339, 0.03295803,
       0.14349805, 0.04279965, 0.02866269, 0.03912035, 0.05189215,
       0.02947978, 0.03306831, 0.06511197])