# Two Percent

For the following models, I used a dataset of 5,000 rows and 5,000 columns. This is roughly 2% (two percent) of my initial dataset of 220,000 rows and 5,000 columns.

In [1]:
# Switch the working directory to the project folder; presumably this is what
# lets `lib` be imported and the relative pickle paths in later cells resolve.
# NOTE(review): this is an absolute, machine-specific path, so the cell fails on
# any other machine; consider a configurable project directory.
from os import chdir
chdir('/Users/mandymoody/DSI/projects/project_3')

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# NOTE(review): the star import hides where names come from. `pd` is used in
# later cells without an explicit import, so it presumably arrives via `lib`;
# confirm.
from lib import *

In [2]:
# Timing helper and IPython rich display.
from datetime import datetime
from IPython.display import display
# scikit-learn components for the modelling cells.
# NOTE(review): display, f_classif, LogisticRegression, stats and mean are not
# referenced in the cells visible here; confirm before pruning.
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.feature_selection import SelectKBest, SelectFromModel, f_classif
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
# NOTE(review): `scipy.mean` was a NumPy alias deprecated in SciPy 1.4 and may
# no longer be importable on recent SciPy releases; confirm the installed
# version, or use numpy.mean instead.
from scipy import stats, mean

In [3]:
# Load the feature and target frames from pickle files; the paths are relative
# to the current working directory.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files from a trusted source.
# NOTE(review): `pd` is not imported explicitly in the visible cells; it
# presumably comes from `from lib import *`; confirm.
feat5000_df = pd.read_pickle('5000feature_df.p')
target5000_df = pd.read_pickle('5000target_df.p')

In [4]:
# Pull the underlying arrays out of the DataFrames for scikit-learn.
# `y` is flattened with .ravel() when the models are fit, so the target frame
# is presumably a single column; confirm.
X = feat5000_df.values
y = target5000_df.values

In [5]:
# Pipeline 4: PCA -> univariate feature selection -> random forest.
#   pca: PCA with default arguments.
#   skb: keeps the 40 highest-scoring PCA components (SelectKBest's default scorer).
#   rf : random forest whose n_estimators / max_depth are set by the grid search.
# The forest is seeded so that repeated runs build the same trees; 42 matches
# the seed already used for the ShuffleSplit CV splitter in this notebook.
fourth_pipeline = Pipeline([
    ('pca', PCA()),
    ('skb', SelectKBest(k=40)),
    ('rf', RandomForestClassifier(random_state=42))
])

In [6]:
# Search grid for the 'rf' step of the fourth pipeline:
# three ensemble sizes x three depth settings = nine candidates
# (max_depth=None places no limit on tree depth).
fourth_pipe_params = dict(
    rf__n_estimators=[10, 40, 100],
    rf__max_depth=[10, 40, None],
)

In [7]:
# Exhaustive grid search over fourth_pipe_params for the fourth pipeline.
#   n_jobs=-1               : run candidate fits in parallel on all available cores.
#   cv                      : ShuffleSplit with a fixed seed, so every candidate is
#                             scored on the same random train/test splits.
#   return_train_score=True : recent scikit-learn releases default this to False,
#                             which would drop the *_train_score rows that this
#                             notebook's results table reports.
fourth_pipeline_gs = GridSearchCV(
    fourth_pipeline,
    param_grid=fourth_pipe_params,
    n_jobs=-1,
    cv=ShuffleSplit(random_state=42),
    return_train_score=True,
)

In [8]:
# Fit the grid search, bracketing it with wall-clock timestamps so the run time
# can be reported afterwards. `y` is flattened to 1-D for the estimator.
fourth_start_time = datetime.now()
fourth_pipeline_gs.fit(X, y.ravel())
fourth_end_time = datetime.now()

In [10]:
# Elapsed wall-clock time of the fourth grid search, converted from seconds to minutes.
fourth_pipeline_total_time = fourth_end_time - fourth_start_time
total_minutes_5k_fourth_pipeline = fourth_pipeline_total_time.total_seconds() / 60
total_minutes_5k_fourth_pipeline

101.83026131666666

In [11]:
# Tabulate the grid-search results; the frame is displayed transposed so each
# candidate is a column and each metric a row.
fourth_pipeline_results = pd.DataFrame(fourth_pipeline_gs.cv_results_)
fourth_pipeline_results.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8
mean_fit_time,267.518,264.802,265.365,266.342,255.424,247.295,240.082,237.822,215.551
mean_score_time,1.37939,1.4811,1.41051,1.34076,1.26019,1.34118,1.38229,1.37041,1.08613
mean_test_score,0.5034,0.494,0.491,0.5074,0.499,0.5038,0.5008,0.4974,0.5054
mean_train_score,0.898689,0.979044,0.992311,0.9868,1,1,0.986089,1,1
param_rf__max_depth,10,10,10,40,40,40,,,
param_rf__n_estimators,10,40,100,10,40,100,10,40,100
params,"{'rf__max_depth': 10, 'rf__n_estimators': 10}","{'rf__max_depth': 10, 'rf__n_estimators': 40}","{'rf__max_depth': 10, 'rf__n_estimators': 100}","{'rf__max_depth': 40, 'rf__n_estimators': 10}","{'rf__max_depth': 40, 'rf__n_estimators': 40}","{'rf__max_depth': 40, 'rf__n_estimators': 100}","{'rf__max_depth': None, 'rf__n_estimators': 10}","{'rf__max_depth': None, 'rf__n_estimators': 40}","{'rf__max_depth': None, 'rf__n_estimators': 100}"
rank_test_score,4,8,9,1,6,3,5,7,2
split0_test_score,0.528,0.54,0.48,0.506,0.512,0.506,0.506,0.54,0.528
split0_train_score,0.917111,0.980222,0.994,0.982889,1,1,0.985111,1,1


In [12]:
# Persist the fourth pipeline's results table to a pickle file in the current
# working directory.
fourth_pipeline_results.to_pickle('fourth_pipeline_results5k.p')

In [21]:
# Pipeline 5: PCA -> model-based feature selection (Lasso) -> k-nearest neighbours.
# NOTE(review): Lasso runs with its default arguments here; if it shrinks every
# coefficient to zero, SelectFromModel would pass no features on; confirm.
fifth_pipeline = Pipeline(steps=[
    ('pca', PCA()),
    ('sfm', SelectFromModel(Lasso())),
    ('knc', KNeighborsClassifier()),
])

In [22]:
# Search grid for the 'knc' step of the fifth pipeline: two neighbourhood
# sizes, with n_jobs held at -1 for every candidate.
fifth_pipe_params = dict(
    knc__n_neighbors=[5, 7],
    knc__n_jobs=[-1],
)

In [23]:
# Exhaustive grid search over fifth_pipe_params for the fifth pipeline.
#   n_jobs=-1               : run candidate fits in parallel on all available cores.
#   cv                      : ShuffleSplit with a fixed seed, so every candidate is
#                             scored on the same random train/test splits.
#   return_train_score=True : recent scikit-learn releases default this to False;
#                             enabling it keeps the *_train_score rows that the
#                             other pipelines' results tables in this notebook report.
fifth_pipeline_gs = GridSearchCV(
    fifth_pipeline,
    param_grid=fifth_pipe_params,
    n_jobs=-1,
    cv=ShuffleSplit(random_state=42),
    return_train_score=True,
)

In [None]:
# Fit the fifth grid search, bracketing it with wall-clock timestamps so the
# run time can be reported afterwards. `y` is flattened to 1-D for the estimator.
# NOTE(review): this cell has no execution count in the saved notebook, so no
# fifth-pipeline output is recorded here.
fifth_pipeline_start_time = datetime.now()
fifth_pipeline_gs.fit(X, y.ravel())
fifth_pipeline_end_time = datetime.now()

In [None]:
# Elapsed wall-clock time of the fifth grid search, converted from seconds to minutes.
fifth_pipeline_total_time = fifth_pipeline_end_time - fifth_pipeline_start_time
total_minutes_5k_fifth_pipeline = fifth_pipeline_total_time.total_seconds() / 60
total_minutes_5k_fifth_pipeline

In [None]:
# Tabulate the grid-search results; the frame is displayed transposed so each
# candidate is a column and each metric a row.
fifth_pipeline_results = pd.DataFrame(fifth_pipeline_gs.cv_results_)
fifth_pipeline_results.transpose()

In [None]:
# Persist the fifth pipeline's results table to a pickle file in the current
# working directory.
fifth_pipeline_results.to_pickle('fifth_pipeline_results5k.p')

In [13]:
# Pipeline 6: PCA -> univariate feature selection -> extremely randomized trees.
#   pca: PCA with default arguments.
#   skb: keeps the 40 highest-scoring PCA components (SelectKBest's default scorer).
#   et : extra-trees ensemble whose n_estimators / max_depth are set by the grid search.
# The ensemble is seeded so that repeated runs build the same trees; 42 matches
# the seed already used for the ShuffleSplit CV splitter in this notebook.
sixth_pipeline = Pipeline([
    ('pca', PCA()),
    ('skb', SelectKBest(k=40)),
    ('et', ExtraTreesClassifier(random_state=42))
])

In [14]:
# Search grid for the 'et' step of the sixth pipeline:
# three ensemble sizes x three depth settings = nine candidates
# (max_depth=None places no limit on tree depth).
sixth_pipe_params = dict(
    et__n_estimators=[10, 40, 100],
    et__max_depth=[10, 40, None],
)

In [16]:
# Exhaustive grid search over sixth_pipe_params for the sixth pipeline.
#   n_jobs=-1               : run candidate fits in parallel on all available cores.
#   cv                      : ShuffleSplit with a fixed seed, so every candidate is
#                             scored on the same random train/test splits.
#   return_train_score=True : recent scikit-learn releases default this to False,
#                             which would drop the *_train_score rows that this
#                             notebook's results table reports.
sixth_pipeline_gs = GridSearchCV(
    sixth_pipeline,
    param_grid=sixth_pipe_params,
    n_jobs=-1,
    cv=ShuffleSplit(random_state=42),
    return_train_score=True,
)

In [17]:
# Fit the sixth grid search, bracketing it with wall-clock timestamps so the
# run time can be reported afterwards. `y` is flattened to 1-D for the estimator.
sixth_pipeline_start_time = datetime.now()
sixth_pipeline_gs.fit(X, y.ravel())
sixth_pipeline_end_time = datetime.now()

In [18]:
# Elapsed wall-clock time of the sixth grid search, converted from seconds to minutes.
sixth_pipeline_total_time = sixth_pipeline_end_time - sixth_pipeline_start_time
total_minutes_5k_sixth_pipeline = sixth_pipeline_total_time.total_seconds() / 60
total_minutes_5k_sixth_pipeline

99.04827135000001

In [19]:
# Tabulate the grid-search results; the frame is displayed transposed so each
# candidate is a column and each metric a row.
sixth_pipeline_results = pd.DataFrame(sixth_pipeline_gs.cv_results_)
sixth_pipeline_results.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8
mean_fit_time,257.718,253.801,254.295,249.405,237.746,244.388,238.931,238.948,216.154
mean_score_time,1.26871,1.36062,1.25249,1.31514,1.08543,1.28147,1.12822,1.24182,1.04191
mean_test_score,0.4888,0.4964,0.4956,0.5004,0.4926,0.5056,0.493,0.4932,0.4978
mean_train_score,0.877422,0.959067,0.978667,1,1,1,1,1,1
param_et__max_depth,10,10,10,40,40,40,,,
param_et__n_estimators,10,40,100,10,40,100,10,40,100
params,"{'et__max_depth': 10, 'et__n_estimators': 10}","{'et__max_depth': 10, 'et__n_estimators': 40}","{'et__max_depth': 10, 'et__n_estimators': 100}","{'et__max_depth': 40, 'et__n_estimators': 10}","{'et__max_depth': 40, 'et__n_estimators': 40}","{'et__max_depth': 40, 'et__n_estimators': 100}","{'et__max_depth': None, 'et__n_estimators': 10}","{'et__max_depth': None, 'et__n_estimators': 40}","{'et__max_depth': None, 'et__n_estimators': 100}"
rank_test_score,9,4,5,2,8,1,7,6,3
split0_test_score,0.504,0.508,0.528,0.484,0.484,0.506,0.448,0.482,0.464
split0_train_score,0.880222,0.959778,0.98,1,1,1,1,1,1


In [20]:
# Persist the sixth pipeline's results table to a pickle file in the current
# working directory.
sixth_pipeline_results.to_pickle('sixth_pipeline_results5k.p')