In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import df_tools

df = pd.read_csv('data/train.csv')
X, y = df_tools.prepare_dataset(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['CryoSleep'] = df2['CryoSleep'].astype(int)


In [2]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1, stratify=y
)

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

log_reg_pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(C=0.5, solver='lbfgs', random_state=1)
)
log_reg_pipe.fit(X_train, y_train)

print('Log reg train score:', log_reg_pipe.score(X_train, y_train))
print('Log reg test score:', log_reg_pipe.score(X_test, y_test))

Log reg train score: 0.723423838011965
Log reg test score: 0.7364305427782889


In [4]:
from sklearn.model_selection import GridSearchCV

param_range = [0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 5.0, 10.0, 100.0, 1000.0]
param_grid = [{'logisticregression__C': param_range}]
grid_search = GridSearchCV(
    estimator=log_reg_pipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    refit=True,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)
print('Grid search log reg best_score:', grid_search.best_score_)
print('Grid search log reg best_params:', grid_search.best_params_)

Grid search log reg best_score: 0.7226612196432105
Grid search log reg best_params: {'logisticregression__C': 0.5}


In [5]:
from sklearn.ensemble import RandomForestClassifier
from pprint import pprint
forest = RandomForestClassifier(n_estimators=100,
                                random_state=1)
forest.fit(X_train, y_train)

print('RandomForest train score:', forest.score(X_train, y_train))
print('RandomForest test score:', forest.score(X_test, y_test))

importances = forest.feature_importances_
# print(importances)

final_columns = ['CryoSleep', 'Age', 'VIP', 'RoomService', 'GroupSize', 'HomePlanet_Europa', 'HomePlanet_Mars',
                 'Dest_PSO', 'Dest_TRAPPIST', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G',
                 'Deck_T', 'Side_S']

items = zip(final_columns, importances)
print('Column importances:')
pprint(sorted(items, key=lambda v: v[1], reverse=True))

RandomForest train score: 0.936953520478601
RandomForest test score: 0.7065317387304508
Column importances:
[('Age', 0.35880337865699186),
 ('VIP', 0.20297120002550653),
 ('CryoSleep', 0.18166280929986622),
 ('RoomService', 0.07968366393670168),
 ('Deck_T', 0.02612687543299052),
 ('Dest_PSO', 0.023259604198538897),
 ('HomePlanet_Europa', 0.02252613940564539),
 ('GroupSize', 0.019770107801728585),
 ('Deck_F', 0.018896458521944708),
 ('Deck_E', 0.01694595554342488),
 ('Deck_D', 0.014785696269549932),
 ('HomePlanet_Mars', 0.010719249258873378),
 ('Dest_TRAPPIST', 0.008229117663043859),
 ('Deck_B', 0.008067629416050541),
 ('Deck_C', 0.007343686910355404),
 ('Deck_G', 0.00020842765878762847)]


In [6]:
param_grid = [{'n_estimators': [100, 200, 300, 400, 500, 600]}]
grid_search = GridSearchCV(
    estimator=forest,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    refit=True,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)
print('Grid search random forest best_score:', grid_search.best_score_)
print('Grid search random forest best_params:', grid_search.best_params_)

Grid search random forest best_score: 0.7065498572276724
Grid search random forest best_params: {'n_estimators': 100}


In [7]:
from sklearn.svm import SVC

svm_pipe = make_pipeline(
    StandardScaler(),
    SVC(kernel='rbf', gamma=0.1, C=0.1, random_state=1)
)
svm_pipe.fit(X_train, y_train)

print('Rbf svm train score:', svm_pipe.score(X_train, y_train))
print('Rbf svm test score:', svm_pipe.score(X_test, y_test))

Rbf svm train score: 0.7455131155085136
Rbf svm test score: 0.7428702851885924


In [8]:
# svm_param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
# param_grid = [{'svc__C': svm_param_range, 'svc__gamma': svm_param_range, 'svc__kernel': ['rbf']}]
# grid_search = GridSearchCV(
#     estimator=svm_pipe,
#     param_grid=param_grid,
#     scoring='accuracy',
#     cv=10,
#     refit=True,
#     n_jobs=-1
# )

# grid_search.fit(X_train, y_train)
# print('Grid search Rbf svm best_score:', grid_search.best_score_)
# print('Grid search Rbf svm best_params:', grid_search.best_params_)

# best score 0.7415260147201568
# best params {'svc__C': 1.0, 'svc__gamma': 0.1, 'svc__kernel': 'rbf'}

In [9]:
from train_tools import MajorityVoteClassifier
majority_clf = MajorityVoteClassifier(classifiers=[forest, log_reg_pipe])

In [10]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
# y_fit_example = np.array([-1, -2, -3, -4, -5])
y_fit_example = np.array(['a', 'b', 'c', 'd', 'e'])
encoder.fit(y_fit_example)

y_pretransform_example = np.array(['a', 'a', 'c', 'c', 'b', 'd', 'e'])
print(f'before transform: {y_pretransform_example}')
y_transformed = encoder.transform(y_pretransform_example)
print(f'after transform: {y_transformed}')
mapping = {i: v for i, v in enumerate(encoder.classes_)}
print(f'mapping: {mapping}')

print(f'try inverse transform: {encoder.inverse_transform(y_transformed)}')

before transform: ['a' 'a' 'c' 'c' 'b' 'd' 'e']
after transform: [0 0 2 2 1 3 4]
mapping: {0: 'a', 1: 'b', 2: 'c', 3: 'd', 4: 'e'}
try inverse transform: ['a' 'a' 'c' 'c' 'b' 'd' 'e']


In [24]:
from train_tools import MajorityVoteClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

kn_clf = KNeighborsClassifier(n_neighbors=1,
                              p=2,
                              metric='minkowski')
tree_clf = DecisionTreeClassifier(max_depth=1,
                                  criterion='entropy',
                                  random_state=0)

# ERROR when using svm_pipe
# maj_clf = MajorityVoteClassifier([forest, log_reg_pipe, svm_pipe])
maj_clf = MajorityVoteClassifier([tree_clf, log_reg_pipe, kn_clf])
# maj_clf.get_params()

In [25]:
from sklearn.model_selection import cross_val_score

# all_clf = [forest, log_reg_pipe, svm_pipe, maj_clf]
# clf_labels = ['Random forest', 'Log reg', 'SVM', 'Majoritary']

all_clf = [tree_clf, log_reg_pipe, kn_clf, maj_clf]
clf_labels = ['Tree', 'Log reg', 'KNeigh', 'Majoritary']
for clf, label in zip(all_clf, clf_labels):
    # scoring='roc_auc'
    scores = cross_val_score(estimator=clf,
                             X=X_train,
                             y=y_train,
                             cv=10,
                             scoring='roc_auc')
    print(f'score: {scores.mean()} / std: {scores.std()} / {label}')

score: 0.7185294343906976 / std: 0.02155674752669875 / Tree
score: 0.7925879846232414 / std: 0.02517204484267896 / Log reg
score: 0.6491123317465302 / std: 0.01698809256861472 / KNeigh
score: 0.7726387200254241 / std: 0.023096639285711703 / Majoritary


In [14]:
df_test = pd.read_csv('data/test.csv')
# grid_search.predict

X_submit = df_tools.prepare_dataset(df_test, with_y=False)
y_submit = svm_pipe.predict(X_submit)
y_submit

answer_ser = pd.Series(y_submit, dtype='int32').astype(bool)
df_submit = pd.read_csv('data/sample_submission.csv')
df_submit['Transported'] = answer_ser

# df_submit.to_csv('answer.csv', index=False)
# df_submit

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['CryoSleep'] = df2['CryoSleep'].astype(int)


Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,False
