In [1]:
import pandas as pd
import df_tools

# Load the raw training table from the project's data directory.
df = pd.read_csv('data/train.csv')
# prepare_dataset is a project-local helper that returns the feature matrix
# and the target. NOTE(review): this call emits a pandas SettingWithCopyWarning
# from inside df_tools (`df2[col] = df2[col].astype(int)`); the fix belongs in
# df_tools (assign via .loc or on an explicit .copy()), not in this notebook.
X, y = df_tools.prepare_dataset(df)
# Show the feature matrix as a quick sanity check (it renders as a NumPy array).
X

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].astype(int)


array([[ 0., 39.,  0., ...,  0.,  0.,  0.],
       [ 0., 24.,  0., ...,  0.,  0.,  1.],
       [ 0., 58.,  1., ...,  0.,  0.,  1.],
       ...,
       [ 0., 26.,  0., ...,  1.,  0.,  1.],
       [ 0., 32.,  0., ...,  0.,  0.,  1.],
       [ 0., 44.,  0., ...,  0.,  0.,  1.]])

In [2]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for evaluation. Stratifying on y keeps the class
# balance the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1, stratify=y)

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Baseline model: standardize the features, then fit a logistic regression.
# make_pipeline names the steps after their lowercased class names, which is
# what the 'logisticregression__C' key in the grid search relies on.
log_reg_pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(C=10, solver='lbfgs', random_state=1),
).fit(X_train, y_train)

# Mean accuracy on the data the model was fitted on and on the held-out split.
log_reg_train_score = log_reg_pipe.score(X_train, y_train)
log_reg_test_score = log_reg_pipe.score(X_test, y_test)

print('Log reg train score:', log_reg_train_score)
print('Log reg test score:', log_reg_test_score)

Log reg train score: 0.7238840312931432
Log reg test score: 0.7332106715731371


In [4]:
from sklearn.model_selection import GridSearchCV

# Candidate inverse-regularization strengths, one per decade from 1e-4 to 1e3.
param_range = [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]
# The key addresses the C parameter of the pipeline's logistic-regression step.
param_grid = [{'logisticregression__C': param_range}]

# 10-fold cross-validated search over C on the training split only, using all
# available cores; refit=True retrains the best configuration on all of
# X_train so the fitted search object can be used directly for prediction.
grid_search = GridSearchCV(
    log_reg_pipe,
    param_grid,
    scoring='accuracy',
    cv=10,
    refit=True,
    n_jobs=-1,
).fit(X_train, y_train)

print('best_score:', grid_search.best_score_)
print('best_params:', grid_search.best_params_)

best_score: 0.7203591925588759
best_params: {'logisticregression__C': 0.1}


In [5]:
from sklearn.ensemble import RandomForestClassifier
# Random-forest comparison model, used here mainly to inspect which features
# carry signal. The fixed seed keeps the fitted forest repeatable.
forest = RandomForestClassifier(n_estimators=500,
                                random_state=1)
forest.fit(X_train, y_train)

print('RandomForest train score:', forest.score(X_train, y_train))
print('RandomForest test score:', forest.score(X_test, y_test))

# One importance value per feature column of X_train.
importances = forest.feature_importances_
print(importances)

# Hand-maintained feature names.
# NOTE(review): assumed to list the columns of X in the exact order produced by
# df_tools.prepare_dataset -- confirm against that helper whenever it changes.
final_columns = ['CryoSleep', 'Age', 'VIP', 'RoomService', 'GroupSize', 'HomePlanet_Europa', 'HomePlanet_Mars',
                 'Dest_PSO', 'Dest_TRAPPIST', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G',
                 'Deck_T', 'Side_S']

# zip() silently stops at the shorter input, which would mislabel or drop
# importances if the preprocessing gains or loses a column. Fail loudly instead.
if len(final_columns) != len(importances):
    raise ValueError(
        f'final_columns has {len(final_columns)} names but the forest reports '
        f'{len(importances)} feature importances; update final_columns to match X.'
    )

items = zip(final_columns, importances)
print('Column importances')
# Most important feature first.
print(sorted(items, key=lambda v: v[1], reverse=True))

RandomForest train score: 0.9375671115201718
RandomForest test score: 0.7014719411223551
[1.81217061e-01 3.58742970e-01 5.78309089e-03 2.01594569e-01
 7.96608645e-02 1.88290178e-02 2.38635112e-02 1.03604426e-02
 2.20749549e-02 8.38150387e-03 8.85427330e-03 6.62428741e-03
 1.40033861e-02 1.59338934e-02 1.75892886e-02 2.08001916e-04
 2.62788831e-02]
Column importances
[('Age', 0.3587429704499815), ('RoomService', 0.2015945692044512), ('CryoSleep', 0.18121706071173097), ('GroupSize', 0.07966086449482429), ('Side_S', 0.02627888306371346), ('HomePlanet_Mars', 0.023863511179589885), ('Dest_TRAPPIST', 0.0220749549204146), ('HomePlanet_Europa', 0.018829017823327254), ('Deck_G', 0.01758928863718537), ('Deck_F', 0.01593389335196498), ('Deck_E', 0.014003386147106767), ('Dest_PSO', 0.010360442633452201), ('Deck_C', 0.00885427329643283), ('Deck_B', 0.008381503866714905), ('Deck_D', 0.006624287410552387), ('VIP', 0.005783090892352364), ('Deck_T', 0.00020800191620504987)]


In [6]:
# Build the submission file from the tuned logistic-regression pipeline.
df_test = pd.read_csv('data/test.csv')

# Apply the same project-local preprocessing as for training.
# NOTE(review): with_y=False presumably makes the helper return only the
# feature matrix (no target) -- confirm in df_tools.
X_submit = df_tools.prepare_dataset(df_test, with_y=False)
# grid_search was built with refit=True, so predict() uses the best pipeline
# found by the search, refitted on the full training split.
y_submit = grid_search.predict(X_submit)

# The submission column is boolean; go through int32 first, then cast to bool.
answer_ser = pd.Series(y_submit, dtype='int32').astype(bool)
df_submit = pd.read_csv('data/sample_submission.csv')

# Assigning a Series aligns on the index, so a row-count mismatch would
# silently write NaN or drop predictions. Check explicitly and assign by
# position so a mismatch cannot go unnoticed.
# NOTE(review): this still assumes sample_submission.csv lists passengers in
# the same order as test.csv -- confirm.
if len(answer_ser) != len(df_submit):
    raise ValueError(
        f'Got {len(answer_ser)} predictions for {len(df_submit)} submission rows; '
        'check that prepare_dataset keeps every row of test.csv.'
    )
df_submit['Transported'] = answer_ser.to_numpy()
df_submit.to_csv('answer.csv', index=False)
df_submit

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].astype(int)


Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,False
