In [255]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

In [256]:
df = pd.read_csv('data/train.csv')
display(df)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [257]:
# with pd.option_context('display.max_rows', None):
#     display(df)
    # df_transported_true = df[df['Transported'] == True]
    # display(df_transported_true)
# pd.reset_option('display.max_columns', None)

In [258]:
####### Number of missing data ########

df.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [259]:
######## Type of data #########

df.dtypes

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

In [260]:
######## number of unique values ##########

df.nunique()

PassengerId     8693
HomePlanet         3
CryoSleep          2
Cabin           6560
Destination        3
Age               80
VIP                2
RoomService     1273
FoodCourt       1507
ShoppingMall    1115
Spa             1327
VRDeck          1306
Name            8473
Transported        2
dtype: int64

In [261]:
def encode(original_df):
    df_to_return = original_df.copy()
    df_to_return[['Deck', 'num', 'side']] = original_df['Cabin'].str.split('/', expand=True)
    df_to_return = df_to_return.drop(columns='Cabin')

    age_bins = [0, 13, 25, 44, 59, 100]
    age_labels = ["0-13", "14-25", "26-44", "45-59", "60+"]
    df_to_return['Age'] = pd.cut(original_df['Age'], bins=age_bins, labels=age_labels)

    df_to_return['LastName'] = original_df['Name'].str.split(',').str[1]
    df_to_return = df_to_return.drop(columns='Name')

    column_to_encode = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'side', 'Age', 'LastName']

    enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

    encoded_data = enc.fit_transform(df_to_return[column_to_encode])

    enc_df = pd.DataFrame(
        encoded_data,
        columns=enc.get_feature_names_out(column_to_encode)
    )

    df_to_return = pd.concat([df_to_return.drop(columns=column_to_encode), enc_df], axis=1)

    df_to_return['Group'] = original_df['PassengerId'].str.split('_').str[0]
    df_to_return['Id'] = original_df['PassengerId'].str.split('_').str[1]

    df_to_return = df_to_return.drop(columns='PassengerId')

    df_to_return['Group'] = df_to_return['Group'].astype(int)
    df_to_return['Id'] = df_to_return['Id'].astype(int)
    df_to_return['num'] = df_to_return['num'].astype(float)

    return df_to_return

In [262]:
from sklearn.model_selection import train_test_split

df_to_model = encode(df)

df_y = df_to_model['Transported']
df_x = df_to_model.drop(columns=['Transported'])

train_x, val_x, train_y, val_y = train_test_split(df_x,df_y, test_size=0.3, random_state=42, shuffle=True)

In [263]:
test_data = pd.read_csv('data/test.csv')
test_data_to_predict = encode(test_data)

In [264]:
model = RandomForestClassifier(n_estimators=500, random_state=42)
model.fit(df_x, df_y)
prediction = model.predict(test_data_to_predict)

id = test_data['PassengerId']

# write passengerid and predicition to submission.csv
submission = pd.DataFrame({
    'PassengerId': id,
    'Transported': prediction
})
submission.to_csv('submission.csv', index=False)

In [265]:
gbr = make_pipeline(KNNImputer(), GradientBoostingClassifier(random_state = 42))
# gbr = GradientBoostingClassifier(random_state = 42)
  
# Fit to training set
gbr.fit(df_x, df_y)
pred = gbr.predict(test_data_to_predict)

id = test_data['PassengerId']

submission_gradientBoost = pd.DataFrame({
    'PassengerId': id,
    'Transported': pred
})
submission_gradientBoost.to_csv('submission_GB.csv', index=False)

In [266]:
xgb = XGBClassifier(n_estimators=500, random_state = 42)
xgb.fit(df_x,df_y)
pred = xgb.predict(test_data_to_predict)
prediction_XGB = xgb.predict(test_data_to_predict)

id = test_data['PassengerId']

submission_XGradientBoost = pd.DataFrame({
    'PassengerId': id,
    'Transported': pred
})
submission_XGradientBoost.replace({1: True, 0: False}, inplace=True)
submission_XGradientBoost.to_csv('submission_XGB.csv', index=False)

  submission_XGradientBoost.replace({1: True, 0: False}, inplace=True)


In [None]:
param_grid = {
    'n_estimators': [50, 100, 150, 300, 500],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 1.0],
}

# Sett opp GridSearchCV
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='accuracy',  # Evalueringsmetode
    cv=3,  # K-fold cross-validation
    verbose=1,
    n_jobs=-1,  # Bruk alle prosessorer
)

grid_search.fit(df_x, df_y)
print(f'best params: {grid_search.best_params_}')

best_xgb = grid_search.best_estimator_
best_xgb.fit(df_x, df_y)
pred = best_xgb.predict(test_data_to_predict)

submission_XGradientBoost_cv = pd.DataFrame({
    'PassengerId': id,
    'Transported': pred
})
submission_XGradientBoost_cv.replace({1: True, 0: False}, inplace=True)
submission_XGradientBoost_cv.to_csv('submission_XGB.csv', index=False)


TypeError: GridSearchCV.__init__() got an unexpected keyword argument 'random_state'

In [268]:



pipeline = make_pipeline(KNNImputer(), RandomForestClassifier(n_estimators=500, random_state=42))
pipeline.fit(train_x, train_y)

rf_pred = pipeline.predict(val_x)
rf_accuracy = accuracy_score(val_y, rf_pred)
print(rf_accuracy)

model = RandomForestClassifier(n_estimators=500, random_state=42)
model.fit(train_x, train_y)
prediction = model.predict(val_x)
print(accuracy_score(val_y, prediction))



0.7963957055214724
0.7983128834355828


In [269]:
from sklearn.decomposition import PCA


pca = make_pipeline(KNNImputer(),PCA(n_components=20, random_state=42))
pca.fit(train_x)
pca_train = pca.transform(train_x)
pca_val = pca.transform(val_x)

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(pca_train, train_y)
rf_pred = rf.predict(pca_val)
print(accuracy_score(val_y, rf_pred))

0.7914110429447853


In [270]:
from sklearn.ensemble import GradientBoostingClassifier
gbr = make_pipeline(KNNImputer(),GradientBoostingClassifier(random_state = 42))
  
# Fit to training set
gbr.fit(train_x, train_y)
pred = gbr.predict(val_x)
accuracy = accuracy_score(val_y, pred)
print(accuracy)

0.7994631901840491


In [271]:
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(train_x,train_y)
pred = xgb.predict(val_x)
print(accuracy_score(val_y, pred))

0.7975460122699386


In [272]:
with pd.option_context('display.max_rows', None):
    display(train_x.dtypes)

RoomService                  float64
FoodCourt                    float64
ShoppingMall                 float64
Spa                          float64
VRDeck                       float64
num                          float64
HomePlanet_Earth             float64
HomePlanet_Europa            float64
HomePlanet_Mars              float64
HomePlanet_nan               float64
CryoSleep_False              float64
CryoSleep_True               float64
CryoSleep_nan                float64
Destination_55 Cancri e      float64
Destination_PSO J318.5-22    float64
Destination_TRAPPIST-1e      float64
Destination_nan              float64
VIP_False                    float64
VIP_True                     float64
VIP_nan                      float64
Deck_A                       float64
Deck_B                       float64
Deck_C                       float64
Deck_D                       float64
Deck_E                       float64
Deck_F                       float64
Deck_G                       float64
D

In [273]:
train_x.dtypes

RoomService                  float64
FoodCourt                    float64
ShoppingMall                 float64
Spa                          float64
VRDeck                       float64
num                          float64
HomePlanet_Earth             float64
HomePlanet_Europa            float64
HomePlanet_Mars              float64
HomePlanet_nan               float64
CryoSleep_False              float64
CryoSleep_True               float64
CryoSleep_nan                float64
Destination_55 Cancri e      float64
Destination_PSO J318.5-22    float64
Destination_TRAPPIST-1e      float64
Destination_nan              float64
VIP_False                    float64
VIP_True                     float64
VIP_nan                      float64
Deck_A                       float64
Deck_B                       float64
Deck_C                       float64
Deck_D                       float64
Deck_E                       float64
Deck_F                       float64
Deck_G                       float64
D

In [274]:

xgb = XGBClassifier()
xgb.fit(train_x,train_y)
pred = xgb.predict(val_x)
print(accuracy_score(val_y, pred))

0.7975460122699386


In [275]:
print(df_x.dtypes)
display(df_to_model)

RoomService                  float64
FoodCourt                    float64
ShoppingMall                 float64
Spa                          float64
VRDeck                       float64
num                          float64
HomePlanet_Earth             float64
HomePlanet_Europa            float64
HomePlanet_Mars              float64
HomePlanet_nan               float64
CryoSleep_False              float64
CryoSleep_True               float64
CryoSleep_nan                float64
Destination_55 Cancri e      float64
Destination_PSO J318.5-22    float64
Destination_TRAPPIST-1e      float64
Destination_nan              float64
VIP_False                    float64
VIP_True                     float64
VIP_nan                      float64
Deck_A                       float64
Deck_B                       float64
Deck_C                       float64
Deck_D                       float64
Deck_E                       float64
Deck_F                       float64
Deck_G                       float64
D

Unnamed: 0,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,num,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,...,side_nan,Age_0-13,Age_14-25,Age_26-44,Age_45-59,Age_60+,Age_nan,LastName_nan,Group,Id
0,0.0,0.0,0.0,0.0,0.0,False,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1,1
1,109.0,9.0,25.0,549.0,44.0,True,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2,1
2,43.0,3576.0,0.0,6715.0,49.0,False,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,3,1
3,0.0,1283.0,371.0,3329.0,193.0,False,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,3,2
4,303.0,70.0,151.0,565.0,2.0,True,1.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,0.0,6819.0,0.0,1643.0,74.0,False,98.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,9276,1
8689,0.0,0.0,0.0,0.0,0.0,False,1499.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,9278,1
8690,0.0,0.0,1872.0,1.0,0.0,True,1500.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,9279,1
8691,0.0,1049.0,0.0,353.0,3235.0,False,608.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,9280,1


In [276]:
corr = df_to_model.corr('pearson', numeric_only=True)
corr.style.background_gradient(cmap ='coolwarm') 

  smin = np.nanmin(gmap) if vmin is None else vmin
  smax = np.nanmax(gmap) if vmax is None else vmax


Unnamed: 0,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,num,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_nan,CryoSleep_False,CryoSleep_True,CryoSleep_nan,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Destination_nan,VIP_False,VIP_True,VIP_nan,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_nan,side_P,side_S,side_nan,Age_0-13,Age_14-25,Age_26-44,Age_45-59,Age_60+,Age_nan,LastName_nan,Group,Id
RoomService,1.0,-0.015889,0.05448,0.01008,-0.019581,-0.244611,-0.008986,-0.139361,-0.068098,0.248013,-0.005029,0.243434,-0.246658,-0.001515,-0.024502,-0.063655,0.06178,-0.003933,-0.050854,0.057173,0.014103,-0.023269,-0.065617,-0.02031,0.162944,0.044106,0.086898,-0.14653,0.007364,0.026772,0.007583,-0.015591,0.026772,-0.074759,-0.003829,0.058503,0.004334,0.010651,-0.043724,,-0.000346,-0.022571
FoodCourt,-0.015889,1.0,-0.014228,0.221891,0.227995,0.046566,-0.183429,-0.211176,0.369477,-0.126142,-0.018768,0.204765,-0.207981,0.000283,0.136412,-0.066214,-0.074411,-0.010415,-0.089639,0.127936,-0.001698,0.119257,0.156435,0.262541,0.020183,-0.03974,-0.13315,-0.151747,0.014134,0.000343,-0.019812,0.0197,0.000343,-0.083448,-0.096449,0.121659,0.044158,0.015449,-0.032223,,-0.009552,0.023391
ShoppingMall,0.05448,-0.014228,1.0,0.013879,-0.007322,0.010141,0.001727,-0.06988,-0.021282,0.112464,-0.008092,0.208465,-0.209265,-0.007289,-0.013522,-0.03176,0.033184,-0.005794,0.001104,0.01872,-0.020022,-0.017418,-0.010739,0.000934,0.055252,0.031655,0.074909,-0.109245,-0.006222,-0.01042,0.022192,-0.019041,-0.01042,-0.062742,0.029212,0.029641,-0.008422,0.003021,-0.041786,,0.017397,-0.038812
Spa,0.01008,0.221891,0.013879,1.0,0.153821,-0.221131,-0.131342,-0.157018,0.269178,-0.089063,-0.010778,0.199075,-0.20053,-0.004711,0.076065,-0.053976,-0.034859,0.007079,-0.037446,0.060888,-0.008229,0.059205,0.115679,0.165828,0.034564,-0.011814,-0.088352,-0.125876,0.036205,0.025807,-0.005801,-0.001964,0.025807,-0.077922,-0.064246,0.075067,0.041559,0.045116,-0.024475,,-0.005211,0.01682
VRDeck,-0.019581,0.227995,-0.007322,0.153821,1.0,-0.207075,-0.13908,-0.15393,0.284801,-0.113128,-0.002701,0.192749,-0.195174,-0.001349,0.090825,-0.045816,-0.047128,-0.010777,-0.087322,0.124351,-0.001257,0.097108,0.123204,0.208311,-0.003495,-0.018676,-0.095165,-0.12863,0.012239,-0.00728,0.009073,-0.00689,-0.00728,-0.072296,-0.069962,0.099102,0.032565,0.004691,-0.02951,,0.015432,0.010082
Transported,-0.244611,0.046566,0.010141,-0.221131,-0.207075,1.0,-0.045097,-0.169019,0.176916,0.019544,0.002712,-0.451744,0.460132,-0.004846,0.108722,9.2e-05,-0.0947,0.000547,0.024602,-0.037261,0.002688,-0.002623,0.144733,0.108193,-0.034046,-0.097965,-0.087753,0.016269,-0.014568,-0.00034,-0.101397,0.101455,-0.00034,0.084842,-0.034593,-0.031739,-0.008397,-0.012186,0.062846,,0.021491,0.06639
num,-0.008986,-0.183429,0.001727,-0.131342,-0.13908,-0.045097,1.0,0.366109,-0.48988,0.062596,0.016265,0.035881,-0.037084,0.002023,-0.142101,0.105092,0.058457,-0.00054,0.059421,-0.098493,0.014884,-0.188939,-0.270447,-0.262918,-0.218505,-0.199181,0.443766,0.191688,-0.028391,,0.037996,-0.037996,,0.065922,0.11538,-0.109029,-0.055721,-0.029415,0.012093,,0.679723,-0.041735
HomePlanet_Earth,-0.139361,-0.211176,-0.06988,-0.157018,-0.15393,-0.169019,0.366109,1.0,-0.604411,-0.534195,-0.163174,0.107932,-0.109133,-0.001297,-0.149534,0.232218,-0.015,0.004267,0.107343,-0.162341,0.011497,-0.18475,-0.332759,-0.325196,-0.25584,-0.052634,0.066562,0.578136,-0.025444,-0.015948,0.020006,-0.015226,-0.015948,0.098933,0.189761,-0.188792,-0.079931,-0.034847,0.02672,,0.002127,-0.073193
HomePlanet_Europa,-0.068098,0.369477,-0.021282,0.269178,0.284801,0.176916,-0.48988,-0.604411,1.0,-0.287022,-0.087673,-0.094665,0.093395,0.008236,0.293517,-0.163308,-0.149679,-0.014224,-0.094805,0.147008,-0.013747,0.299343,0.538379,0.525633,0.080738,-0.077058,-0.392191,-0.368075,0.030943,0.021845,-0.0407,0.034147,0.021845,-0.121965,-0.179003,0.188683,0.094695,0.036098,-0.043816,,0.004945,0.098584
HomePlanet_Mars,0.248013,-0.126142,0.112464,-0.089063,-0.113128,0.019544,0.062596,-0.534195,-0.287022,1.0,-0.077488,-0.031692,0.032715,-0.001669,-0.120996,-0.11126,0.170778,0.010346,-0.033617,0.043523,0.003647,-0.087734,-0.15802,-0.154428,0.23273,0.145291,0.333929,-0.325315,-0.012083,-0.006255,0.02116,-0.019279,-0.006255,0.003424,-0.045795,0.033174,0.001342,0.002727,0.018414,,-0.008802,-0.015418


cryo sleep and expences
agegroups? 
