In [235]:
import pandas as pd
import numpy as np

In [236]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [None]:
df = 

## Preprocessing

In [237]:
train.drop(['Name'], axis=1, inplace=True)
test.drop(['Name'], axis=1, inplace=True)


In [238]:
destination_mapping = {'TRAPPIST-1e': '1', '55 Cancri e': '2', 'PSO J318.5-22' : '3'}

train['Destination'].replace(destination_mapping, inplace=True)
test['Destination'].replace(destination_mapping, inplace=True)


In [239]:
# Assuming your DataFrame is called 'df'
train.columns = [col.lower() for col in train.columns]
test.columns = [col.lower() for col in test.columns]


In [240]:
# Assuming 'train' is your DataFrame
categorical_columns = train.select_dtypes(include='object').columns

for col in categorical_columns:
    mode_value = train[col].mode().iloc[0]  # Calculate the mode
    train[col].fillna(mode_value, inplace=True)  # Impute missing values with the mode
    test[col].fillna(mode_value, inplace=True)  # Impute missing values with the mode

# Now 'train' contains no missing values in the categorical columns


In [241]:
train.isna().sum()

homeplanet        0
cryosleep         0
cabin             0
destination       0
age             179
vip               0
roomservice     181
foodcourt       183
shoppingmall    208
spa             183
vrdeck          188
transported       0
dtype: int64

In [242]:
numeric_columns = train.select_dtypes(include='number').columns  # Replace with actual column names

for col in numeric_columns:
    median_value = train[col].median()  # Calculate the median
    train[col].fillna(median_value, inplace=True)  # Impute missing values with the median
    test[col].fillna(median_value, inplace=True)  # Impute missing values with the median



In [243]:
train.isna().sum()

homeplanet      0
cryosleep       0
cabin           0
destination     0
age             0
vip             0
roomservice     0
foodcourt       0
shoppingmall    0
spa             0
vrdeck          0
transported     0
dtype: int64

In [244]:
train.nunique()

homeplanet         3
cryosleep          2
cabin           6560
destination        3
age               80
vip                2
roomservice     1273
foodcourt       1507
shoppingmall    1115
spa             1327
vrdeck          1306
transported        2
dtype: int64

In [245]:

columns_to_encode = ['homeplanet', 'destination']

train = pd.get_dummies(train, columns=columns_to_encode)
test = pd.get_dummies(test, columns=columns_to_encode)



In [246]:
binary_cols = ['cryosleep', 'vip']
train[binary_cols] = train[binary_cols].astype(int)
test[binary_cols] = test[binary_cols].astype(int)


In [247]:
train['transported'] = train['transported'].astype(int)


In [248]:
train.drop('cabin', axis=1, inplace=True)
test.drop('cabin', axis=1, inplace=True)

In [249]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()


In [251]:
numeric_columns = ['age', 'roomservice', 'foodcourt', 'shoppingmall', 'spa', 'vrdeck']

In [252]:
# Assuming 'train' is your training DataFrame
train[numeric_columns] = scaler.fit_transform(train[numeric_columns])

# Assuming 'test' is your test DataFrame
test[numeric_columns] = scaler.transform(test[numeric_columns])


In [255]:
X = train.drop('transported', axis=1)
y = train.transported

In [258]:
y

0       0
1       1
2       0
3       0
4       1
       ..
8688    0
8689    0
8690    1
8691    0
8692    1
Name: transported, Length: 8693, dtype: int64

In [262]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

In [265]:
from sklearn.linear_model import LogisticRegression


model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)


In [268]:
from sklearn.metrics import classification_report


print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.83      0.78       862
           1       0.81      0.71      0.76       877

    accuracy                           0.77      1739
   macro avg       0.77      0.77      0.77      1739
weighted avg       0.77      0.77      0.77      1739



In [270]:
from xgboost import XGBClassifier

xgb = XGBClassifier(
    objective='binary:logistic',  # For binary classification
    eval_metric='logloss',  # Logarithmic loss as the evaluation metric
    use_label_encoder=False,  # Avoid warnings about deprecated label encoding
    random_state=42
)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9]
}
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='f1',  # F1-score as the evaluation metric
    cv=5,  # 5-fold cross-validation
    verbose=1,
    n_jobs=-1  # Use all available CPU cores
)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_xgb_model = grid_search.best_estimator_

# Evaluate on the test set
y_pred = best_xgb_model.predict(X_test)
# Compute F1-score 
from sklearn.metrics import f1_score
f1 = f1_score(y_test, y_pred)

print("Best hyperparameters:", best_params)
print("F1-score on test set:", f1)



Fitting 5 folds for each of 108 candidates, totalling 540 fits



`use_label_encoder` is deprecated in 1.7.0.


`use_label_encoder` is deprecated in 1.7.0.


`use_label_encoder` is deprecated in 1.7.0.



Best hyperparameters: {'colsample_bytree': 0.9, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
F1-score on test set: 0.8136882129277566


In [282]:
submission = pd.DataFrame(best_xgb_model.predict(test), columns=['transported'])

In [283]:
submission['transported'] = submission.transported.astype(bool)

In [285]:
tst = pd.read_csv('../data/test.csv')

In [291]:
submission

Unnamed: 0,transported
0,True
1,False
2,True
3,True
4,True
...,...
4272,True
4273,False
4274,True
4275,True


In [290]:
tst

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter
4273,9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron
4274,9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore
4275,9273_01,Europa,False,D/297/P,,,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale


In [292]:
concatenated_df = pd.concat([tst['PassengerId'], submission] ,axis=1)


In [298]:
concatenated_df.rename(columns={'transported': 'Transported', }, inplace=True)


In [296]:
sma = pd.read_csv('sample_submission.csv')

In [297]:
sma

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,False
3,0021_01,False
4,0023_01,False
...,...,...
4272,9266_02,False
4273,9269_01,False
4274,9271_01,False
4275,9273_01,False


In [300]:
import pandas as pd

# Assuming 'df' is your DataFrame
concatenated_df.to_csv('prediction.csv', index=False)
