In [73]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from google.colab import files
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import OneHotEncoder


In [74]:
uploaded = files.upload()

Saving space_titanic_test.csv to space_titanic_test (5).csv
Saving space_titanic_train.csv to space_titanic_train (5).csv


In [75]:
train_df = pd.read_csv('space_titanic_train.csv')
test_df = pd.read_csv('space_titanic_test.csv')


train_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [76]:
# Usunięcie niepotrzebnych kolumn
cols_to_drop = ['Name', 'PassengerId', 'Cabin']
train_df = train_df.drop(columns=cols_to_drop)
test_df = test_df.drop(columns=cols_to_drop)


In [77]:
# Podział na cechy i target
X = train_df.drop('Transported', axis=1)
y = train_df['Transported'].astype(int)

In [78]:
# Imputacja danych numerycznych
num_imputer = SimpleImputer(strategy='mean')
X_num = X.select_dtypes(include=['float64'])
X[X_num.columns] = num_imputer.fit_transform(X_num)
test_df[X_num.columns] = num_imputer.transform(test_df[X_num.columns])

In [79]:
# Imputacja danych kategorycznych
cat_imputer = SimpleImputer(strategy='most_frequent')
X_cat = X.select_dtypes(include=['object', 'bool'])
X[X_cat.columns] = cat_imputer.fit_transform(X_cat)
test_df[X_cat.columns] = cat_imputer.transform(test_df[X_cat.columns])

In [80]:
# Kodowanie zmiennych kategorycznych
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_encoded = encoder.fit_transform(X[X_cat.columns])
test_encoded = encoder.transform(test_df[X_cat.columns])

In [81]:
# Finalne przygotowanie danych
X_final = pd.concat([X[X_num.columns].reset_index(drop=True), pd.DataFrame(X_encoded, columns=encoder.get_feature_names_out(X_cat.columns))], axis=1)
X_test_final = pd.concat([test_df[X_num.columns].reset_index(drop=True), pd.DataFrame(test_encoded, columns=encoder.get_feature_names_out(X_cat.columns))], axis=1)


In [82]:
# Podział danych na treningowe i walidacyjne
X_train, X_valid, y_train, y_valid = train_test_split(X_final, y, test_size=0.2, random_state=42)


In [83]:
# Trenowanie modelu (Random Forest)
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train, y_train)

In [84]:
# Ewaluacja
preds = model_rf.predict(X_valid)
print(f'Accuracy: {accuracy_score(y_valid, preds):.3f}')
print(f'Precision: {precision_score(y_valid, preds):.3f}')
print(f'Recall: {recall_score(y_valid, preds):.3f}')
print(f'F1 Score: {f1_score(y_valid, preds):.3f}')
print(f'ROC AUC: {roc_auc_score(y_valid, preds):.3f}')

Accuracy: 0.770
Precision: 0.769
Recall: 0.779
F1 Score: 0.774
ROC AUC: 0.770


regresja logistyczna

In [85]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

model_lr = LogisticRegression(max_iter=2000, random_state=42)
model_lr.fit(X_train_scaled, y_train)


In [86]:
# Ewaluacja Regresji Logistycznej
preds_lr = model_lr.predict(X_valid_scaled)

print('\nLogistic Regression:')
print(f'Accuracy: {accuracy_score(y_valid, preds_lr):.3f}')
print(f'Precision: {precision_score(y_valid, preds_lr):.3f}')
print(f'Recall: {recall_score(y_valid, preds_lr):.3f}')
print(f'F1 Score: {f1_score(y_valid, preds_lr):.3f}')
print(f'ROC AUC: {roc_auc_score(y_valid, preds_lr):.3f}')


Logistic Regression:
Accuracy: 0.776
Precision: 0.764
Recall: 0.805
F1 Score: 0.784
ROC AUC: 0.776


In [89]:
final_predictions = model_rf.predict(X_test_final)
submission = pd.DataFrame({'PassengerId': pd.read_csv('space_titanic_test.csv')['PassengerId'], 'Transported': final_predictions.astype(bool)})
submission.to_csv('submission.csv', index=False)

submission.head()


Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False
