In [1]:
import pandas as pd 

In [2]:
# Load the training set into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# will not run elsewhere until it is pointed at a local copy of train.csv.
data = pd.read_csv(r"C:\Users\Harsha\OneDrive\Desktop\dataverse23\train.csv")

In [3]:
# Show the raw frame as loaded (pandas renders a truncated head/tail preview).
data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [4]:
# Impute missing values in the training frame.
#   * categorical columns -> mode (most frequent value)
#   * numerical columns   -> median
#
# The filled column is assigned back to `data` instead of calling
# `data[column].fillna(..., inplace=True)`. The in-place form mutates an
# intermediate object selected from the frame; recent pandas versions warn
# about it, and with copy-on-write enabled it leaves `data` unchanged.

# Categorical columns
for column in ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP', 'Name']:
    data[column] = data[column].fillna(data[column].mode()[0])

# Numerical columns
for column in ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
    data[column] = data[column].fillna(data[column].median())

# Check if there are any missing values left: the last expression displays only
# the columns that still contain nulls (an empty Series means none remain).
missing_values_after_imputation = data.isnull().sum()
missing_values_after_imputation[missing_values_after_imputation > 0]

Series([], dtype: int64)

In [5]:
# Inspect the HomePlanet column after imputation.
data['HomePlanet']

0       Europa
1        Earth
2       Europa
3       Europa
4        Earth
         ...  
8688    Europa
8689     Earth
8690     Earth
8691    Europa
8692    Europa
Name: HomePlanet, Length: 8693, dtype: object

In [6]:
# Per-column count of missing values; all zeros confirms the imputation above.
data.isnull().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
Transported     0
dtype: int64

In [7]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [8]:
# Names of the float64/int64 columns; these are later passed to the scaler.
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns

In [9]:
# Names of the object/bool columns. This selection also picks up the identifier
# columns and the target, which are removed in the following cell.
categorical_cols = data.select_dtypes(include=['object', 'bool']).columns


In [10]:
# Remove the identifier columns and the target from the categorical feature list.
# errors='ignore' keeps this cell idempotent: without it, running the cell a
# second time raises KeyError because the labels have already been dropped.
categorical_cols = categorical_cols.drop(['PassengerId', 'Name', 'Transported'], errors='ignore')

In [11]:
# Show the categorical feature columns that will be one-hot encoded.
categorical_cols

Index(['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP'], dtype='object')

In [12]:
# Show the numerical feature columns that will be standardized.
numerical_cols

Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], dtype='object')

In [13]:
# Scaler applied to the numerical columns inside the ColumnTransformer.
numerical_transformer = StandardScaler()

In [14]:
# One-hot encoder for the categorical columns. handle_unknown='ignore' means a
# category not seen during fit does not raise an error at transform time.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

In [15]:
# Column-wise preprocessing: scale the numerical columns, one-hot encode the
# categorical ones. Output columns follow the order the transformers are listed.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [16]:
from sklearn.ensemble import RandomForestClassifier
# Baseline classifier: a 100-tree random forest with a fixed random_state.
model = RandomForestClassifier(n_estimators=100, random_state=0)

In [17]:
# Baseline pipeline: preprocessing followed by the random forest.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

In [18]:
# Feature matrix: everything except the target and the two identifier columns.
X = data.drop(columns=['Transported', 'PassengerId', 'Name'])
# Target vector.
y = data['Transported']

In [19]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [20]:
# Fit the preprocessing steps and the baseline model on the training split.
pipeline.fit(X_train, y_train)

In [22]:
# Inspect the training features (row labels are the original, shuffled indices).
X_train

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
4278,Europa,False,C/167/S,55 Cancri e,54.0,False,0.0,559.0,0.0,15238.0,2799.0
5971,Earth,False,F/1307/P,TRAPPIST-1e,20.0,False,0.0,20.0,1.0,696.0,0.0
464,Mars,False,F/90/S,TRAPPIST-1e,43.0,False,1821.0,0.0,47.0,29.0,0.0
4475,Earth,False,F/896/S,TRAPPIST-1e,24.0,False,185.0,0.0,476.0,1810.0,53.0
8469,Europa,True,C/335/S,55 Cancri e,25.0,False,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
4373,Europa,True,B/154/P,55 Cancri e,32.0,False,0.0,0.0,0.0,0.0,0.0
7891,Earth,False,F/1620/S,TRAPPIST-1e,22.0,False,0.0,0.0,6.0,0.0,733.0
4859,Mars,False,E/330/S,TRAPPIST-1e,29.0,False,523.0,0.0,21.0,4.0,811.0
3264,Earth,False,G/574/P,TRAPPIST-1e,0.0,False,0.0,0.0,0.0,0.0,0.0


In [23]:
# Predict the target for the held-out split with the baseline pipeline.
preds = pipeline.predict(X_test)

In [24]:
# Show the predicted labels (a boolean array).
preds

array([False,  True, False, ...,  True, False, False])

In [25]:
from sklearn.metrics import accuracy_score
# Accuracy of the baseline pipeline on the held-out split.
score = accuracy_score(y_test, preds)
print('Accuracy:', score)

Accuracy: 0.7768832662449684


In [27]:

from sklearn.model_selection import GridSearchCV

# Search space for the random forest. The 'model__' prefix routes each setting
# to the pipeline step named 'model'. 3 * 4 * 3 * 3 = 108 candidates.
param_grid = dict(
    model__n_estimators=[100, 200, 300],
    model__max_depth=[None, 10, 20, 30],
    model__min_samples_split=[2, 5, 10],
    model__min_samples_leaf=[1, 2, 4],
)

# Exhaustive search with 5-fold cross-validation per candidate, run in parallel
# (n_jobs=-1). This is the expensive cell of the notebook: 540 fits in total.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Winning configuration and its mean cross-validated score.
best_params, best_score = grid_search.best_params_, grid_search.best_score_

print('Best parameters:', best_params)
print('Best score:', best_score)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best parameters: {'model__max_depth': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 5, 'model__n_estimators': 200}
Best score: 0.7909120812623806


In [28]:
# Validate the tuned configuration: 5-fold cross-validation over the full
# dataset, followed by a single check on the earlier train/test split.
from sklearn.model_selection import cross_val_score

# Random forest rebuilt with the hyperparameters reported by the grid search.
# They are hard-coded here, so keep them in sync if the search is re-run.
best_model = RandomForestClassifier(
    random_state=0,
    n_estimators=200,
    min_samples_split=5,
    min_samples_leaf=1,
    max_depth=None,
)

# Tuned pipeline. Note it reuses the same `preprocessor` object as the baseline
# `pipeline`, so fitting one refits the preprocessing seen by the other.
pipeline_best = Pipeline([
    ('preprocessor', preprocessor),
    ('model', best_model),
])

# Cross-validated accuracy, summarized by its mean and standard deviation.
scores = cross_val_score(pipeline_best, X, y, cv=5)
mean_score, std_dev_score = scores.mean(), scores.std()

print('Cross-validation scores:', scores)
print('Mean score:', mean_score)
print('Standard deviation:', std_dev_score)

# Backtest: fit on the training split and score on the held-out split.
backtest_preds = pipeline_best.fit(X_train, y_train).predict(X_test)
backtest_score = accuracy_score(y_test, backtest_preds)

print('Backtesting score on the test set:', backtest_score)

Cross-validation scores: [0.78953422 0.78320874 0.78205865 0.80379747 0.80034522]
Mean score: 0.7917888605742093
Standard deviation: 0.008840778461920745
Backtesting score on the test set: 0.7855089131684876


In [29]:
# Rank the features of the tuned random forest by importance.
# The pipeline is refit first so the fitted attributes read below exist.
pipeline_best.fit(X_train, y_train)

# Fitted one-hot encoder inside the preprocessing step.
cat_encoder = pipeline_best.named_steps['preprocessor'].named_transformers_['cat']

# Use 'get_feature_names_out' when the encoder has it, else 'get_feature_names'.
encoded_features = list(
    cat_encoder.get_feature_names_out()
    if hasattr(cat_encoder, 'get_feature_names_out')
    else cat_encoder.get_feature_names()
)

# Numerical columns come first, then the one-hot columns, mirroring the order
# in which the transformers are listed in the ColumnTransformer.
feature_names = list(numerical_cols) + encoded_features

# One importance value per transformed feature.
importances = pipeline_best.named_steps['model'].feature_importances_

# Table of features sorted from most to least important.
feature_importances = (
    pd.DataFrame({'feature': feature_names, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)

print(feature_importances.head(10))

              feature  importance
4                 Spa    0.069306
10     CryoSleep_True    0.063090
1         RoomService    0.063011
5              VRDeck    0.059464
2           FoodCourt    0.057379
9     CryoSleep_False    0.055537
3        ShoppingMall    0.050168
0                 Age    0.049818
6    HomePlanet_Earth    0.018880
7   HomePlanet_Europa    0.016086


In [42]:
# Build the submission file from the competition test set.

# Load test data
test_data = pd.read_csv(r"C:\Users\Harsha\OneDrive\Desktop\dataverse23\test.csv")

# Handle missing values in the test data, same strategy as for the training
# frame: mode for categorical columns, median for numerical columns.
# This operates on `test_data`; the previous code referenced `test_d`, which is
# never defined in this notebook and raised NameError on a fresh kernel.
# The filled column is assigned back rather than using `inplace=True` on a
# selected column, which does not reliably update the parent frame.
for column in ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP', 'Name']:
    test_data[column] = test_data[column].fillna(test_data[column].mode()[0])

for column in ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
    test_data[column] = test_data[column].fillna(test_data[column].median())

# Keep only the predictor columns (the test set has no target column).
# NOTE(review): this rebinds `X_test`, replacing the hold-out split created by
# train_test_split; re-running the earlier evaluation cells after this one
# would pair these rows with the old `y_test`.
X_test = test_data.drop(['PassengerId', 'Name'], axis=1)

# `pipeline` must already be fitted on the training data at this point.
# NOTE(review): this is the baseline pipeline; the tuned `pipeline_best` is not
# used here - confirm that is intended.

# Make predictions
test_predictions = pipeline.predict(X_test)

# Convert predictions to True and False
boolean_test_predictions = (test_predictions == 1)

# Create a DataFrame with the predicted values and 'PassengerId'
submission_df = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Transported_Predicted': boolean_test_predictions})

# Save predictions to a file
submission_df.to_csv('submission1.csv', index=False)