# 1. IMPORTING LIBRARIES AND LOADING DATA

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')
sample_sub = pd.read_csv('./sample_submission.csv')

y = train_df.pop('Transported') # label column has been separated from train dataframe

In [2]:
train_df.head(10)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines
5,0005_01,Earth,False,F/0/P,PSO J318.5-22,44.0,False,0.0,483.0,0.0,291.0,0.0,Sandie Hinetthews
6,0006_01,Earth,False,F/2/S,TRAPPIST-1e,26.0,False,42.0,1539.0,3.0,0.0,0.0,Billex Jacostaffey
7,0006_02,Earth,True,G/0/S,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,0.0,,Candra Jacostaffey
8,0007_01,Earth,False,F/3/S,TRAPPIST-1e,35.0,False,0.0,785.0,17.0,216.0,0.0,Andona Beston
9,0008_01,Europa,True,B/1/P,55 Cancri e,14.0,False,0.0,0.0,0.0,0.0,0.0,Erraiam Flatic


In [3]:
train_df.shape,test_df.shape,sample_sub.shape

((8693, 13), (4277, 13), (4277, 2))

# 2. DATA WRANGLING AND PREPROCESSING

In [4]:
all_df = pd.concat([train_df,test_df],ignore_index=True)
all_df.isna().sum()

PassengerId       0
HomePlanet      288
CryoSleep       310
Cabin           299
Destination     274
Age             270
VIP             296
RoomService     263
FoodCourt       289
ShoppingMall    306
Spa             284
VRDeck          268
Name            294
dtype: int64

In [5]:
all_df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12965,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter
12966,9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron
12967,9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore
12968,9273_01,Europa,False,D/297/P,,,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale


In [18]:
all_df[['Deck','Num','Side']] = all_df['Cabin'].str.split('/',expand = True)

In [19]:
all_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Deck,Num,Side
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,B,0,P
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,F,0,S
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,A,0,S
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,A,0,S
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,F,1,S


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
imp_freq = SimpleImputer(strategy='most_frequent')
cols_to_impute = ['HomePlanet','CryoSleep','Destination','VIP','Deck','Num','Side']
imp_freq.fit_transform()



In [17]:

for i in ['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']:
    all_df[i].fillna(all_df[i].mean(),inplace=True)
all_df


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.000000,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.000000,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.000000,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.000000,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.000000,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12965,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.000000,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter
12966,9269_01,Earth,False,,TRAPPIST-1e,42.000000,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron
12967,9271_01,Mars,True,D/296/P,55 Cancri e,28.771969,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore
12968,9273_01,Europa,False,D/297/P,,28.771969,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale


In [18]:
# check empty
all_df.isna().sum()

HomePlanet      288
CryoSleep       310
Cabin           299
Destination     274
Age             270
VIP             296
RoomService     263
FoodCourt       289
ShoppingMall    306
Spa             284
VRDeck          268
dtype: int64

# FEATURE ENGINEERING

In [None]:
all_df.drop(columns=['PassengerId','Name'],inplace=True)


In [6]:

categorical_columns = all_df.select_dtypes(include='object').columns.values
numerical_columns = all_df.select_dtypes(include='number').columns.values

Categorical columns are: ['PassengerId' 'HomePlanet' 'CryoSleep' 'Cabin' 'Destination' 'VIP' 'Name']
Numerical columns are: ['Age' 'RoomService' 'FoodCourt' 'ShoppingMall' 'Spa' 'VRDeck']


# 4. MODELLING

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV,StratifiedKFold,cross_val_score,cross_val_predict
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
X = train
X_train,X_val, y_train, y_val = train_test_split(X,y, random_state=0,test_size=.2,stratify=y)



class Model:
    '''
    parameters -> 
        model : the name of the estimator(model) is passed here
        params_dict : the parameter dictionary is passed here
        
    methods -> 
        fit: fits the model 
        cross_validate(cv=k): performs KFold cross validation on the model
        
    WARNING: Provided the training and testing splits are already provided outside the class
    
    '''
    def __init__(self,model):
        self.model = model
        print(self.model.__class__)
        
    def fit(self):
        self.model.fit(X_train,y_train)
        train_accuracy,validation_accuracy = self.model.score(X_train,y_train),self.model.score(X_val,y_val)
        print(f'Train accuracy: {train_accuracy:.4f}')
        print(f'Validation accuracy: {validation_accuracy :.4f}')
        print(f'Overfit: {train_accuracy - validation_accuracy:.4f}')
        
    def prediction(self):
        return self.model.predict(X_val)
        
    def cross_validate(self,cv=5):
        kf = StratifiedKFold(n_splits=cv)
        print('Cross validation score:', np.mean(cross_val_score(self.model,X_train,y_train,cv=kf),axis=0))
    
    def tune(self, grid_params,k = 5):
        grid = GridSearchCV(self.model,grid_params, cv=k , scoring='accuracy',n_jobs=-1)
        print('\n================ EXHAUSTIVE GRID SEARCH ==================')
        grid.fit(X_train,y_train)
        print(grid.best_estimator_)# printing this as the output to prevent time wasted incase i decide to re-run this entire notebook
        
        
    def save(self,file_name):
        X_test = test # test set 
        sub_file = sample_sub.copy()
        sub_file['Transported'] = self.model.predict(X_test)
        sub_file.to_csv(file_name,index=False)
        print('Prediction saved at', './' + file_name)
    
    def CM(self):
        metrics.plot_confusion_matrix(self.model,X_val,y_val)
        
        
        
    
    
        

NameError: name 'train' is not defined

In [None]:
# logistic regression
lr = Model(LogisticRegression(C=0.1, solver='liblinear'))
lr.fit()


lr_params = {
    'solver':['liblinear','saga'],
    'C':[.001,.01,.1,1,10,100],
}

lr.tune(lr_params)


In [None]:
gboost = Model(GradientBoostingClassifier(random_state = 34))
gboost.fit()

gboost_grid = {
    'max_depth':[1,2,3,4,5],
    'n_estimators':[200,100,120],
    'learning_rate':[.01,.001,.1,1]
}

tuned_gboost = gboost.tune(gboost_grid)
print(tuned_gboost)



# ESKAY EXITING :) PEACE OUT