In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
data = pd.read_csv('train.csv')

In [3]:
def sep_PID(data):
    
    """Data = Dataset to work on. Will compare how many passengers are in a group and will write their frequencies 
    inplace of PassengerId. (Need further memory optimisation.)"""
    
    data['PassengerId'] = [i.split('_')[0] for i in data['PassengerId']]
    
    d = {}
    uq = []

    for i in range(len(data)):

        if data['PassengerId'][i] not in uq:
            d[data['PassengerId'][i]] = 0
            uq.append(data['PassengerId'][i])

        if data['PassengerId'][i] in uq:
            d[data['PassengerId'][i]] += 1
    
    
    data['Group'] = 0
    
    for i in range(len(data)):

        if data['PassengerId'][i] in d:
            data['Group'][i] = d[data['PassengerId'][i]]
    
    return data

In [4]:
def sep_Cabin(data):
    
    """data = Dataset to work on. Splits the cabin column into Deck, Seat number, and side the passenger is 
    travelling. (Need further memory optimisation. )"""
    
    data['Deck'] = 0
    data['Num'] = 0
    data['Side'] = 0
    
    for i in range(len(data)):
    
        if pd.isnull(data['Cabin'][i]) is False:
            data['Deck'][i] = data['Cabin'][i].split('/')[0]
            data['Num'][i] = data['Cabin'][i].split('/')[1] 
            data['Side'][i] = data['Cabin'][i].split('/')[2] 

        else:

            data['Deck'][i] = np.nan
            data['Num'][i] = np.nan
            data['Side'][i] = np.nan
        
    return data

In [5]:
#Replacing column with mean
def _col(data, col, tp):
    
    """data = Dataset to work on. 
       col = The column to work on. 
       tp = Type of statistical operation: 'Mean/Median/Mode'."""
    
    if tp == 'mean':
        data[col] = data[col].fillna(data[col].mean())
        
    if tp == 'mode':    
        data[col] = data[col].fillna(data[col].mode()[0])
    
    if tp == 'median':
        data[col] = data[col].fillna(data[col].median())
    
    return data[col]

def rep_(data, cols, tp):
    
    """data = Dataset to work on. 
       cols = List of columns consisting of column names. 
       tp = Type of statistical operation: 'Mean/Median/Mode'."""
    
    for i in range(len(cols)):

        data[cols[i]] = _col(data, cols[i], tp)
    
    return data

In [6]:
def encod(data, col):
    
    """data = Dataset to work on. 
       col = Column name to apply function. """
    
    dum = pd.get_dummies(data[col])
    
    return dum

def add_enc(data, cols):
    
    """data = Dataset to work on. 
       cols = List of column names to apply encoding using dummy. """
    
    for i in range(len(cols)):
        
        data = pd.concat([data, encod(data, cols[i])], axis=1)
        
    return data

In [7]:
def scal(data, scalar, col):
    
    """data = Dataset to work on.
       scalar = MinMaxScalar object.
       col = The column name to scale. """
    
    c = data[col].values.reshape(-1, 1)
    sc = scalar.fit_transform(c)
    data[col] = sc
    
    return data[col]

def col_scal(data, scalar, cols):
    
    """data = The dataset to work on.
       scalar = MinMaxScalar object. 
       cols = List of columns to apply scaling. """
    
    for i in range(len(cols)):
        
        data[cols[i]] = scal(data, scalar, cols[i])
        
    return data

In [8]:
#Separating Passenger ID
data = sep_PID(data)

In [9]:
#Separating Cabin
data = sep_Cabin(data)

In [10]:
#Filling Null Values
cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Num']
mod_col = ['HomePlanet', 'CryoSleep', 'Destination', 'Deck', 'Side']
med_col = ['Age']

data = rep_(data, cols, 'mean')
data = rep_(data, mod_col, 'mode')
data = rep_(data, med_col, 'median')

In [11]:
#Encoding Columns
enc_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'Deck', 'Side']
data = add_enc(data, enc_cols)

In [12]:
#Dropping unnecessary columns
drop_col = ['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP', 
            'Name', 'Deck', 'Side']
data.drop(drop_col, axis=1, inplace=True)

In [13]:
#Scaling columns
scal_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Num', 'Group']
data = col_scal(data, MinMaxScaler(), scal_cols)

In [14]:
#Splitting the data

In [15]:
X = data.drop('Transported', axis=1)
Y = data['Transported']

In [43]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.05)

# Logistic Regression

In [17]:
lr = LogisticRegression()

In [18]:
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [2000, 100, 10]

In [19]:
LRgrid = dict(solver=solvers,penalty=penalty,C=c_values)

In [20]:
LRcv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

In [21]:
LRgrid_search = GridSearchCV(estimator=lr, param_grid=LRgrid, n_jobs=-1, cv=LRcv, scoring='accuracy',error_score=0)

In [22]:
LRgrid_result = LRgrid_search.fit(X, Y)

In [23]:
print("Best: %f using %s" % (LRgrid_result.best_score_, LRgrid_result.best_params_))

Best: 0.792399 using {'C': 2000, 'penalty': 'l2', 'solver': 'newton-cg'}


In [24]:
means = LRgrid_result.cv_results_['mean_test_score']
stds = LRgrid_result.cv_results_['std_test_score']
params = LRgrid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

0.792399 (0.014421) with: {'C': 2000, 'penalty': 'l2', 'solver': 'newton-cg'}
0.791555 (0.014684) with: {'C': 2000, 'penalty': 'l2', 'solver': 'lbfgs'}
0.792322 (0.014458) with: {'C': 2000, 'penalty': 'l2', 'solver': 'liblinear'}
0.790098 (0.014239) with: {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.789830 (0.014591) with: {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
0.790136 (0.014204) with: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
0.786839 (0.014551) with: {'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
0.786532 (0.014596) with: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
0.786839 (0.014551) with: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
0.768740 (0.011115) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
0.768510 (0.011019) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
0.768664 (0.011212) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
0.746616 (0.011549) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
0.746616 (0.011

In [29]:
lr = LogisticRegression(penalty='l2', C=2000, solver='newton-cg')

In [44]:
lr.fit(X_train, Y_train)

LogisticRegression(C=2000, solver='newton-cg')

In [45]:
lrp = lr.predict(X_test)

In [46]:
print(confusion_matrix(Y_test, lrp))
print(classification_report(Y_test, lrp))

[[152  48]
 [ 49 186]]
              precision    recall  f1-score   support

       False       0.76      0.76      0.76       200
        True       0.79      0.79      0.79       235

    accuracy                           0.78       435
   macro avg       0.78      0.78      0.78       435
weighted avg       0.78      0.78      0.78       435



# Random Forest

In [73]:
rf = RandomForestClassifier()

In [90]:
n_estimators = [1000, 2000, 3000, 4000, 5000]
max_features = ['sqrt']
criterion = ['gini', 'entropy']

In [91]:
RFgrid = dict(n_estimators=n_estimators, max_features=max_features, criterion = criterion)
RFcv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
RFgrid_search = GridSearchCV(estimator=rf, param_grid=RFgrid, n_jobs=-1, cv=RFcv, scoring='accuracy',error_score=0)
RFgrid_result = RFgrid_search.fit(X, Y)

In [92]:
print("Best: %f using %s" % (RFgrid_result.best_score_, RFgrid_result.best_params_))

Best: 0.804094 using {'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 1000}


In [93]:
means = RFgrid_result.cv_results_['mean_test_score']
stds = RFgrid_result.cv_results_['std_test_score']
params = RFgrid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

0.802675 (0.013741) with: {'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 1000}
0.803365 (0.013068) with: {'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 2000}
0.802483 (0.013826) with: {'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 3000}
0.802369 (0.013059) with: {'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 4000}
0.802521 (0.013205) with: {'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 5000}
0.804094 (0.014030) with: {'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 1000}
0.803442 (0.013555) with: {'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 2000}
0.803212 (0.014108) with: {'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 3000}
0.803365 (0.014263) with: {'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 4000}
0.803557 (0.014323) with: {'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 5000}


In [94]:
rf = RandomForestClassifier(n_estimators=1000, max_features='sqrt', criterion='entropy')

In [98]:
rf.fit(X_train, Y_train)

RandomForestClassifier(criterion='entropy', max_features='sqrt',
                       n_estimators=1000)

In [99]:
rfp = rf.predict(X_test)

In [100]:
print(confusion_matrix(Y_test, rfp))
print(classification_report(Y_test, rfp))

[[165  35]
 [ 47 188]]
              precision    recall  f1-score   support

       False       0.78      0.82      0.80       200
        True       0.84      0.80      0.82       235

    accuracy                           0.81       435
   macro avg       0.81      0.81      0.81       435
weighted avg       0.81      0.81      0.81       435



# Cleaning Test set and using RF

In [101]:
test = pd.read_csv('test.csv')
temp_test = test.copy()

In [102]:
test.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [103]:
test = sep_PID(test)
test = sep_Cabin(test)

In [104]:
#Filling Null Values
cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Num']
mod_col = ['HomePlanet', 'CryoSleep', 'Destination', 'Deck', 'Side']
med_col = ['Age']

test = rep_(test, cols, 'mean')
test = rep_(test, mod_col, 'mode')
test = rep_(test, med_col, 'median')

In [105]:
#Encoding Columns
enc_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'Deck', 'Side']
test = add_enc(test, enc_cols)

In [106]:
#Dropping unnecessary columns
drop_col = ['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP', 
            'Name', 'Deck', 'Side']
test.drop(drop_col, axis=1, inplace=True)

In [107]:
#Scaling columns
scal_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Num', 'Group']
test = col_scal(test, MinMaxScaler(), scal_cols)

In [108]:
#Predicting using Random Forest
test_predict = rf.predict(test)

In [109]:
#Submission dataframe

submit = pd.DataFrame(columns=['PassengerId', 'Transported'])
submit['PassengerId'] = temp_test['PassengerId']
submit['Transported'] = test_predict

In [110]:
submit.to_csv('Predictions.csv', index=False)

In [111]:
pd.read_csv('Predictions.csv')

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True
