In [340]:
import warnings

warnings.filterwarnings('ignore')

In [341]:
# Useful packages

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [342]:
# Data loading

train_titanic=pd.read_csv("train.csv")

test_titanic=pd.read_csv("test.csv")

In [343]:
train_titanic.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [344]:
train_titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [345]:
test_titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


## Data description

We have eleven variables:

    - PassengerId: Primary key that are unique for each passenger
    - Survived: feature that gives information about the status of a passenger after accident,
    (0/1)-(Died/Survived)
    - Pclass: The class that the passager was embarked
    - Name: Name of the passenger
    - Sex: gender of the passenger
    - Age : Age of passenger
    - SibSp: Number of siblings/spouse on board of titanic
    - Parch: Number of parent/children on board of titanic
    - Ticket: Ticket number
    - Fare: Price of passenger's ticket 
    - Cabin: Cabin number
    - Embarked: Port of embarkation

### Feature engineering

As we have a contraint to respect the submission file structure and his shape. We can't drop any line. Then we will deal with missing value by trying different method of filling these missing and consider the best predictablility outcome.

In [346]:
# Fill embarked nan value

train_titanic["Embarked"]=train_titanic["Embarked"].fillna(train_titanic["Embarked"].mode().iloc[0])
test_titanic["Embarked"]=test_titanic["Embarked"].fillna(test_titanic["Embarked"].mode().iloc[0])

In [347]:
# According to the evolution of distribution before and after filling we will consider ["nearest","linear"]

def fill_nan_age(df,method):
    
    df["Age"]=df["Age"].interpolate(method =method, limit_direction ='forward')

    return df

train_titanic=fill_nan_age(train_titanic,method='nearest')

test_titanic=fill_nan_age(test_titanic,method="nearest")

test_titanic=fill_nan_age(test_titanic,method='barycentric')


In [348]:
## Fill fare NaN in test_titanic
test_titanic[test_titanic["Fare"].isna()]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
152,1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S


In [349]:
test_titanic[test_titanic["Pclass"]==3][test_titanic["Age"]>=58]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
152,1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S
344,1236,3,"van Billiard, Master. James William",male,58.0,1,1,A/5. 851,14.5,,S
357,1249,3,"Lockyer, Mr. Edward",male,59.0,0,0,1222,7.8792,,S


In [350]:
#Based on Age and Pclass, I will with fare value that is near of 
# the age of missing observation and according to the class related
test_titanic["Fare"]=test_titanic["Fare"].replace(np.NaN,14.5)

#### Hypothesis

Is it possible to predict cabin based on they paid and the chance of surviving?

### Question
How to split Fare to match the correct Cabin?

In [351]:
#https://titanic.fandom.com/wiki/First_Class_Staterooms#A_Deck

class_1=['A','B','C','D','E']
privileged=['B51','B53','B54','B55','B56']
class_2=[]
class_3=[]

#train_titanic["Cabin"].unique()

In [352]:
train_titanic["Cabin"].dropna().nunique()/train_titanic.shape[0]

0.16498316498316498

We have just 16.5% of unique cabin number. Then it will be interesting if we have at least one available value for cabin label for each unique prices.

In [353]:
train_titanic["Fare"].nunique()/train_titanic.shape[0]

0.2783389450056117

So interesting because there are many redundant price then it mean that cabin and class will be positive associated with Fare then we can fill `NaN` with more sure value.

In [354]:
def check_list_items(l):

    for item in l:

        if str(item)!='nan':

            return item

    return np.NaN
    
def get_unique_cabin_per_fare(train_titanic):
    
    final_fare_label=dict()

    train_titanic["Fare"]=train_titanic["Fare"].astype('int64')

    nan_cabin_fare=[]

    for fare in train_titanic[train_titanic["Cabin"].isna()]["Fare"].unique():
        
        relate_cabin=[]

        for cabin in train_titanic[train_titanic["Fare"]==fare]["Cabin"]:

            if cabin not in relate_cabin:

                relate_cabin.append(cabin)
        
        final_fare_label[fare]=check_list_items(relate_cabin)
    
    return final_fare_label

In [355]:
final_fare_label_train=get_unique_cabin_per_fare(train_titanic)
final_fare_label_test=get_unique_cabin_per_fare(test_titanic)

In [356]:
def get_not_defined_cabin_fare(final_fare_label):

    fares_nan=[]
    fares_defined={}

    for fare in final_fare_label.keys():

        if str(final_fare_label[fare])=='nan':

            fares_nan.append(fare)
        
        else:

            fares_defined[fare]=final_fare_label[fare]

    return fares_defined,fares_nan

In [357]:
fares_defined_train,fares_nan_train=get_not_defined_cabin_fare(final_fare_label_train)
fares_defined_test,fares_nan_test=get_not_defined_cabin_fare(final_fare_label_test)

In [358]:
#

def find_most_likely_cabin(fares_defined,fares_nan):

    most_likely_cabin={}

    for fare in fares_nan:

        difference=abs(list(fares_defined.keys())-fare)
        
        index=list(difference).index(min(difference))

        most_likely_cabin[fare]=list(fares_defined.values())[index]

    return most_likely_cabin

In [359]:
most_likely_cabin_train=find_most_likely_cabin(fares_defined_train,fares_nan_train)
most_likely_cabin_test=find_most_likely_cabin(fares_defined_test,fares_nan_test)

In [360]:
fares_defined_train.update(most_likely_cabin_train)

fares_defined_test.update(most_likely_cabin_test)

In [361]:
#Creating new columns for cabin

def construct_new_cabin_col(train_titanic,fares_defined):

    col_Cabin={'Fare':[]}

    for fare,cabin in zip(train_titanic["Fare"],train_titanic["Cabin"]):

        if str(cabin)=='nan':

            col_Cabin['Fare'].append(fares_defined[fare])

        else:
            
            col_Cabin['Fare'].append(cabin)
    
    return col_Cabin

In [362]:
col_cabin_train=construct_new_cabin_col(train_titanic,fares_defined_train)

col_cabin_test=construct_new_cabin_col(test_titanic,fares_defined_test)

In [363]:
train_titanic["Cabin_new"]=col_cabin_train["Fare"]
test_titanic["Cabin_new"]=col_cabin_test["Fare"]

In [364]:
train_titanic.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
Cabin_new        0
dtype: int64

In [365]:
test_titanic.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
Cabin_new        0
dtype: int64

In [366]:
train_titanic.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Cabin_new
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7,,S,F G73
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71,C85,C,C85


### Encodage des variables

In [367]:
#Train data

train_titanic['Pclass']=train_titanic['Pclass'].astype('category')

train_titanic=train_titanic.drop(columns=["Name","Cabin","Ticket"])

# Test data

test_titanic['Pclass']=test_titanic['Pclass'].astype('category')

test_titanic=test_titanic.drop(columns=["Name","Cabin","Ticket"])

In [368]:

train_titanic['Cabin_new']=train_titanic['Cabin_new'].apply(lambda x:str(x).strip()[0])

test_titanic['Cabin_new']=test_titanic['Cabin_new'].apply(lambda x:str(x).strip()[0])

In [369]:
train_titanic[train_titanic["Fare"]==35]

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Cabin_new
23,24,1,1,male,28.0,0,0,35,S,A
55,56,1,1,male,65.0,0,0,35,S,C
339,340,0,1,male,45.0,0,0,35,S,T
351,352,0,1,male,23.0,0,0,35,S,C
647,648,1,1,male,56.0,0,0,35,C,A


In [370]:
# As there are no cabin with T in test set then to keep the same format in test and train set
# We will replace T by C that is more similar to this view based on other attributes

train_titanic['Cabin_new']=train_titanic['Cabin_new'].replace('T','C') 

In [371]:
train_titanic["Cabin_new"].unique()

array(['F', 'C', 'E', 'A', 'G', 'D', 'B'], dtype=object)

In [372]:
train_titanic_numeric=pd.get_dummies(train_titanic.drop(columns=["PassengerId"]))

test_titanic_numeric=pd.get_dummies(test_titanic.drop(columns=["PassengerId"]))

train_titanic_numeric.shape,test_titanic_numeric.shape

((891, 20), (418, 19))

### Modelisation

In [373]:
from sklearn.linear_model import LogisticRegression

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report,confusion_matrix

In [374]:
lr_model=LogisticRegression(penalty="l2",class_weight="balanced")
LDA=LinearDiscriminantAnalysis()
KNN=KNeighborsClassifier(n_neighbors=2,n_jobs=-1)

In [375]:
y=train_titanic_numeric["Survived"]
X=train_titanic_numeric.drop(columns=["Survived"])

In [376]:
X_train,X_valid,y_train,y_valid=train_test_split(X,y,test_size=.1)

In [377]:
lr_model.fit(X_train,y_train)
LDA.fit(X_train,y_train)
KNN.fit(X_train,y_train)


In [378]:
lr_model.score(X_train,y_train),LDA.score(X_train,y_train),KNN.score(X_train,y_train)

(0.7965043695380774, 0.8077403245942572, 0.8289637952559301)

# Optimisation

In [379]:
from sklearn.model_selection import GridSearchCV

### Logistic Regression

In [380]:
grid={"C":np.logspace(-3,3,10), "penalty":["l1","l2","elasticnet"]}# l1 lasso l2 ridge
logreg=LogisticRegression()
logreg_cv=GridSearchCV(logreg,grid,cv=10)
logreg_cv.fit(X_train,y_train)

print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)

print("Accuracy for our training dataset with tuning is : {:.2%}".format(logreg_cv.best_score_) )

tuned hpyerparameters :(best parameters)  {'C': 0.1, 'penalty': 'l2'}
Accuracy for our training dataset with tuning is : 80.90%


In [381]:
y_predict=logreg_cv.predict(X_valid)

print(classification_report(y_valid,y_predict))

              precision    recall  f1-score   support

           0       0.70      0.85      0.77        47
           1       0.79      0.60      0.68        43

    accuracy                           0.73        90
   macro avg       0.74      0.73      0.73        90
weighted avg       0.74      0.73      0.73        90



### Submission

In [382]:
submi_lr=pd.read_csv("gender_submission.csv")

submi_lr["Survived"]=logreg_cv.predict(test_titanic_numeric)

In [383]:
submi_lr.to_csv("submi_lr.csv",index=False)

### KNeighborsClassifier

In [384]:
KNN = KNeighborsClassifier()
k_range = list(range(1, 31))
param_grid = dict(n_neighbors=k_range)
  
# defining parameter range
grid = GridSearchCV(KNN, param_grid, cv=10, scoring='accuracy', return_train_score=False,verbose=1)
  
# fitting the model for grid search
grid_search_knn=grid.fit(X_train, y_train)


print(grid_search_knn.best_params_)

print("Accuracy for our training dataset with tuning is : {:.2%}".format(grid_search_knn.best_score_) )

Fitting 10 folds for each of 30 candidates, totalling 300 fits
{'n_neighbors': 5}
Accuracy for our training dataset with tuning is : 71.28%


In [385]:
y_predict=grid_search_knn.predict(X_valid)

print(classification_report(y_valid,y_predict))

              precision    recall  f1-score   support

           0       0.62      0.83      0.71        47
           1       0.70      0.44      0.54        43

    accuracy                           0.64        90
   macro avg       0.66      0.64      0.63        90
weighted avg       0.66      0.64      0.63        90



### Submission

In [386]:
submi_knn=pd.read_csv("gender_submission.csv")

submi_knn["Survived"]=grid_search_knn.predict(test_titanic_numeric)

In [387]:
submi_knn.to_csv("submi_knn.csv",index=False)

### LinearDiscriminantAnalysis 

In [388]:
grid=dict()
grid['solver']=['svd', 'lsqr', 'eigen']

lda_search=GridSearchCV(LDA, grid, scoring='accuracy', cv=10, n_jobs=-1 )

lda_search.fit(X_train,y_train)

print(lda_search.best_params_)

print("Accuracy for our training dataset with tuning is : {:.2%}".format(lda_search.best_score_) )

{'solver': 'svd'}
Accuracy for our training dataset with tuning is : 79.65%


In [389]:
y_predict=lda_search.predict(X_valid)

print(classification_report(y_valid,y_predict))

              precision    recall  f1-score   support

           0       0.69      0.85      0.76        47
           1       0.78      0.58      0.67        43

    accuracy                           0.72        90
   macro avg       0.74      0.72      0.71        90
weighted avg       0.73      0.72      0.72        90



### Submission

In [390]:
submi_lda=pd.read_csv("gender_submission.csv")
submi_lda["Survived"]=lda_search.predict(test_titanic_numeric)

In [391]:
submi_lda.to_csv("submi_lda.csv",index=False)