In [25]:
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [26]:
titanic_data=pd.read_excel("data/titanic3.xls")

In [27]:
titanic_data

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.5000,1,0,2665,14.4542,,C,,328.0,
1305,3,0,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,,,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5000,0,0,2656,7.2250,,C,,304.0,
1307,3,0,"Zakarian, Mr. Ortin",male,27.0000,0,0,2670,7.2250,,C,,,


In [28]:
from sklearn.model_selection import StratifiedShuffleSplit
split=StratifiedShuffleSplit(n_splits=1,test_size=0.2)
for train_indices,test_indices in split.split(titanic_data,titanic_data[["survived","pclass","sex"]]):
    strat_train_set=titanic_data.loc[train_indices]  
    strat_test_set=titanic_data.loc[test_indices]

In [29]:
strat_train_set.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1047 entries, 557 to 1270
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1047 non-null   int64  
 1   survived   1047 non-null   int64  
 2   name       1047 non-null   object 
 3   sex        1047 non-null   object 
 4   age        836 non-null    float64
 5   sibsp      1047 non-null   int64  
 6   parch      1047 non-null   int64  
 7   ticket     1047 non-null   object 
 8   fare       1046 non-null   float64
 9   cabin      236 non-null    object 
 10  embarked   1045 non-null   object 
 11  boat       388 non-null    object 
 12  body       100 non-null    float64
 13  home.dest  601 non-null    object 
dtypes: float64(3), int64(4), object(7)
memory usage: 122.7+ KB


In [30]:
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.impute import SimpleImputer

class AgeImputer(BaseEstimator,TransformerMixin):
    def fit(self,X,y=None):
        return self
    def transform(self,X):
        imputer=SimpleImputer(strategy="mean")
        X['age']=imputer.fit_transform(X[['age']])
        return X
        

In [31]:
from sklearn.preprocessing import OneHotEncoder

class FeatureEncoder(BaseEstimator,TransformerMixin):
    def fit(self,X,y=None):
        return self
    def transform(self,X):
        encoder=OneHotEncoder()
        matrix=encoder.fit_transform(X[['embarked']]).toarray()
        column_names=["C","S","Q","N"]
        for i in range(len(matrix.T)):
            X[column_names[i]]=matrix.T[i]
            
        matrix=encoder.fit_transform(X[['sex']]).toarray()
        column_names=["male","female"]
        for i in range(len(matrix.T)):
            X[column_names[i]]=matrix.T[i]
        return X

In [32]:
class FeatureDropper(BaseEstimator,TransformerMixin):
    def fit(self,X,y=None):
        return self
    def transform(self,X):
        return X.drop(["embarked","name","ticket","cabin","sex","N","boat","body","home.dest","fare"],axis=1,errors="ignore")

In [33]:
from sklearn.pipeline import Pipeline
pipeline=Pipeline([
    ("ageimputer",AgeImputer()),
    ("featureencoder",FeatureEncoder()),
    ("featuredropper",FeatureDropper()),
])

In [34]:
strat_train_set=pipeline.fit_transform(strat_train_set)

In [35]:
strat_train_set.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1047 entries, 557 to 1270
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1047 non-null   int64  
 1   survived  1047 non-null   int64  
 2   age       1047 non-null   float64
 3   sibsp     1047 non-null   int64  
 4   parch     1047 non-null   int64  
 5   C         1047 non-null   float64
 6   S         1047 non-null   float64
 7   Q         1047 non-null   float64
 8   male      1047 non-null   float64
 9   female    1047 non-null   float64
dtypes: float64(6), int64(4)
memory usage: 90.0 KB


In [36]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()

X=strat_train_set.drop(["survived"],axis=1)
y=strat_train_set["survived"]
X_data=scaler.fit_transform(X)
y_data=y.to_numpy()

In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
clf=RandomForestClassifier()

param_grid=[
    {"n_estimators":[10,100,200,500],"max_depth":[None,5,10],"min_samples_split":[2,3,4]}
]

grid_search=GridSearchCV(clf,param_grid,cv=3,scoring="accuracy",return_train_score=True)
grid_search.fit(X_data,y_data)

In [38]:
final_clf=grid_search.best_estimator_

In [39]:
final_clf

In [40]:
strat_test_set=pipeline.fit_transform(strat_test_set)

In [41]:
strat_test_set

Unnamed: 0,pclass,survived,age,sibsp,parch,C,S,Q,male,female
618,3,0,35.000000,0,0,0.0,0.0,1.0,0.0,1.0
1035,3,1,30.879762,1,1,1.0,0.0,0.0,0.0,1.0
417,2,0,26.000000,0,0,0.0,0.0,1.0,0.0,1.0
999,3,1,30.879762,0,0,0.0,1.0,0.0,1.0,0.0
409,2,0,36.000000,0,0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
1275,3,0,16.000000,2,0,0.0,0.0,1.0,0.0,1.0
599,2,0,24.000000,0,0,0.0,0.0,1.0,1.0,0.0
16,1,0,24.000000,0,1,1.0,0.0,0.0,0.0,1.0
155,1,1,52.000000,1,1,0.0,0.0,1.0,1.0,0.0


In [42]:
X=strat_test_set.drop(["survived"],axis=1)
y=strat_test_set["survived"]
X_data_test=scaler.fit_transform(X)
y_data_test=y.to_numpy()

In [43]:
final_clf.score(X_data_test,y_data_test)

0.7938931297709924

In [None]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy}')

In [45]:
final_data=pipeline.fit_transform(titanic_data)

In [46]:
final_data

Unnamed: 0,pclass,survived,age,sibsp,parch,C,S,Q,male,female
0,1,1,29.000000,0,0,0.0,0.0,1.0,1.0,0.0
1,1,1,0.916700,1,2,0.0,0.0,1.0,0.0,1.0
2,1,0,2.000000,1,2,0.0,0.0,1.0,1.0,0.0
3,1,0,30.000000,1,2,0.0,0.0,1.0,0.0,1.0
4,1,0,25.000000,1,2,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
1304,3,0,14.500000,1,0,1.0,0.0,0.0,1.0,0.0
1305,3,0,29.881135,1,0,1.0,0.0,0.0,1.0,0.0
1306,3,0,26.500000,0,0,1.0,0.0,0.0,0.0,1.0
1307,3,0,27.000000,0,0,1.0,0.0,0.0,0.0,1.0


In [47]:
final_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1309 non-null   int64  
 1   survived  1309 non-null   int64  
 2   age       1309 non-null   float64
 3   sibsp     1309 non-null   int64  
 4   parch     1309 non-null   int64  
 5   C         1309 non-null   float64
 6   S         1309 non-null   float64
 7   Q         1309 non-null   float64
 8   male      1309 non-null   float64
 9   female    1309 non-null   float64
dtypes: float64(6), int64(4)
memory usage: 102.4 KB


In [48]:
scaler=StandardScaler()
X_final=final_data.drop(["survived"],axis=1)
y_final=final_data["survived"]
x_final_data=scaler.fit_transform(X_final)
y_final_data=y_final.to_numpy()

In [49]:
prod_clf=RandomForestClassifier()

param_grid=[
    {"n_estimators":[10,100,200,500],"max_depth":[None,5,10],"min_samples_split":[2,3,4]}
]

grid_search=GridSearchCV(prod_clf,param_grid,cv=3,scoring="accuracy",return_train_score=True)
grid_search.fit(x_final_data,y_final_data)

In [50]:
prod_final_clf=grid_search.best_estimator_

In [51]:
prod_final_clf

In [52]:
predictions=prod_final_clf.predict(x_final_data)

In [53]:
predictions

array([1, 1, 0, ..., 0, 0, 0], dtype=int64)

In [55]:
from sklearn.metrics import precision_score, recall_score
precision = precision_score(y_data_test, predictions)
recall = recall_score(y_data_test, predictions)

ValueError: Found input variables with inconsistent numbers of samples: [262, 1309]

In [37]:
final_df=pd.DataFrame(titanic_data["name"])
final_df["survived"]=predictions
final_df.to_csv("data/new_predictions.csv",index=False)

In [39]:
final_df.head()

Unnamed: 0,name,survived
0,"Allen, Miss. Elisabeth Walton",1
1,"Allison, Master. Hudson Trevor",1
2,"Allison, Miss. Helen Loraine",1
3,"Allison, Mr. Hudson Joshua Creighton",0
4,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",1
