## This notebook contains detailed analysis of Titanic dataset and then predics the survival of test data

In [231]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import cross_val_score


In [232]:
data=pd.read_csv("titanic/train.csv")
test_data=pd.read_csv("titanic/test.csv")

In [233]:
data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

<p>Dropping Cabin columns as it has multiple null values</p>
Also fill the null age as average age and null embarked with mode of embarked

In [234]:
data.drop(columns=["Cabin"], inplace=True)
test_data.drop(columns=["Cabin"], inplace=True)

In [235]:
#data["Age"].fillna(data["Age"].mean(),inplace=True)
data.fillna({"Age":data["Age"].mean(),"Embarked":data["Embarked"].mode()[0]},inplace=True)

In [236]:
test_data.isnull().sum()

PassengerId     0
Pclass          0
Name            0
Sex             0
Age            86
SibSp           0
Parch           0
Ticket          0
Fare            1
Embarked        0
dtype: int64

In [237]:
#test_data["Age"].fillna(data["Age"].mean(),inplace=True)
test_data.fillna({"Age":test_data["Age"].mean(),"Fare":test_data["Fare"].mean()},inplace=True)

Now that we have cleaned data we are ready to prepare our data for ML modeling.

In [238]:
data["Sex"]=data["Sex"].map({"male":0,"female":1})

In [239]:
test_data["Sex"]=test_data["Sex"].map({"male":0,"female":1})

In [240]:
data["Embarked"] = data["Embarked"].map({"S": 0, "C": 1, "Q": 2})
test_data["Embarked"] = test_data["Embarked"].map({"S": 0, "C": 1, "Q": 2})


In [241]:
data["Family Size"]=data["Parch"]+data["SibSp"]
test_data["Family Size"]=test_data["Parch"]+test_data["SibSp"]

In [242]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Family Size
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,1,1
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,0,1
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,0,0


In [243]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Family Size
0,892,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,2,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0,0,1
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,2,0
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,0,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,0,2


In [244]:
features = ["Pclass", "Sex", "Age", "Fare", "Embarked", "Family Size"]
x_train = data[features]
y_train = data["Survived"]
x_test = test_data[features]


In [245]:
poly = PolynomialFeatures(degree=2, include_bias=False)
x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)

In [246]:
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)


In [247]:
model=LogisticRegression(max_iter=30000, solver="newton-cholesky")
model.fit(x_train_poly, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'newton-cholesky'
,max_iter,30000


In [248]:
x_test

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Family Size
0,3,0,34.50000,7.8292,2,0
1,3,1,47.00000,7.0000,0,1
2,2,0,62.00000,9.6875,2,0
3,3,0,27.00000,8.6625,0,0
4,3,1,22.00000,12.2875,0,2
...,...,...,...,...,...,...
413,3,0,30.27259,8.0500,0,0
414,1,1,39.00000,108.9000,1,0
415,3,0,38.50000,7.2500,0,0
416,3,0,30.27259,8.0500,0,0


In [249]:
scores = cross_val_score(model, x_train_poly, y_train, cv=5)
print("CV Accuracy Scores:", scores)
print("Mean CV Accuracy:", scores.mean())

CV Accuracy Scores: [0.82122905 0.81460674 0.84831461 0.79775281 0.83707865]
Mean CV Accuracy: 0.8237963718536188


In [250]:
y_pred=model.predict(x_test_poly)

In [251]:
print(y_pred)

[0 0 0 0 0 0 1 0 1 0 0 0 1 0 1 1 0 0 0 1 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 1 1 0 0 0
 1 0 0 1 0 1 1 1 0 0 0 0 1 1 1 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 0 0 1 0 1 1 0 0 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0 0 0 1 1 0 1 0 0 1 1 0
 0 1 0 0 1 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 1 1 0 0 1 0 0 0]


In [252]:
result=pd.DataFrame({
    "PassengerId":test_data["PassengerId"],
    "Survived":y_pred
})

In [253]:
result.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [254]:
result.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [255]:
result[['PassengerId', 'Survived']].to_csv('submission.csv', index=False)