In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [None]:
# Load the Titanic passenger table from the project's data directory.
df = pd.read_csv('data/titanic.csv')

In [None]:
# Preview the first 100 rows of the raw frame.
df.head(100)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,0,3,"Shorney, Mr. Charles Joseph",male,,0,0,374910,8.0500,,S
96,97,0,1,"Goldschmidt, Mr. George B",male,71.0,0,0,PC 17754,34.6542,A5,C
97,98,1,1,"Greenfield, Mr. William Bertram",male,23.0,0,1,PC 17759,63.3583,D10 D12,C
98,99,1,2,"Doling, Mrs. John T (Ada Julia Bone)",female,34.0,0,1,231919,23.0000,,S


In [None]:
# (rows, columns) of the loaded frame.
df.shape

(891, 12)

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [None]:
# Column dtypes: shows which columns are numeric and which are object (string) typed.
print(df.dtypes)

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


In [None]:
# Number of missing values in each column.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [None]:
# Frequency of each distinct Cabin value (missing entries are not counted).
df['Cabin'].value_counts()

Cabin
B96 B98        4
G6             4
C23 C25 C27    4
C22 C26        3
F33            3
              ..
E34            1
C7             1
C54            1
E36            1
C148           1
Name: count, Length: 147, dtype: int64

In [None]:
# Frequency of each embarkation code; used to pick the fill value for the missing entries.
df['Embarked'].value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [None]:
# Binary flag: 1 when a Cabin value is recorded for the row, 0 when it is missing.
df['Cabin Presence'] = (~df['Cabin'].isna()).astype(int)

In [None]:
# The raw Cabin text is no longer needed once the presence flag exists.
df = df.drop(columns=['Cabin'])

In [None]:
# Check the frame after replacing Cabin with the 'Cabin Presence' flag.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Cabin Presence
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,0


In [None]:
# Fill missing ages with the median age of the rows sharing the same (Sex, Pclass) group.
# NOTE(review): the medians are computed over the whole frame, i.e. before the
# train/test split further down — confirm this is acceptable for the evaluation.
df['Age'] = df['Age'].fillna(
    df.groupby(['Sex', 'Pclass'])['Age'].transform('median')
)

In [None]:
# Fill the missing embarkation codes with 'S', the most frequent value in this column.
df['Embarked'] = df['Embarked'].fillna(value='S')

In [None]:
# One-hot encode the categorical columns. `dtype=int` makes get_dummies emit 0/1
# integer indicators directly, so the list of generated column names does not have
# to be hard-coded (and kept in sync) for a separate cast.
df = pd.get_dummies(df, columns=['Sex', 'Embarked'], dtype=int)

In [None]:
# Check the frame after one-hot encoding Sex and Embarked.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin Presence,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,0,0,1,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,1,1,0,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,0,1,0,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,1,1,0,0,0,1
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,0,0,1,0,0,1


In [None]:
# Family size = the passenger (1) plus the SibSp and Parch counts.
df['Family_size'] = 1 + df['SibSp'] + df['Parch']
# A passenger is alone when the family consists of only themselves.
df['IsAlone'] = df['Family_size'].eq(1).astype(int)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin Presence,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Family_size,IsAlone
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,0,0,1,0,0,1,2,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,1,1,0,1,0,0,2,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,0,1,0,0,0,1,1,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,1,1,0,0,0,1,2,0
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,0,0,1,0,0,1,1,1


In [None]:
# How many passengers travel alone (1) versus with family (0).
df['IsAlone'].value_counts()

IsAlone
1    537
0    354
Name: count, dtype: int64

In [None]:
# Row count for every (Survived, IsAlone) combination.
df.groupby(['Survived', 'IsAlone']).size()

Survived  IsAlone
0         0          175
          1          374
1         0          179
          1          163
dtype: int64

In [None]:
# Re-inspect the engineered frame before building the feature matrix.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin Presence,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Family_size,IsAlone
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,0,0,1,0,0,1,2,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,1,1,0,1,0,0,2,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,0,1,0,0,0,1,1,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,1,1,0,0,0,1,2,0
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,0,0,1,0,0,1,1,1


In [None]:
# Build the feature matrix and target.
# 'Name' and 'Ticket' are free-text (object) columns: StandardScaler cannot convert
# them to floats, so they must be removed together with the target.
# NOTE(review): 'PassengerId' is kept as a feature here, matching the pipeline
# section below; it looks like a row identifier — consider dropping it in both places.
X = df.drop(columns=['Survived', 'Name', 'Ticket'])
y = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Standardise the features. The scaler is fitted on the training split only and the
# same fitted transform is then applied to the test split.
# (The instance must be bound to the lowercase name `scaler` that the calls below use;
# the previous `Scaler = ...` left `scaler` undefined on a fresh kernel.)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Baseline classifier: logistic regression with default settings, fitted on the
# standardised training features.
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [None]:
# Predict on the held-out split and report the fraction of correct predictions.
predict = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, predict)
print(accuracy)

0.8100558659217877


In [None]:
# Recall on the held-out split.
# The result is stored in `recall`, not `recall_score`: reusing the function's name
# would shadow the sklearn import and make this cell fail when run a second time.
recall = recall_score(y_test, predict)
print(recall)


0.7297297297297297


In [None]:
# Precision on the held-out split.
precision = precision_score(y_test, predict)
print(precision)

0.7941176470588235


In [None]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_test, predict)
print(cm)

[[91 14]
 [20 54]]


In [None]:
# 5-fold stratified cross-validation of the logistic-regression model on the training split.
# cross_val_score clones the estimator for every fold, so passing the already fitted
# `model` is safe (the previous `lg` name was never defined in this notebook).
# NOTE(review): X_train_scaled was standardised with statistics from the whole training
# split, so each validation fold has influenced the scaler; wrapping the scaler and the
# model in a Pipeline would avoid that.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X_train_scaled, y_train, cv=skf, scoring='accuracy')
print(scores)
print(scores.mean())
print(scores.std())

[0.78321678 0.82517483 0.82394366 0.76760563 0.8028169 ]
0.8005515611149414
0.022559059968630446


In [None]:
# Random forest with 100 trees and a fixed seed, evaluated on the same held-out split
# as the logistic regression.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)
rf_pred = rf_model.predict(X_test_scaled)
rf_accuracy = accuracy_score(y_test, rf_pred)
print(rf_accuracy)

0.8212290502793296


In [None]:
# 5-fold stratified cross-validation of the random forest, using the same fold
# configuration (shuffle + seed 42) as the logistic-regression run.
skf_rf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rf_scores = cross_val_score(
    rf_model, X_train_scaled, y_train, cv=skf_rf, scoring="accuracy"
)
print(rf_scores)
print(rf_scores.mean())
print(rf_scores.std())

[0.75524476 0.85314685 0.78873239 0.79577465 0.79577465]
0.7977346597064907
0.031510719563218964


In [None]:
# Using a pipeline: reload the raw CSV so preprocessing happens inside the pipeline
# instead of on a hand-edited frame.
dp = pd.read_csv('data/titanic.csv')
y = dp["Survived"]
X = dp.drop(columns=["Survived"])
dp.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Feature frame for the pipeline. The target is dropped explicitly so it can never reach
# the model, rather than relying on the ColumnTransformer's remainder='drop'.
X = dp.drop(columns=['Survived', 'Name', 'Ticket', 'Cabin'])
X_train_pip, X_test_pip, y_train_pip, y_test_pip = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# NOTE(review): 'PassengerId' looks like a row identifier rather than a real feature —
# confirm it is meant to be modelled.
num_features = ['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
cat_features = ['Sex', 'Embarked']

# Numeric columns: median imputation, then standardisation.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: most-frequent imputation, then one-hot encoding.
# handle_unknown='ignore' encodes a category that was absent from a training fold as
# all zeros instead of raising at transform time.
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Any column not listed in num_features / cat_features is discarded.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('cat', cat_transformer, cat_features),
    ],
    remainder='drop',
)

# Preprocessing lives inside the pipeline, so it is re-fitted on the training part of
# every CV fold and the validation part never influences the imputer/scaler/encoder.
model_pipeline = Pipeline(steps=[
    ('Preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=100, random_state=42)),
])

pip_skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# scoring is spelled out to match the other cross-validation cells (accuracy is also
# the classifier default).
pip_score = cross_val_score(
    model_pipeline, X_train_pip, y_train_pip, cv=pip_skf, scoring='accuracy'
)
print(pip_score)
print(pip_score.mean())
print(pip_score.std())

[0.77622378 0.83916084 0.83098592 0.76056338 0.78169014]
0.7977248104008667
0.03138032051555506
