#### Import

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()
sns.set()
#%matplotlib inline


In [2]:
df = pd.read_csv("./train.csv")

#### Drop unnecessary columns

In [3]:
df = df.drop(columns=['Name', 'Ticket', 'PassengerId', 'Cabin', 'Embarked'])
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925
3,1,1,female,35.0,1,0,53.1
4,0,3,male,35.0,0,0,8.05


In [4]:
df['Age'].mean()

29.69911764705882

In [5]:
df['Age'].median()

28.0

#### Encoding for categorical variables

In [6]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,1,22.0,1,0,7.25
1,1,1,0,38.0,1,0,71.2833
2,1,3,0,26.0,0,0,7.925
3,1,1,0,35.0,1,0,53.1
4,0,3,1,35.0,0,0,8.05


In [7]:
df['Ageismissing'] = df['Age'].isnull()
df.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Ageismissing
0,0,3,1,22.0,1,0,7.25,False
1,1,1,0,38.0,1,0,71.2833,False
2,1,3,0,26.0,0,0,7.925,False
3,1,1,0,35.0,1,0,53.1,False
4,0,3,1,35.0,0,0,8.05,False
5,0,3,1,,0,0,8.4583,True
6,0,1,1,54.0,0,0,51.8625,False
7,0,3,1,2.0,3,1,21.075,False
8,1,3,0,27.0,0,2,11.1333,False
9,1,2,0,14.0,1,0,30.0708,False


#### Impute with mean

In [8]:
df_imputed = df.copy(deep=True)
df_imputed['Age'] = df_imputed['Age'].fillna(df_imputed['Age'].mean())
df_imputed.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Ageismissing
0,0,3,1,22.0,1,0,7.25,False
1,1,1,0,38.0,1,0,71.2833,False
2,1,3,0,26.0,0,0,7.925,False
3,1,1,0,35.0,1,0,53.1,False
4,0,3,1,35.0,0,0,8.05,False
5,0,3,1,29.699118,0,0,8.4583,True
6,0,1,1,54.0,0,0,51.8625,False
7,0,3,1,2.0,3,1,21.075,False
8,1,3,0,27.0,0,2,11.1333,False
9,1,2,0,14.0,1,0,30.0708,False


#### Drop additional column for simple model

In [9]:
df_imputed = df_imputed.drop(columns=['Ageismissing'])

#### Create Y

In [10]:
y = df_imputed['Survived']
df_imputed = df_imputed.drop(columns=['Survived'])
df_imputed.head(10)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,1,22.0,1,0,7.25
1,1,0,38.0,1,0,71.2833
2,3,0,26.0,0,0,7.925
3,1,0,35.0,1,0,53.1
4,3,1,35.0,0,0,8.05
5,3,1,29.699118,0,0,8.4583
6,1,1,54.0,0,0,51.8625
7,3,1,2.0,3,1,21.075
8,3,0,27.0,0,2,11.1333
9,2,0,14.0,1,0,30.0708


#### First simple regression with simple model (without add column)

In [11]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
X_train, X_test,y_train,y_test = train_test_split(df_imputed,y,test_size=0.3)
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train,y_train)
pred = lr.predict(X_test)
print(metrics.accuracy_score(pred,y_test))

0.7611940298507462


In [12]:
df.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Ageismissing
0,0,3,1,22.0,1,0,7.25,False
1,1,1,0,38.0,1,0,71.2833,False
2,1,3,0,26.0,0,0,7.925,False
3,1,1,0,35.0,1,0,53.1,False
4,0,3,1,35.0,0,0,8.05,False
5,0,3,1,,0,0,8.4583,True
6,0,1,1,54.0,0,0,51.8625,False
7,0,3,1,2.0,3,1,21.075,False
8,1,3,0,27.0,0,2,11.1333,False
9,1,2,0,14.0,1,0,30.0708,False


#### Create Advanced Dataframe

In [13]:
df_advanced_model = df.copy(deep=True)
y1 = df_advanced_model['Survived']
df_advanced_model = df_advanced_model.drop(columns=['Survived'])
df_advanced_model.head(10)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Ageismissing
0,3,1,22.0,1,0,7.25,False
1,1,0,38.0,1,0,71.2833,False
2,3,0,26.0,0,0,7.925,False
3,1,0,35.0,1,0,53.1,False
4,3,1,35.0,0,0,8.05,False
5,3,1,,0,0,8.4583,True
6,1,1,54.0,0,0,51.8625,False
7,3,1,2.0,3,1,21.075,False
8,3,0,27.0,0,2,11.1333,False
9,2,0,14.0,1,0,30.0708,False


#### Median imputation FIXED[ERROR HERE] but where column in regression? and values change every time

In [14]:
from sklearn.impute import SimpleImputer
my_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
#data_new = my_imputer.fit_transform(df_advanced_model)
df_advanced_model['Age'] = my_imputer.fit_transform(df_advanced_model['Age'].array.reshape(-1, 1))
df_advanced_model.head(10)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Ageismissing
0,3,1,22.0,1,0,7.25,False
1,1,0,38.0,1,0,71.2833,False
2,3,0,26.0,0,0,7.925,False
3,1,0,35.0,1,0,53.1,False
4,3,1,35.0,0,0,8.05,False
5,3,1,29.699118,0,0,8.4583,True
6,1,1,54.0,0,0,51.8625,False
7,3,1,2.0,3,1,21.075,False
8,3,0,27.0,0,2,11.1333,False
9,2,0,14.0,1,0,30.0708,False


#### Second regression with original df (added column)

In [15]:
X_train, X_test,y_train,y_test = train_test_split(df_advanced_model,y1,test_size=0.3)
lr = LogisticRegression()
lr.fit(X_train,y_train)
pred = lr.predict(X_test)
print(metrics.accuracy_score(pred,y_test))

0.8022388059701493


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
