# 1. Libraries for data cleaning and prediction modeling

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as sm
import statsmodels.api as sm2
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# 2. Train - test data

In [2]:
# Read the Kaggle Titanic training and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/titanic/train.csv",
        "/kaggle/input/titanic/test.csv",
    )
)

After the initial load, the columns that the competition guidelines describe as categorical are converted to a categorical dtype:
1. Survived
2. Pclass
3. Sex
4. Embarked

In [3]:
# Convert the categorical columns of the training set to a categorical dtype.
train["Survived"] = train["Survived"].astype("category")  # Related to survival or not
# Ordered from third to first class (related to socio-economic status).
train["Pclass"] = pd.Categorical(train["Pclass"], categories=[3, 2, 1], ordered=True)
train["Sex"] = train["Sex"].astype("category")
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated and does not update
# the DataFrame when pandas Copy-on-Write is enabled.
train["Embarked"] = train["Embarked"].fillna("S").astype("category")

In [4]:
# Convert the categorical columns of the test set to a categorical dtype.
# Ordered from third to first class (related to socio-economic status).
test["Pclass"] = pd.Categorical(test["Pclass"], categories=[3, 2, 1], ordered=True)
test["Sex"] = test["Sex"].astype("category")
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated and does not update
# the DataFrame when pandas Copy-on-Write is enabled.
test["Embarked"] = test["Embarked"].fillna("S").astype("category")

# 3. Prediction for filling NaN Ages through linear regression

A previous implementation showed that there are multiple NaN values in Age in both the train and test datasets. For this reason, a simple Linear Regression was used to replace these missing values:

In [5]:
# Rows whose Age is missing, and the predictor columns used to estimate it.
train_age = train.loc[train["Age"].isna()]
train_predict = train_age.loc[:, ["Pclass", "Survived", "SibSp"]]
test_age = test.loc[test["Age"].isna()]
test_predict_2 = test_age.loc[:, ["Pclass", "SibSp"]]
# Fitting data: the training set without the Cabin column and without any
# row that still contains a missing value.
train_stat = train.drop(columns="Cabin").dropna()

In [6]:
# Age model for the training rows: Survived is included as an extra predictor.
features_with_survived = ["Pclass", "Survived", "SibSp"]
model_predict = LinearRegression().fit(
    train_stat[features_with_survived], train_stat["Age"]
)
age_predict = model_predict.predict(train_predict)

# Age model for the test rows: fitted on the same complete training rows,
# using only Pclass and SibSp as predictors.
features_without_survived = ["Pclass", "SibSp"]
model_predict_2 = LinearRegression().fit(
    train_stat[features_without_survived], train_stat["Age"]
)
age_predict_2 = model_predict_2.predict(test_predict_2)

Then, with these results, the missing values in each dataset are filled using `.loc`:

In [7]:
# Index labels of the rows whose Age is missing, in prediction order.
nan_rows = train_predict.index.tolist()
nan_rows_2 = test_predict_2.index.tolist()

In [8]:
# Write the predicted ages back into the rows where Age was missing.
# The predictions are in the same order as the index labels in nan_rows /
# nan_rows_2, so one vectorized .loc assignment per dataset replaces the
# manual counter loops.
# abs() keeps any negative regression output positive and floor() truncates
# it to a whole number of years, matching int(abs(x)) applied per value.
train.loc[nan_rows, "Age"] = np.floor(np.abs(age_predict))
test.loc[nan_rows_2, "Age"] = np.floor(np.abs(age_predict_2))

# 4. Initial implementation with random forest

As a first approach, a random forest model was used together with a `StandardScaler`:

In [9]:
# Training with the following variables
train_cols = ["Pclass", "SibSp", "Age", "Sex", "Parch"]
train_mod = train[train_cols]
test_mod = test[train_cols]
# One-hot encode the categorical columns (Pclass, Sex).
train_num = pd.get_dummies(train_mod)
test_num = pd.get_dummies(test_mod)
train_y = train["Survived"]
# random_state is fixed so that the forest, and therefore the predictions
# derived from it, are reproducible when the notebook is re-run.
model = make_pipeline(
    StandardScaler(),
    RandomForestRegressor(max_depth=25, n_estimators=100, n_jobs=-1, random_state=42),
    verbose=True,
).fit(train_num, train_y)

[Pipeline] .... (step 1 of 2) Processing standardscaler, total=   0.0s
[Pipeline]  (step 2 of 2) Processing randomforestregressor, total=   0.3s


The first ten predictions are shown below as an example:

In [10]:
# Preview the first ten raw regressor outputs for the test features.
model.predict(test_num)[:10]

array([0.00833333, 0.02      , 0.55      , 0.75011905, 0.44315909,
       0.30727453, 0.41371429, 0.01      , 0.52003571, 0.11527249])

However, to obtain the desired output, 0.5 is used as the threshold for classifying each prediction as either one or zero:

In [11]:
y_fin = list(model.predict(test_num))
# Categorization by using 0.5 as threshold: values <= 0.5 map to 0, the rest to 1.
cat_result = np.where(np.asarray(y_fin) <= 0.5, 0, 1).tolist()
cat_result[:10]

[0, 0, 1, 1, 0, 0, 0, 0, 1, 0]

In [12]:
# Store the thresholded 0/1 labels as the Survived column of the test set.
test["Survived"]=cat_result

# 5. Adjustment for correct formatting

Finally, the result was compared with the sample submission file and formatted to match the competition guidelines:

In [13]:
# Display the sample submission file to check the expected columns.
pd.read_csv("/kaggle/input/titanic/gender_submission.csv")

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [14]:
# Display the same two columns from the test set for comparison.
test[["PassengerId","Survived"]]

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,1
3,895,1
4,896,0
...,...,...
413,1305,1
414,1306,1
415,1307,0
416,1308,1


In [15]:
#Output for checking
# Keep only the identifier and the predicted label.
Final_prediction=test[["PassengerId","Survived"]]
# The CSV export is left commented out in this cell.
#Final_prediction.to_csv("submission.csv",index=False)

# 6. Adding of an extra variable

In this case, the Fare variable is added to the model to test whether it improves the Random Forest accuracy before switching to another model.

In [16]:
# Training with the following variables (Fare added)
train_cols = ["Pclass", "SibSp", "Age", "Sex", "Parch", "Fare"]
train_mod = train[train_cols]
test_mod = test[train_cols]
# One-hot encode the categorical columns (Pclass, Sex).
train_num = pd.get_dummies(train_mod)
test_num = pd.get_dummies(test_mod)
train_y = train["Survived"]
# random_state is fixed so that the forest, and therefore the predictions
# derived from it, are reproducible when the notebook is re-run.
model = make_pipeline(
    StandardScaler(),
    RandomForestRegressor(max_depth=200, n_estimators=100, n_jobs=-1, random_state=42),
    verbose=True,
).fit(train_num, train_y)

[Pipeline] .... (step 1 of 2) Processing standardscaler, total=   0.0s
[Pipeline]  (step 2 of 2) Processing randomforestregressor, total=   0.3s


In [17]:
# Handling missing values of Fare in the test data: replace them with the
# mean of the observed test fares.
# fillna covers every missing row and is a no-op when nothing is missing, so
# the cell can be re-run safely; indexing missing_val[0] raised IndexError
# once the value had already been filled.
missing_val = test_num.index[test_num["Fare"].isna()]
test_num["Fare"] = test_num["Fare"].fillna(test_num["Fare"].mean())

In [18]:
# Preview the first ten raw regressor outputs after adding Fare.
model.predict(test_num)[:10]

array([0.14 , 0.04 , 0.5  , 0.68 , 0.42 , 0.205, 0.04 , 0.07 , 0.92 ,
       0.07 ])

In [19]:
y_fin = list(model.predict(test_num))
# Categorization by using 0.5 as threshold: values <= 0.5 map to 0, the rest to 1.
cat_result = np.where(np.asarray(y_fin) <= 0.5, 0, 1).tolist()
cat_result[:10]

[0, 0, 0, 1, 0, 0, 0, 0, 1, 0]

In [20]:
# Attach the thresholded labels to the test set, then write the two
# submission columns to disk without the row index.
test["Survived"] = cat_result
Final_prediction = test.loc[:, ["PassengerId", "Survived"]]
Final_prediction.to_csv("submission.csv", index=False)

With the extra variable, the model's accuracy increased to 73.2%. Next, XGBoost and different transformation values will be tested.

# 7. XGBoost and extra analysis

In this part, the same code is used, changing only the regressor to `XGBRegressor`:

In [21]:
# Training with the following variables
train_cols = ["Pclass", "SibSp", "Age", "Sex", "Parch", "Fare"]
train_mod = train[train_cols]
test_mod = test[train_cols]
# One-hot encode the categorical columns (Pclass, Sex).
train_num = pd.get_dummies(train_mod)
test_num = pd.get_dummies(test_mod)
train_y = train["Survived"]
# Handling missing values of Fare in the test data: replace them with the
# mean of the observed test fares. fillna covers every missing row and does
# not fail when nothing is missing (indexing missing_val[0] raised IndexError
# in that case).
missing_val = test_num.index[test_num["Fare"].isna()]
test_num["Fare"] = test_num["Fare"].fillna(test_num["Fare"].mean())
model = make_pipeline(
    StandardScaler(),
    XGBRegressor(max_depth=100, n_estimators=200, n_jobs=-1),
    verbose=True,
).fit(train_num, train_y)

[Pipeline] .... (step 1 of 2) Processing standardscaler, total=   0.0s
[Pipeline] ...... (step 2 of 2) Processing xgbregressor, total=   1.0s


In [22]:
# Preview the first ten raw XGBoost regressor outputs for the test features.
model.predict(test_num)[:10]

array([ 1.9871229e-01, -2.5262913e-02,  8.0714327e-01,  9.9687487e-01,
        5.9669137e-01,  1.3787173e-02,  1.1914261e-02, -1.3190627e-02,
        1.0392245e+00, -7.0593302e-04], dtype=float32)

In [23]:
y_fin = list(model.predict(test_num))
# Categorization by using 0.5 as threshold: values <= 0.5 map to 0, the rest to 1.
cat_result = np.where(np.asarray(y_fin) <= 0.5, 0, 1).tolist()
cat_result[:10]

[0, 0, 1, 1, 1, 0, 0, 0, 1, 0]

In [24]:
# Store the thresholded 0/1 labels as the Survived column of the test set.
test["Survived"]=cat_result
#Output for checking
# Keep only the identifier and the predicted label.
Final_prediction=test[["PassengerId","Survived"]]
# The CSV export is left commented out in this cell.
#Final_prediction.to_csv("submission.csv",index=False)

As identified, the random forest is a better option than XGBoost (last test score of 0.6866), but further variables will be added to see if there is an improvement.

# 7.1. Logistic Regression
As an additional tool for analysis and prediction, a Logistic Regression model was built, beginning with the identification of the significant variables, being only four:

In [25]:
# Training with the following variables
train_cols = ["Pclass", "SibSp", "Age", "Sex", "Embarked", "Fare"]  # Fare
train_mod = train[train_cols]
test_mod = test[train_cols]
# drop_first=True removes one level per categorical column.
# dtype=float keeps the dummy columns numeric: with boolean dummies (the
# pandas >= 2.0 default) the mixed bool/numeric design matrix is cast to
# object dtype, which statsmodels rejects.
train_num = pd.get_dummies(train_mod, drop_first=True, dtype=float)
test_num = pd.get_dummies(test_mod, drop_first=True, dtype=float)
train_y = train["Survived"]
# NOTE(review): sm2.Logit does not add an intercept on its own, so this fit
# has no constant term; wrap the design matrix in sm2.add_constant(...) here
# if an intercept is intended.
logit_model = sm2.Logit(train_y, train_num)
logit_model.fit().summary2()

Optimization terminated successfully.
         Current function value: 0.454757
         Iterations 6


0,1,2,3
Model:,Logit,Pseudo R-squared:,0.317
Dependent Variable:,Survived,AIC:,826.3777
Date:,2022-07-28 01:36,BIC:,864.7165
No. Observations:,891,Log-Likelihood:,-405.19
Df Model:,7,LL-Null:,-593.33
Df Residuals:,883,LLR p-value:,2.9027000000000003e-77
Converged:,1.0000,Scale:,1.0
No. Iterations:,6.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
SibSp,-0.2193,0.0917,-2.3923,0.0167,-0.3989,-0.0396
Age,-0.0228,0.0064,-3.5573,0.0004,-0.0354,-0.0102
Fare,0.0048,0.0025,1.9363,0.0528,-0.0001,0.0096
Pclass_2,1.4729,0.2306,6.3877,0.0000,1.0209,1.9248
Pclass_1,2.3639,0.3049,7.7538,0.0000,1.7664,2.9614
Sex_male,-2.2843,0.1849,-12.3539,0.0000,-2.6468,-1.9219
Embarked_Q,1.1010,0.3252,3.3852,0.0007,0.4635,1.7384
Embarked_S,0.4709,0.2054,2.2923,0.0219,0.0683,0.8736


With this in mind, a simple Logistic Regression model was built:

In [26]:
# Handling missing values of Fare in the test data: replace them with the
# mean of the observed test fares.
# fillna is a no-op when nothing is missing, so no try/except is needed to
# make the cell re-runnable, and real errors (for example a missing Fare
# column) are no longer silently swallowed by a bare except.
missing_val = test_num.index[test_num["Fare"].isna()]
test_num["Fare"] = test_num["Fare"].fillna(test_num["Fare"].mean())
# Fit the classifier on the encoded training features and predict the labels
# of the test set.
log_imp = LogisticRegression(max_iter=300)
log_imp.fit(train_num, train_y)
y_pred = log_imp.predict(test_num)

In [27]:
# Attach the predicted labels to the test set, then write the two
# submission columns to disk without the row index.
test["Survived"] = y_pred
Final_prediction = test.loc[:, ["PassengerId", "Survived"]]
Final_prediction.to_csv("submission.csv", index=False)

With this initial Logistic Regression model, the accuracy reaches its highest value so far.