In [None]:
import warnings
warnings.filterwarnings('ignore')


import scipy as sp
import pandas as pd
import re
import numpy as np


# Data visualization
import matplotlib as mlt
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Scalers
from sklearn.preprocessing import StandardScaler

#discritization
from sklearn.preprocessing import LabelEncoder

#Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier

#model selection
from sklearn.model_selection import GridSearchCV

#evaluation
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn import metrics




In [None]:
# Load the Kaggle Titanic train / test CSV files.
train = pd.read_csv(r'../input/titanic/train.csv')
test = pd.read_csv(r'../input/titanic/test.csv')
# Jupyter only renders the *last* expression of a cell, so a bare
# `train.head(15)` in the middle of the cell is silently discarded.
# display() renders the preview regardless of its position in the cell.
display(train.head(15))

# check missing data
def missing_data(data):
    """Summarise the missing values of every column of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One column per column of ``data`` and three rows:
        ``'Total'`` (number of nulls), ``'Precent'`` (nulls as a percentage
        of the number of rows) and ``'Types'`` (dtype name as a string).
    """
    null_mask = data.isnull()
    total = null_mask.sum()
    percent = total / null_mask.count() * 100
    # Iterate over dtypes directly (not data[col]) so duplicated column
    # names do not break the lookup.
    types = [str(dtype) for dtype in data.dtypes]
    # The 'Precent' label (sic) is kept unchanged so existing output and any
    # code indexing the summary by that label keep working.
    df = pd.DataFrame({'Total': total, 'Precent': percent, 'Types': types})
    # Use the DataFrame's own transpose: `scipy.transpose` was only a
    # re-export of the NumPy function and is not available in current SciPy.
    return df.T

# Only the last expression of a cell is rendered automatically, so the
# train summary would otherwise be computed and thrown away. Display both.
display(missing_data(train))
display(missing_data(test))



**Describe dataset** 
* The 11 features include categorical, continuous, and ordinal features.
* "PassengerId" may be irrelevant.
* We need to extract information from categorical features like "Name", "Ticket", and "Cabin".
* Categorical and ordinal features like "Embarked" and "Sex" need discretization.


**Handle missing values** 

We notice that "Age", "Fare", "Embarked", and "Cabin" have missing values, so we first need to handle them.

* "Age" and "Fare" (continuous)

We construct a density plot to inspect the distributions. The distribution of "Fare" is right-skewed, hence we use median imputation.

* "Cabin" and "Embarked" (categorical)

For "Cabin", we simply fill the missing values with the placeholder "no value". For "Embarked", we impute the most frequent value.

In [None]:
#missing value imputation
y_train = train['Survived']
X_train = train.drop(columns='Survived')
# Stack train and test so both receive identical preprocessing. Train rows
# come first. ignore_index=True gives the combined frame a unique 0..n-1
# index instead of repeating the labels 0..len(test)-1, which keeps later
# label-based slicing unambiguous.
data = pd.concat([X_train, test], ignore_index=True)
# Fill through DataFrame.fillna and rebind the result.
# `data[col].fillna(..., inplace=True)` is chained assignment: it acts on a
# temporary Series and leaves `data` unchanged under pandas Copy-on-Write.
data = data.fillna({
    'Age': data['Age'].median(),
    'Fare': data['Fare'].median(),
    'Embarked': data['Embarked'].mode()[0],
    'Cabin': 'no value',
})
# Encode sex as 1 = male, 0 = anything else.
data['Sex'] = (data['Sex'] == 'male').astype(int)
# Family size = siblings/spouses + parents/children + the passenger.
data["FamilySize"] = data["SibSp"] + data["Parch"] + 1

# A title is the first run of ASCII letters immediately followed by a period
# (e.g. "Mr."). Raw string: "\." in a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
regex = r"([A-Za-z]+)\."
def get_title(row):
    """Extract the honorific title (including its trailing period) from a name.

    Parameters
    ----------
    row : object
        A passenger name; it is converted with ``str()`` before matching.

    Returns
    -------
    str
        The first match of ``regex`` in the name, e.g. ``"Mr."``, or an
        empty string when the name contains no such pattern (instead of
        raising ``AttributeError`` on the missing match).
    """
    match = re.search(regex, str(row))
    if match is None:
        return ""
    # group(0) is the whole match, i.e. the letters plus the period.
    return match.group(0)
# Derive every passenger's title from the "Name" column, then tabulate
# how often each title occurs.
data['Title'] = data['Name'].map(get_title)
data['Title'].value_counts()


In [None]:
#https://www.kaggle.com/code/shuvojitdas/predictive-data-analysis-on-titanic-dataset?kernelSessionId=98672795
# Collapse spelling variants and rare titles into a handful of groups.
title_groups = {
    'Mlle.': 'Miss.',
    'Ms.': 'Miss.',
    'Mme.': 'Mrs.',
    'Capt.': 'Army.',
    'Col.': 'Army.',
    'Major.': 'Army.',
    'Countess.': 'Noble.',
    'Don.': 'Noble.',
    'Jonkheer.': 'Noble.',
    'Lady.': 'Noble.',
    'Sir.': 'Noble.',
}
data['Title'] = data['Title'].replace(title_groups)
# drop unwanted columns that do not contribute to survivability
data = data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch', 'Embarked'])

#split the train test dataset for traindataset analysis
# Split by position: the training rows are the first len(y_train) rows of
# `data`. Label-based slicing (`data.loc[:last_label]`) only works while the
# boundary label happens to be unique in the concatenated index.
n_train = len(y_train)
X_train = data.iloc[:n_train]
X_test = data.iloc[n_train:]
train = pd.concat([y_train, X_train], axis=1)
X_train

**Analysis of the train dataset**

* It seems that sex, pclass, and fare are more correlated with survival.
* "Embarked" doesn't seem very helpful.
* Children seem to have a higher survival rate -> we need to add a binary feature.
* People with titles (noble)

In [None]:
#https://www.kaggle.com/code/shuvojitdas/predictive-data-analysis-on-titanic-dataset?kernelSessionId=98672795
print('Total number of passangers who survivded : ', len(train[train['Survived'] == 1]))
print('Total number of passangers who died : ', len(train[train['Survived'] == 0]))

# 'Sex' was encoded during preprocessing as 1 = male, 0 = female.
print('Total percentage of male passangers who survivded : ', 100*np.mean(train['Survived'][train['Sex'] == 1]))
print('Total percentage of female passangers who survivded : ', 100*np.mean(train['Survived'][train['Sex'] == 0]))

print('Total percentage of passangers who survivded from first class : ', 100*np.mean(train['Survived'][train['Pclass'] == 1]))
print('Total percentage of passangers who survivded from second class : ', 100*np.mean(train['Survived'][train['Pclass'] == 2]))
print('Total percentage of passangers who survivded from third class : ', 100*np.mean(train['Survived'][train['Pclass'] == 3]))

# Survival rate per title. The original cell printed this table twice; the
# 'Embarked' variant stays disabled because that column was dropped earlier.
print('Percentage of average survival:\n\n{}\n'.format(train.groupby('Title')['Survived'].mean()*100))
#print('Percentage of average survival:\n\n{}\n'.format(train.groupby('Embarked')['Survived'].mean()*100))

# One stacked histogram per column, split by survival outcome.
for i in train:
    plt.figure(figsize=(13,7))
    sns.histplot(data = train, x=i, kde=True, hue = 'Survived', multiple='stack')
    plt.title(i)
    plt.show()
plt.figure(figsize = (10, 8))
# 'Title' still holds strings at this point. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns and raises instead,
# so restrict the correlation matrix to the numeric columns explicitly.
sns.heatmap(train.select_dtypes(include='number').corr(), annot = True)

**Feature engineering**

In [None]:
# add a new column indicating adult or not
def children(df, adult_age=18):
    """Add a binary ``'children'`` column derived from ``'Age'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``'Age'`` column. It is modified in place.
    adult_age : int or float, default 18
        Ages greater than or equal to this value count as adult.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``df['children']`` set to 0 where
        ``Age >= adult_age`` and 1 otherwise. A missing age (NaN) fails the
        ``>=`` comparison and is therefore flagged 1.
    """
    # Vectorised replacement for the per-row loop. Negate `>=` rather than
    # testing `<` so NaN ages still map to 1, as in the original else branch.
    is_adult = df['Age'] >= adult_age
    df['children'] = (~is_adult).astype('int64')
    return df
# Adds the 'children' indicator column to `data` (the function assigns the
# column on the frame it is given) and renders the returned frame as the
# cell output.
children(data)

In [None]:
#discretization
# Map each distinct title string to an integer code; the fitted encoder is
# kept in `le`.
le = LabelEncoder()
le.fit(data["Title"])
data["Title"] = le.transform(data["Title"])
data

In [None]:
#scaler
sc = StandardScaler()
# Last index label of the training frame; kept for any later cell that
# reads `train_len`.
train_len = X_train.index[-1]
# Split by position instead of by label: the training rows are the first
# len(y_train) rows of `data`. Label slicing only works while the boundary
# label is unique in the concatenated index.
n_train = len(y_train)
# Fit on the training rows only. StandardScaler ignores any target passed
# to fit(), so none is supplied.
sc = sc.fit(data.iloc[:n_train])
# Apply the train-fitted scaling to every row (train and test) and rebuild
# the frame with the same labels.
data = pd.DataFrame(sc.transform(data), columns=data.columns, index=data.index)

X_train = data.iloc[:n_train]
X_test = data.iloc[n_train:]
train = pd.concat([y_train, X_train], axis=1)
train

In [None]:
# Hold out 28% of the labelled rows as a validation set.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.28, random_state = 42)

#scaler
# Fit the scaler on the training split only, then apply that same
# transformation to the validation and test sets. Re-fitting on X_test
# (fit_transform) would scale it with different statistics, and leaving
# X_val untransformed would evaluate the models on inputs in a different
# representation from the one they were trained on.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_val = sc.transform(X_val)
X_test = sc.transform(X_test)

In [None]:
#RF model
RFmodel = RandomForestClassifier(n_estimators=100, max_depth=len(train.columns)-1, random_state=1)
RFmodel.fit(X_train, y_train);
skf = StratifiedKFold(n_splits=10)
# cross_val_score clones and re-fits the estimator on each fold, so run it
# on the training split. Cross-validating on the held-out validation rows
# only scores small models trained on validation data and says nothing
# about the fitted RFmodel.
results = cross_val_score(RFmodel, X_train, y_train, cv=skf)

print("train accuracy: \n ", metrics.accuracy_score(y_train, RFmodel.predict(X_train)))
print("cross validation accuracy:  \n",results)
print("Avg cross validation accuracy:  \n",np.mean(results))
# Held-out performance of the model that was actually fitted above.
print("validation accuracy: \n ", metrics.accuracy_score(y_val, RFmodel.predict(X_val)))
# plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement.
metrics.ConfusionMatrixDisplay.from_estimator(RFmodel, X_val, y_val)

In [None]:
#logistic regression
# NOTE(review): `svm` is not used in this cell; the import is kept in case a
# later cell relies on it.
from sklearn import svm
LRmodel= LogisticRegression()
LRmodel.fit(X_train, y_train)
skf = StratifiedKFold(n_splits=10)
# cross_val_score clones and re-fits the estimator on each fold, so run it
# on the training split. Cross-validating on the held-out validation rows
# says nothing about the fitted LRmodel.
results = cross_val_score(LRmodel, X_train, y_train, cv=skf)
print("train accuracy: \n ", metrics.accuracy_score(y_train, LRmodel.predict(X_train)))
print("cross validation accuracy:  \n",results)
print("Avg cross validation accuracy:  \n",np.mean(results))
# Held-out performance of the model that was actually fitted above.
print("validation accuracy: \n ", metrics.accuracy_score(y_val, LRmodel.predict(X_val)))
# plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement.
metrics.ConfusionMatrixDisplay.from_estimator(LRmodel, X_val, y_val)


In [None]:
# Gradient Boosting
# Fixed seed so repeated runs of the notebook give the same model.
gadient_boosting = GradientBoostingClassifier(random_state=1)
gadient_boosting.fit(X_train, y_train)
skf = StratifiedKFold(n_splits=10)
# cross_val_score clones and re-fits the estimator on each fold, so run it
# on the training split. Cross-validating on the held-out validation rows
# says nothing about the fitted model.
results = cross_val_score(gadient_boosting, X_train, y_train, cv=skf)

print("train accuracy: \n ", metrics.accuracy_score(y_train, gadient_boosting.predict(X_train)))
print("cross validation accuracy:  \n",results)
print("Avg cross validation accuracy:  \n",np.mean(results))
# Held-out performance of the model that was actually fitted above.
print("validation accuracy: \n ", metrics.accuracy_score(y_val, gadient_boosting.predict(X_val)))
# plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement.
metrics.ConfusionMatrixDisplay.from_estimator(gadient_boosting, X_val, y_val)


In [None]:
# Stacking experiment, currently disabled: it may not be worth using because
# it did not improve the result.
# NOTE(review): the block below is a bare string literal, not a comment. It
# is still evaluated and, being the cell's last expression, echoed as the
# cell output. Consider deleting the cell or converting it to `#` comments.
'''from sklearn.ensemble import StackingClassifier
estimators = [('rf',RFmodel), ('lr', LRmodel),('gradient_boosting',gadient_boosting)]
clf = StackingClassifier(estimators=estimators, final_estimator=RFmodel)
clf.fit(X_train, y_train).score(X_val, y_val)'''

In [None]:
# Predict the test set with the random forest and write the submission file.
y_test = RFmodel.predict(X_test)
output = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': y_test})
output.to_csv('./submission.csv', index=False)
# Preview goes last: only a cell's final expression is rendered, so a
# head() placed before to_csv() is never shown.
output.head()