In [71]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston

from sklearn.datasets import fetch_openml

# Load the Titanic dataset
titanic_data = fetch_openml('titanic', version=1, as_frame=True)

# Access the data as a pandas DataFrame
X = titanic_data['data']
y = titanic_data['target']

# Display the first few rows of the dataset
# print(X.head())
# print(y.head())

In [72]:
train = pd.DataFrame(titanic_data['data'], columns = titanic_data.feature_names)
train.head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [73]:
train['Survived'] = y
train.reset_index()
train['Passengerid'] = train.reset_index()['index']
train.head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,Survived,Passengerid
0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO",1,0
1,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON",1,1
2,1.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",0,2
3,1.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON",0,3
4,1.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",0,4


In [74]:
#checking for missing data
NAS = pd.concat([train.isnull().sum()], axis = 1, keys = ['Train'])
NAS

Unnamed: 0,Train
pclass,0
name,0
sex,0
age,263
sibsp,0
parch,0
ticket,0
fare,1
cabin,1014
embarked,2


In [75]:
NAS[NAS.sum(axis = 1)>0]

Unnamed: 0,Train
age,263
fare,1
cabin,1014
embarked,2
boat,823
body,1188
home.dest,564


In [76]:
train['age']=train['age'].fillna(train['age'].mean())
train['fare']=train['fare'].fillna(train['fare'].mean())

train['embarked']=train['embarked'].fillna(train['embarked'].mode()[0]) #most common value
train['cabin'] = train['cabin'].fillna(train['cabin'].mode()[0]) #most common value
train['boat'] = train['boat'].fillna(train['boat'].mode()[0]) #most common value
train['body'] = train['body'].fillna(train['body'].mode()[0]) #most common value
train['home.dest'] = train['home.dest'].fillna(train['home.dest'].mode()[0]) #most common value


In [77]:
NAs = pd.concat([train.isnull().sum()], axis=1, keys = ['Train'])
NAs[NAs.sum(axis=1)>0]

Unnamed: 0,Train


In [81]:
train['pclass'] = train['pclass'].apply(str)
train['sex'] = train['sex'].astype('object')
train['embarked'] = train['embarked'].astype('object')

train.dtypes

pclass           object
name             object
sex              object
age             float64
sibsp           float64
parch           float64
ticket           object
fare            float64
cabin            object
embarked         object
boat             object
body            float64
home.dest        object
Survived       category
Passengerid       int64
dtype: object

In [82]:
# Getting Dummies for all other categorical vars
for col in train.dtypes[train.dtypes == 'object'].index:
    for_dummy = train.pop(col)
    train = pd.concat([train, pd.get_dummies(for_dummy, prefix = col)], axis = 1)
train.head()

Unnamed: 0,age,sibsp,parch,fare,body,Survived,Passengerid,pclass_1.0,pclass_2.0,pclass_3.0,...,"home.dest_Wimbledon Park, London / Hayling Island, Hants","home.dest_Windsor, England New York, NY","home.dest_Winnipeg, MB","home.dest_Winnipeg, MN","home.dest_Woodford County, KY","home.dest_Worcester, England","home.dest_Worcester, MA","home.dest_Yoevil, England / Cottage Grove, OR","home.dest_Youngstown, OH","home.dest_Zurich, Switzerland"
0,29.0,0.0,0.0,211.3375,1.0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.9167,1.0,2.0,151.55,1.0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2.0,1.0,2.0,151.55,1.0,0,2,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,30.0,1.0,2.0,151.55,135.0,0,3,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,25.0,1.0,2.0,151.55,1.0,0,4,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [83]:
#labels = train.pop('Survived')
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(train, labels, test_size = 0.25)

In [84]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(x_train, y_train)

RandomForestClassifier()

In [87]:
y_pred = rf.predict(x_test)

In [95]:
len(np.array(y_test))
#len(y_pred)

328

In [100]:
y_test = y_test.astype(int)

In [106]:
y_pred = y_pred.astype(int)

In [107]:
from sklearn.metrics import roc_curve, auc
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred.astype(int))
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc

0.9974747474747474

## Ada-Boost

In [110]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')

In [115]:
x_train, x_test, y_train, y_test = train_test_split(train, labels, test_size = 0.2)

In [116]:
AdaModel = AdaBoostClassifier(n_estimators = 100, learning_rate = 1)

In [117]:
model = AdaModel.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [119]:
print('Accuracy:', metrics.accuracy_score(y_test,y_pred))

Accuracy: 1.0


In [121]:
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test.astype(int), y_pred.astype(int))
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc

1.0

In [128]:
from sklearn.linear_model import LogisticRegression
lg = LogisticRegression()

adabc = AdaBoostClassifier(n_estimators = 150, base_estimator = lg, learning_rate = 1)

In [129]:
model = adabc.fit(x_train, y_train)

In [130]:
y_pred = model.predict(x_test)

In [131]:
print('Accuracy:', metrics.accuracy_score(y_test,y_pred))

Accuracy: 0.9770992366412213


In [132]:
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test.astype(int), y_pred.astype(int))
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc

0.9716981132075472