# Exploration and Preprocessing

![image.png](attachment:image.png)

In [72]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import HistGradientBoostingClassifier

from xgboost import XGBClassifier

In [2]:
training_df = pd.read_csv("./titanic/train.csv")

print(f'Number of datapoints: {len(training_df)} \nNumber of features: {len(training_df.columns)}\n')
print('-----------------------------')
print('Feature datatypes')
print('-----------------------------')
print(training_df.dtypes)
print('-----------------------------\n')

training_df.head()

Number of datapoints: 891 
Number of features: 12

-----------------------------
Feature datatypes
-----------------------------
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object
-----------------------------



Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Get a count of the nulls here
training_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


So Age and Cabin have nulls

In [54]:
testing_df = pd.read_csv("./titanic/test.csv")

print(f'Number of datapoints: {len(testing_df)} \nNumber of features: {len(testing_df.columns)}\n')
print('-----------------------------')
print('Feature datatypes')
print('-----------------------------')
print(testing_df.dtypes)
print('-----------------------------\n')

testing_df.head()

Number of datapoints: 418 
Number of features: 11

-----------------------------
Feature datatypes
-----------------------------
PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object
-----------------------------



Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [49]:
# Get a count of the nulls here
testing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.1+ KB


In [6]:
print(f'fare mean: {training_df["Fare"].mean()} | min: {training_df["Fare"].min()} | max: {training_df["Fare"].max()}')

fare mean: 32.204207968574636 | min: 0.0 | max: 512.3292


In [32]:
def create_submission(classifier, X, y, X_test):
    classifier.fit(X, y)
    y_pred = classifier.predict(X_test)

    output = pd.DataFrame({'PassengerId': testing_df.PassengerId, 'Survived': y_pred})
    output.to_csv('submission.csv', index=False)
    print("Your submission was successfully saved!")

In [46]:
def evaluate_classifier(classifier, X, y, iterations, text=''):
    accs = []
    for i in range(iterations):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i, stratify=y)
        
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        accs += [accuracy_score(y_test, y_pred)]

    print(f'{text:<20s} acc={np.mean(accs):.3f} with {iterations} iterations')

In [60]:
# features_to_use = ["Pclass", "Sex", "SibSp", "Parch", "Fare"]
features_to_use = ["Pclass", "Sex", "Fare"]
# features_to_use = ["Pclass", "Sex", "SibSp", "Parch"]

# numerical_data = training_df[["Fare"]].values

# Replace the original numerical feature with the scaled values in the DataFrame
# training_df["Fare"] = StandardScaler().fit_transform(numerical_data)

X = pd.get_dummies(training_df[features_to_use])
y = training_df["Survived"]

X_test = pd.get_dummies(testing_df[features_to_use])

print(X_test.isna().any())

# X_test.fillna(X_test["Fare"].mean(), inplace=True)

print(X_test.isna().any())

create_submission(GradientBoostingClassifier(n_estimators=100, max_depth=5, random_state=1), X, y, X_test)

Pclass        False
Fare           True
Sex_female    False
Sex_male      False
dtype: bool
Pclass        False
Fare           True
Sex_female    False
Sex_male      False
dtype: bool


ValueError: Input X contains NaN.
GradientBoostingClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [77]:
evaluate_classifier(svm.SVC(class_weight='balanced', kernel='linear', C=2), X, y, 10, 'SVM (linear)')
evaluate_classifier(svm.SVC(class_weight='balanced', kernel='rbf', gamma=2, C=2), X, y, 10, 'SVM (rbf)')

evaluate_classifier(XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:logistic', random_state=1), X, y, 10, 'XGBoost')
evaluate_classifier(GradientBoostingClassifier(n_estimators=100, max_depth=5, random_state=1), X, y, 10, 'GradientBoosting')
evaluate_classifier(HistGradientBoostingClassifier(random_state=1), X, y, 10, 'HistGradientBoosting')

evaluate_classifier(GaussianNB(), X, y, 10, 'GaussianNB')

evaluate_classifier(KNeighborsClassifier(n_neighbors=3), X, y, 10, 'KNeighbors')

evaluate_classifier(AdaBoostClassifier(n_estimators=100, random_state=1), X, y, 10, 'AdaBoost')

evaluate_classifier(ExtraTreesClassifier(n_estimators=100, random_state=1), X, y, 10, 'ExtraTrees')
evaluate_classifier(DecisionTreeClassifier(random_state=1), X, y, 10, 'DecisionTree')
evaluate_classifier(RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1), X, y, 10, 'RandomForest')

SVM (linear)         acc=0.791 with 10 iterations
SVM (rbf)            acc=0.790 with 10 iterations
XGBoost              acc=0.784 with 10 iterations
GradientBoosting     acc=0.835 with 10 iterations
HistGradientBoosting acc=0.827 with 10 iterations
GaussianNB           acc=0.785 with 10 iterations


TypeError: KNeighborsClassifier.__init__() got an unexpected keyword argument 'random_state'