### Import/Install Dependencies

In [3]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-win_amd64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.3-py3-none-win_amd64.whl (99.8 MB)
   ---------------------------------------- 99.8/99.8 MB 7.7 MB/s eta 0:00:00
Installing collected packages: xgboost
Successfully installed xgboost-2.0.3
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
import sklearn.metrics
import xgboost as xgb

In [None]:
# pandas config

### Read data/ data summary

In [8]:
data = pd.read_csv('C:\\Users\\User-Kate\\Downloads\\Titanic-Dataset.csv')
data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [9]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [11]:
data['Survived'].value_counts() #class imbalance?

0    549
1    342
Name: Survived, dtype: int64

### Data Preprocessing

In [None]:
#duplicate, fillna/dropna, normalize, split, one-hot-encoding

In [12]:
data.drop(columns=['Name', 'Ticket', 'Cabin'], inplace=True)

In [13]:
embarked = pd.get_dummies(data['Embarked'])

In [14]:
scaler = MinMaxScaler(feature_range=(-1, 1))
norm = pd.DataFrame(scaler.fit_transform(data[['Age', 'Fare']]), data.index, columns=['norm_Age', 'norm_Fare'])

In [15]:
data = pd.concat([data.drop(columns=['Embarked', 'Age', 'Fare']), embarked, norm], axis=1)
data['norm_Age'] = data['norm_Age'].fillna(data['norm_Age'].median())
data['Sex'] = np.vectorize(lambda x: True if x == 'male' else False)(data['Sex'])
data.set_index(['PassengerId'], inplace=True)
data.sort_index(inplace=True)
data

Unnamed: 0_level_0,Survived,Pclass,Sex,SibSp,Parch,C,Q,S,norm_Age,norm_Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,3,True,1,0,0,0,1,-0.457653,-0.971698
2,1,1,False,1,0,1,0,0,-0.055542,-0.721729
3,1,3,False,0,0,0,0,1,-0.357125,-0.969063
4,1,1,False,1,0,0,0,1,-0.130937,-0.792711
5,0,3,True,0,0,0,0,1,-0.130937,-0.968575
...,...,...,...,...,...,...,...,...,...,...
887,0,2,True,0,0,0,0,1,-0.331993,-0.949251
888,1,1,False,0,0,0,0,1,-0.533049,-0.882888
889,0,3,False,1,2,0,0,1,-0.306861,-0.908457
890,1,1,True,0,0,1,0,0,-0.357125,-0.882888


In [16]:
data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Survived   891 non-null    int64  
 1   Pclass     891 non-null    int64  
 2   Sex        891 non-null    bool   
 3   SibSp      891 non-null    int64  
 4   Parch      891 non-null    int64  
 5   C          891 non-null    uint8  
 6   Q          891 non-null    uint8  
 7   S          891 non-null    uint8  
 8   norm_Age   891 non-null    float64
 9   norm_Fare  891 non-null    float64
dtypes: bool(1), float64(2), int64(4), uint8(3)
memory usage: 52.2 KB


In [None]:
# Kfoldcv, as we are not doing splitting

### Classification Model 1 - SVM?

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix


# Define features and target variable
X = data[['Pclass', 'Sex', 'SibSp', 'Parch', 'C', 'Q', 'S', 'norm_Age', 'norm_Fare']]
y = data['Survived']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create the SVM model
svm_model = SVC(kernel='linear', random_state=42)

# Train the model
svm_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = svm_model.predict(X_test_scaled)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7821229050279329


### Classification Model 2 - RandomForest/Gradient Tree Boosting?

### Result/Analysis/Visualization