In [117]:
import numpy as np # library for numerical computing in Python
import pandas as pd # library for data manipulation and analysis (DataFrames)


# scikit-learn: open-source ML library for Python that simplifies the implementation of various ML and
# data-modeling tasks. Designed for both supervised and unsupervised learning,
# it provides tools for classification, regression, clustering, and more.

from sklearn.model_selection import train_test_split # splits a dataset into training and testing subsets
from sklearn.impute import SimpleImputer # fills in missing values in a dataset
from sklearn.preprocessing import OneHotEncoder # converts categorical data into binary indicator columns (a format suitable for ML)
from sklearn.preprocessing import MinMaxScaler # scales features to a specific range, typically between 0 and 1 (NOTE: not used in the cells visible here)
from sklearn.tree import DecisionTreeClassifier # implements the decision-tree algorithm for classification tasks

In [118]:
# Load the passenger table from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="train.csv")

In [119]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [120]:
# Remove the columns that are not passed on to the model.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this cell
# idempotent: re-running it after the columns are already gone no longer raises KeyError.
df = df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"], errors="ignore")

In [121]:
# Show the first five rows again to confirm which columns remain.
df.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,34.5,0,0,7.8292,Q
1,1,3,female,47.0,1,0,7.0,S
2,0,2,male,62.0,0,0,9.6875,Q
3,0,3,male,27.0,0,0,8.6625,S
4,1,3,female,22.0,1,1,12.2875,S


In [122]:
# Step 1 -> train/test split
# Features: every column of df except "Survived". Target: the "Survived" column.
feature_frame = df.drop(columns=["Survived"])
survived_labels = df["Survived"]

# test_size=0.2   -> 20% of the rows are held out for testing; the remaining 80% train the model.
# random_state=42 -> fixes the shuffle so the same split is produced on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    feature_frame,
    survived_labels,
    test_size=0.2,
    random_state=42,
)

In [123]:
# Count missing values per feature column in the training split.
X_train.isna().sum()

Pclass       0
Sex          0
Age         72
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

In [124]:
# Peek at the first five training labels (the index shows the shuffled row ids).
Y_train.head(n=5)

336    0
31     0
84     0
287    0
317    0
Name: Survived, dtype: int64

In [125]:
# Count missing values per column over the whole frame (train and test rows together).
df.isna().sum()

Survived     0
Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

In [138]:
# Fill missing Age / Fare values with the column mean.
# Each imputer learns its mean from the training split only ...
si_age = SimpleImputer(strategy='mean').fit(X_train[['Age']])
si_fare = SimpleImputer(strategy='mean').fit(X_train[['Fare']])

# ... and that same training mean is then applied to both splits, so the test rows
# never influence the fill value.
X_train_age, X_test_age = (si_age.transform(split[['Age']]) for split in (X_train, X_test))
X_train_fare, X_test_fare = (si_fare.transform(split[['Fare']]) for split in (X_train, X_test))



In [139]:
# Sanity check: (rows, columns) of the imputed training Fare array.
X_train_fare.shape

(334, 1)

In [140]:
# One-hot encode the two categorical features.
# sparse_output=False     -> return a dense NumPy array.
# handle_unknown="ignore" -> a category not seen during fit is encoded as all zeros
#                            instead of raising an error.
ohe_sex = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
ohe_embarked = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

# Learn the category vocabulary from the training split only.
X_train_sex = ohe_sex.fit_transform(X_train[["Sex"]])
X_train_embarked = ohe_embarked.fit_transform(X_train[["Embarked"]])

# Test split: transform ONLY. Calling fit_transform here would refit the encoders on the
# test rows, so the test columns could differ in number/order from the training columns,
# and the encoder objects left in memory would be the test-fitted ones rather than the
# ones whose encoding the classifier is trained on.
X_test_sex = ohe_sex.transform(X_test[["Sex"]])
X_test_embarked = ohe_embarked.transform(X_test[["Embarked"]])



In [144]:
# Print the shape of each transformed training array, one per line.
print(
    *(part.shape for part in (X_train_sex, X_train_embarked, X_train_age, X_train_fare)),
    sep="\n",
)

(334, 2)
(334, 3)
(334, 1)
(334, 1)


In [148]:
# Keep the columns that are not replaced by a separately transformed array
# (Sex, Age and Embarked are re-added later from their encoded / imputed versions).
# Fare stays in this frame, so overwrite it with the mean-imputed values; otherwise the raw
# Fare column (NaN included) is what reaches the model and X_*_fare is never used.
# Column order and frame shape are unchanged.
X_train_rem = X_train.drop(columns=["Sex", "Age", "Embarked"]).assign(Fare=X_train_fare.ravel())
X_test_rem = X_test.drop(columns=["Sex", "Age", "Embarked"]).assign(Fare=X_test_fare.ravel())



In [149]:
# Sanity check: (rows, columns) of the test frame after dropping Sex, Age and Embarked.
X_test_rem.shape

(84, 4)

In [150]:
import numpy as np  # repeated from the first cell, as in the original

# Assemble the final model input by stacking the feature groups side by side.
# For 2-D inputs np.hstack joins along the column axis (axis=1).
# Resulting column order: passthrough columns, imputed Age, one-hot Sex, one-hot Embarked.
X_train_transform = np.hstack([X_train_rem, X_train_age, X_train_sex, X_train_embarked])

# The test split is assembled in exactly the same order so columns line up with training.
X_test_transform = np.hstack([X_test_rem, X_test_age, X_test_sex, X_test_embarked])


In [152]:
# Sanity check: (rows, columns) of the assembled test feature matrix.
X_test_transform.shape

(84, 10)

In [153]:
# Fit a decision tree on the assembled training features.
# random_state is pinned because the tree builder permutes candidate features at each split,
# so an unseeded tree (and its predictions) can differ between runs. 42 matches the seed
# already used for the train/test split.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train_transform, Y_train)

In [155]:
# Predict a label for every row of the held-out test matrix.
Y_pred = clf.predict(X_test_transform)
# Display the predicted labels as the cell output.
Y_pred

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1], dtype=int64)

In [157]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label equals the true label.
# NOTE(review): the recorded result is a perfect 1.0, which is unusual for a plain
# decision tree - confirm that the labels are not a deterministic function of one feature.
accuracy_score(y_true=Y_test, y_pred=Y_pred)

1.0

In [158]:
import pickle

In [None]:
import os

# Persist the fitted encoders and the classifier so they can be reused outside this notebook.
# Create the target folder first: open(..., 'wb') fails if it is missing.
os.makedirs('models', exist_ok=True)

# "with" guarantees each file handle is flushed and closed even if pickling fails;
# the original left three open handles for the garbage collector to clean up.
with open('models/ohe_sex.pkl', 'wb') as fh:
    pickle.dump(ohe_sex, fh)
with open('models/ohe_embarked.pkl', 'wb') as fh:
    pickle.dump(ohe_embarked, fh)
with open('models/clf.pkl', 'wb') as fh:
    pickle.dump(clf, fh)

# NOTE(review): the fitted imputers (si_age, si_fare) are not saved here; a consumer that
# has to handle missing Age/Fare at prediction time would presumably need them - confirm.