In [1]:
import pandas as pd
import numpy as np

In [2]:
data_frame = pd.read_csv("./train.csv")

In [3]:
data_frame.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


**We are going to use the DecisionTree model; however, this model cannot work with strings, so we need to transform strings into float numbers using a "std" function down in code.**

In [4]:
data_frame.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
data_frame.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

**Because we have to much Cabin NaN values we will drop that column, this is not case with Age we are going to fill NaN values with median**

In [6]:
from sklearn.preprocessing import OrdinalEncoder

In [7]:
def std(data_frame):
    encoder = OrdinalEncoder()
    encoder.fit(data_frame[['Name', 'Sex', 'Cabin', 'Embarked', 'Ticket']])
    encoded_data = encoder.transform(data_frame[['Name', 'Sex', 'Cabin', 'Embarked', 'Ticket']])
    
    data = pd.DataFrame(encoded_data)
    data_frame['Name'] = data.iloc[:, 0]
    data_frame['Sex'] = data.iloc[:, 1]
    data_frame['Cabin'] = data.iloc[:, 2]
    data_frame['Embarked'] = data.iloc[:, 3]
    data_frame['Ticket'] = data.iloc[:, 4]
    
    age_mean = data_frame['Age'].mean()
    data_frame['Age'] = data_frame['Age'].fillna(age_mean)
    embark_mean = data_frame['Embarked'].mean()
    data_frame['Embarked'] = data_frame['Embarked'].fillna(embark_mean)

    
    data_frame = data_frame.drop(['Cabin', 'PassengerId'], axis=1) #EMB TODO
    #data_frame.dropna(inplace=True)
    
   # from sklearn.preprocessing import StandardScaler
   # scaler = StandardScaler()
   # data_frame['Name'] = scaler.fit_transform(data_frame[['Name']])
   # data_frame['Age'] = scaler.fit_transform(data_frame[['Age']])
   # data_frame['Ticket'] = scaler.fit_transform(data_frame[['Ticket']])
   # data_frame['Fare'] = scaler.fit_transform(data_frame[['Fare']])
    return data_frame

In [8]:
data_frame = std(data_frame)
data_frame

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,0,3,108.0,1.0,22.000000,1,0,523.0,7.2500,2.0
1,1,1,190.0,0.0,38.000000,1,0,596.0,71.2833,0.0
2,1,3,353.0,0.0,26.000000,0,0,669.0,7.9250,2.0
3,1,1,272.0,0.0,35.000000,1,0,49.0,53.1000,2.0
4,0,3,15.0,1.0,35.000000,0,0,472.0,8.0500,2.0
...,...,...,...,...,...,...,...,...,...,...
886,0,2,548.0,1.0,27.000000,0,0,101.0,13.0000,2.0
887,1,1,303.0,0.0,19.000000,0,0,14.0,30.0000,2.0
888,0,3,413.0,0.0,29.699118,1,2,675.0,23.4500,2.0
889,1,1,81.0,1.0,26.000000,0,0,8.0,30.0000,0.0


***We want to train and determine which hyperparameters are best for our model without peeking into the test set because we aim to prevent overfitting. Therefore, we will perform a train/test split to identify the optimal hyperparameters***

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X = data_frame.drop('Survived', axis=1)

In [11]:
Y = data_frame['Survived']

In [12]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, stratify=Y, train_size=0.75)

In [13]:
X_train.shape

(668, 9)

In [14]:
Y_train.shape

(668,)

In [15]:
X_train.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
173,3,752.0,1.0,21.0,0,0,659.0,7.925,2.0
572,1,251.0,1.0,36.0,0,0,578.0,26.3875,2.0
516,2,472.0,0.0,34.0,0,0,560.0,10.5,2.0
586,2,395.0,1.0,47.0,0,0,129.0,15.0,2.0
105,3,542.0,1.0,28.0,0,0,354.0,7.8958,2.0


In [16]:
X_train.describe()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
count,668.0,668.0,668.0,668.0,668.0,668.0,668.0,668.0,668.0
mean,2.330838,442.923653,0.648204,29.133203,0.489521,0.357784,342.741018,31.055619,1.539724
std,0.83575,259.463612,0.477889,12.818082,1.026578,0.750302,199.944281,49.753308,0.789452
min,1.0,0.0,0.0,0.42,0.0,0.0,0.0,0.0,0.0
25%,2.0,215.75,0.0,21.75,0.0,0.0,164.0,7.8958,1.0
50%,3.0,447.5,1.0,29.699118,0.0,0.0,343.5,13.825,2.0
75%,3.0,666.25,1.0,35.0,1.0,0.0,519.25,30.5,2.0
max,3.0,890.0,1.0,74.0,8.0,5.0,680.0,512.3292,2.0


In [17]:
X_train.isna().sum()

Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Embarked    0
dtype: int64

In [18]:
from sklearn.metrics import accuracy_score, f1_score

In [19]:
from sklearn.tree import DecisionTreeClassifier

In [20]:
def decisionModel(X_train, Y_train, n):
    dmodel = DecisionTreeClassifier(max_depth=n)
    dmodel.fit(X_train, Y_train)
   # predict1 = dmodel.predict(X_test)
   # print(accuracy_score(Y_test, predict1), f1_score(Y_test, predict1))
    return dmodel

In [21]:
#dmodel = decisionModel(X, Y)

**After this for loop we will have the best depth for our tree**

In [22]:
n = 0
best_ac = 0
for i in range(1, 15):
    dmodel = decisionModel(X_train, Y_train, i)
    predict1 = dmodel.predict(X_test)
    if accuracy_score(Y_test, predict1) > best_ac:
        best_ac = accuracy_score(Y_test, predict1)
        n = i     
    print(accuracy_score(Y_test, predict1))

0.7982062780269058
0.7982062780269058
0.820627802690583
0.7757847533632287
0.7802690582959642
0.7892376681614349
0.7847533632286996
0.757847533632287
0.7802690582959642
0.7713004484304933
0.7443946188340808
0.7488789237668162
0.7443946188340808
0.7533632286995515


In [23]:
#from sklearn.linear_model import LinearRegression

In [24]:
#lmodel = LinearRegression()
#lmodel.fit(X_train, Y_train)
#predict = lmodel.predict(X_test)
#pmin = predict.min()
#pmax = predict.max()
#predict = (predict-pmin)/(pmax-pmin)
#predict = np.where(predict < 0.5 , 0 , 1)
#print(accuracy_score(Y_test, predict), f1_score(Y_test, predict))

**Now we will train our model on whole data with best parameters**

In [25]:
finalModel = decisionModel(X, Y, n)

In [26]:
test_df = pd.read_csv("./test.csv")
#print(test_df.head())
print(test_df.shape)
passId = test_df['PassengerId']
test_df = std(test_df)


(418, 11)


In [27]:
test_df

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,3,206.0,1.0,34.50000,0,0,152.0,7.8292,1.0
1,3,403.0,0.0,47.00000,1,0,221.0,7.0000,2.0
2,2,269.0,1.0,62.00000,0,0,73.0,9.6875,1.0
3,3,408.0,1.0,27.00000,0,0,147.0,8.6625,2.0
4,3,178.0,0.0,22.00000,1,1,138.0,12.2875,2.0
...,...,...,...,...,...,...,...,...,...
413,3,353.0,1.0,30.27259,0,0,267.0,8.0500,2.0
414,1,283.0,0.0,39.00000,0,0,324.0,108.9000,0.0
415,3,332.0,1.0,38.50000,0,0,346.0,7.2500,2.0
416,3,384.0,1.0,30.27259,0,0,220.0,8.0500,2.0


In [28]:
pr = finalModel.predict(test_df)

In [30]:
res_frame = pd.DataFrame(pr, columns= ['Survived'])
res_frame['PassengerId'] = passId
res_frame

Unnamed: 0,Survived,PassengerId
0,0,892
1,1,893
2,0,894
3,0,895
4,1,896
...,...,...
413,0,1305
414,1,1306
415,0,1307
416,0,1308


In [31]:
res = pd.read_csv("./gender_submission.csv")

In [32]:
res

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [35]:
accuracy_score(res.iloc[:,1], res_frame.iloc[:,0])

0.9712918660287081

***We got the 97% accuracy!***

In [34]:
res_frame.to_csv('res.csv', index=False)