In [141]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [122]:
train_data_og = pd.read_csv('./train.csv')

print(train_data_og.dtypes)

print(train_data_og.isnull().sum())

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [123]:
train_data = train_data_og.drop('Cabin',  axis=1)
print(train_data.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         2
dtype: int64


In [124]:
train_data['Embarked'].fillna(method = 'pad', inplace=True)
print(train_data.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         0
dtype: int64


## feature engineering:
* Age: randomly assign age for missing passangers 1 std dev from the mean.
* Embarked: convert it to numerical. this is an ordinal attribute.
* Ticket: drop it. not a good feature.
* Sex: convert to numerical nominal. 

In [125]:
train_data.drop('Name', axis=1, inplace=True)
train_data.drop('Ticket', axis=1, inplace=True)

age_mean = train_data['Age'].mean()
print(age_mean)

age_std = train_data['Age'].std()
print(age_std)

train_data['Age'] = train_data['Age'].fillna(value= np.random.uniform(age_mean-age_std, age_mean+age_std))
print(train_data.isnull().sum())

print( train_data['Age'].mean())
print(train_data['Age'].std())

train_data['Sex'] = train_data['Sex'].replace(to_replace='male', value='1')
train_data['Sex'] = train_data['Sex'].replace(to_replace='female', value='2') 

train_data['Embarked'] = train_data['Embarked'].replace(to_replace='C', value='1') 
train_data['Embarked'] = train_data['Embarked'].replace(to_replace='S', value='2') 
train_data['Embarked'] = train_data['Embarked'].replace(to_replace='Q', value='3') 



29.69911764705882
14.526497332334042
PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64
28.027338772208
13.429042646714695


Q. Why did the mean and std after fillna, not change?

In [126]:
train_data.sample(n=10).style

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
531,532,0,3,1,21.283553,0,0,7.2292,1
350,351,0,3,1,23.0,0,0,9.225,2
124,125,0,1,1,54.0,0,1,77.2875,2
614,615,0,3,1,35.0,0,0,8.05,2
487,488,0,1,1,58.0,0,0,29.7,1
652,653,0,3,1,21.0,0,0,8.4333,2
512,513,1,1,1,36.0,0,0,26.2875,2
581,582,1,1,2,39.0,1,1,110.8833,1
293,294,0,3,2,24.0,0,0,8.85,2
397,398,0,2,1,46.0,0,0,26.0,2


In [128]:
# Random Forest
x_train, x_test, y_train, y_test = train_test_split(train_data.drop(['Survived'],axis = 1), train_data['Survived'], test_size = 0.2)

rf = RandomForestClassifier(n_estimators=1000)
rf.fit(x_train, y_train)

predictions_rf = rf.predict(x_test)


errors = abs(predictions_rf-y_test)

print('MAE:', round(np.mean(errors),2))
print('RF Accuracy:', 100*round(accuracy_score(y_test, predictions_rf),2),'%')

MAE: 0.16
RF Accuracy: 84.0 %


In [132]:
# logistic regression

lr = LogisticRegression()
lr.fit(x_train,y_train)
predictions_lr = lr.predict(x_test)
print('LR Accuracy:', 100*round(accuracy_score(y_test, predictions_lr),2),'%')

LR Accuracy: 77.0 %


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [133]:
# KNN

knn = KNeighborsClassifier()
knn.fit(x_train,y_train)
predictions_knn = knn.predict(x_test)
print('knn Accuracy:', 100*round(accuracy_score(y_test, predictions_rf),2),'%')

knn Accuracy: 84.0 %


Q. Why is accuracy 10% less than other submitted solutions for Random forest and decision trees?

### Train and test dataset should be combined to perform data cleaning and data wrangling together:


In [142]:
train_data_og = pd.read_csv('./train.csv')
test_data_og = pd.read_csv('./test.csv')

train_data = train_data_og
test_data = test_data_og.copy() #implement deepcopy ie. copy by value


combined_dataset = [train_data,test_data]

for dataset in combined_dataset:
    dataset.drop('Cabin',  axis=1, inplace=True)
    dataset.drop('Name', axis=1, inplace=True)
    dataset.drop('Ticket', axis=1, inplace=True)
    dataset.drop('PassengerId',axis = 1, inplace=True)

    dataset['Embarked'].fillna(method = 'pad', inplace=True)
    dataset['Fare'].fillna(method = 'pad', inplace=True)
    dataset['Sex'] = dataset['Sex'].replace(to_replace='male', value='1')
    dataset['Sex'] = dataset['Sex'].replace(to_replace='female', value='2') 

    dataset['Embarked'] = dataset['Embarked'].replace(to_replace='C', value='1') 
    dataset['Embarked'] = dataset['Embarked'].replace(to_replace='S', value='2') 
    dataset['Embarked'] = dataset['Embarked'].replace(to_replace='Q', value='3') 



For some reason, train_data_og has also been updated and is equal to combined_dataset[0]. Maybe it was copied by reference?

In [143]:
age_mean = train_data['Age'].mean()
age_std = train_data['Age'].std()

train_data['Age'] = train_data['Age'].fillna(value= np.random.uniform(age_mean-age_std, age_mean+age_std))
test_data['Age'] = test_data['Age'].fillna(value= np.random.uniform(age_mean-age_std, age_mean+age_std))

# x_train, x_test, y_train, y_test = train_test_split(train_data.drop(['Survived'],axis = 1), train_data['Survived'], test_size = 0.2)

train_features = train_data.drop(['Survived'],axis = 1)
train_labels = train_data['Survived']
test_features = test_data

    

In [154]:
rf = RandomForestClassifier(n_estimators=500)
# rf.fit(train_features, train_labels)
# rf.get_params()

knn = KNeighborsClassifier()
knn.fit(train_features, train_labels)

param_grid = {'n_estimators': [200,500,1000], 
            'max_depth': [4,5,6,7,8,9,10,11,12],
            # 'max_features': ['auto','sqrt','log2'],
            'criterion': ['gini', 'entropy', 'log_loss']}

CV_rf = GridSearchCV(estimator=rf, param_grid=param_grid, cv = 5)
CV_rf.fit(train_features, train_labels)

In [157]:
CV_rf.best_params_

rf_new = RandomForestClassifier(criterion= 'log_loss', max_depth= 10, n_estimators= 500)
rf_new.fit(train_features, train_labels)

In [158]:
# print(test_data.isnull().sum())
predict_labels = rf_new.predict(test_features)
# predict_labels = knn.predict(test_features)
# test_data_og.sample(n=3).style
# print(predict_labels.shape,test_data.shape, test_data_og["PassengerId"].shape)
result = pd.DataFrame({"PassengerId": test_data_og["PassengerId"],
                        "Survived":predict_labels
                        })
result.sample(n=5).style

Unnamed: 0,PassengerId,Survived
199,1091,0
137,1029,0
98,990,0
3,895,0
178,1070,1


In [159]:
result.to_csv('./final_submission.csv', index = False)