In [71]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [55]:
train_data_og = pd.read_csv('./train.csv')

print(train_data_og.dtypes)

print(train_data_og.isnull().sum())

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [56]:
train_data = train_data_og.drop('Cabin',  axis=1)
print(train_data.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         2
dtype: int64


In [57]:
train_data['Embarked'].fillna(method = 'pad', inplace=True)
print(train_data.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         0
dtype: int64


## feature engineering:
* Age: randomly assign age for missing passangers 1 std dev from the mean.
* Embarked: convert it to numerical. this is an ordinal attribute.
* Ticket: drop it. not a good feature.
* Sex: convert to numerical nominal. 

In [58]:
train_data.drop('Name', axis=1, inplace=True)
train_data.drop('Ticket', axis=1, inplace=True)

age_mean = train_data['Age'].mean()
print(age_mean)

age_std = train_data['Age'].std()
print(age_std)

train_data['Age'] = train_data['Age'].fillna(value= np.random.uniform(age_mean-age_std, age_mean+age_std))
print(train_data.isnull().sum())

print( train_data['Age'].mean())
print(train_data['Age'].std())

train_data['Sex'] = train_data['Sex'].replace(to_replace='male', value='1')
train_data['Sex'] = train_data['Sex'].replace(to_replace='female', value='2') 

train_data['Embarked'] = train_data['Embarked'].replace(to_replace='C', value='1') 
train_data['Embarked'] = train_data['Embarked'].replace(to_replace='S', value='2') 
train_data['Embarked'] = train_data['Embarked'].replace(to_replace='Q', value='3') 



29.69911764705882
14.526497332334042
PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64
30.284625710422993
13.055146630075003


Q. Why did the mean and std after fillna, not change?

In [59]:
train_data.sample(n=10).style

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
99,100,0,2,1,34.0,1,0,26.0,2
704,705,0,3,1,26.0,1,0,7.8542,2
803,804,1,3,1,0.42,0,1,8.5167,1
524,525,0,3,1,32.646506,0,0,7.2292,1
583,584,0,1,1,36.0,0,0,40.125,1
49,50,0,3,2,18.0,1,0,17.8,2
554,555,1,3,2,22.0,0,0,7.775,2
266,267,0,3,1,16.0,4,1,39.6875,2
316,317,1,2,2,24.0,1,0,26.0,2
406,407,0,3,1,51.0,0,0,7.75,2


In [60]:
x_train, x_test, y_train, y_test = train_test_split(train_data.drop(['Survived'],axis = 1), train_data['Survived'], test_size = 0.2)

rf = RandomForestClassifier(n_estimators=1000)
rf.fit(x_train, y_train)


In [73]:
predictions = rf.predict(x_test)
# print(y_test)
# print(predictions)

errors = abs(predictions-y_test)
# print(errors)
print('MAE:', round(np.mean(errors),2))

# accuracy = 100 - np.mean(100 * (errors/y_test))
# accuracy = 100* np.mean(abs(predictions-errors))
print('Accuracy:', 100*round(accuracy_score(y_test, predictions),2),'%')

MAE: 0.24
Accuracy: 76.0 %


Q. Why is accuracy 10% less than other submitted solutions for Random forest and decision trees?

### Train and test dataset should be combined to perform data cleaning and data wrangling together:


In [92]:
train_data_og = pd.read_csv('./train.csv')
test_data_og = pd.read_csv('./test.csv')

train_data = train_data_og
test_data = test_data_og


combined_dataset = [train_data,test_data]

for dataset in combined_dataset:
    dataset.drop('Cabin',  axis=1, inplace=True)
    dataset.drop('Name', axis=1, inplace=True)
    dataset.drop('Ticket', axis=1, inplace=True)
    dataset.drop('PassengerId',axis = 1, inplace=True)

    dataset['Embarked'].fillna(method = 'pad', inplace=True)
    dataset['Sex'] = dataset['Sex'].replace(to_replace='male', value='1')
    dataset['Sex'] = dataset['Sex'].replace(to_replace='female', value='2') 

    dataset['Embarked'] = dataset['Embarked'].replace(to_replace='C', value='1') 
    dataset['Embarked'] = dataset['Embarked'].replace(to_replace='S', value='2') 
    dataset['Embarked'] = dataset['Embarked'].replace(to_replace='Q', value='3') 



For some reason, train_data_og has also been updated and is equal to combined_dataset[0]. Maybe it was copied by reference?

In [93]:
age_mean = train_data['Age'].mean()
age_std = train_data['Age'].std()

train_data['Age'] = train_data['Age'].fillna(value= np.random.uniform(age_mean-age_std, age_mean+age_std))
test_data['Age'] = test_data['Age'].fillna(value= np.random.uniform(age_mean-age_std, age_mean+age_std))

# x_train, x_test, y_train, y_test = train_test_split(train_data.drop(['Survived'],axis = 1), train_data['Survived'], test_size = 0.2)

train_features = train_data.drop(['Survived'],axis = 1)
train_labels = train_data['Survived']
test_features = test_data

rf = RandomForestClassifier(n_estimators=500)
rf.fit(train_features, train_labels)

    

In [97]:
print(test_data.)
# predict_labels = rf.predict(test_features)
# print(test_data.sampele(n=3).style)
# result = pd.DataFrame({"PassengerId": test_data_og["PassengerId"],
#                         "Survived":predictions
                        # })

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
204,2,1,25.0,0,0,10.5,2
266,1,1,19.299097,0,0,0.0,2
91,3,1,19.299097,0,0,7.775,2
