In [1]:
import pandas as pd

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


SibSp = the number of siblings + spouse   
Parch = the nb of parents + children

In [4]:
dummies = pd.get_dummies(train.Sex)

In [5]:
dummies.head()

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [6]:
train = pd.concat([train, dummies], axis = 1).drop('Sex', axis=1)

In [7]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,female,male
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,S,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,C,1,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,S,1,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,S,1,0
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,S,0,1


In [9]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'female', 'male'],
      dtype='object')

In [72]:
# use df.loc to avoid getting warnings further along the way
X_train = train.loc[:, ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'female', 'male']]
y_train = train.Survived

In [73]:
X_train.isna().sum()

Pclass      0
Age       177
SibSp       0
Parch       0
Fare        0
female      0
male        0
dtype: int64

In [74]:
X_train_age = X_train.Age.copy()

We need to replace the NaN values with the mean

In [75]:
meanAge = X_train_age.mean()
meanAge

29.69911764705882

In [76]:
X_train.loc[:, 'Age'].head()

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: Age, dtype: float64

In [77]:
X_train.Age.fillna(meanAge, inplace=True)

In [78]:
X_train.isna().sum()

Pclass    0
Age       0
SibSp     0
Parch     0
Fare      0
female    0
male      0
dtype: int64

In [79]:
# also check for NaN values along y
y_train.isna().sum()

0

In [80]:
# The diff btween regression and classification algos is that regression algos algos are used to predict continuous values such 
# as price, while classification algos are used to predict discrete values such as true/false
from sklearn.ensemble import RandomForestClassifier

In [81]:
# clf stands for classifier
# n_estimators defines the nb of trees in the forest. 100 is the default, but it was marked here for clarity
clf = RandomForestClassifier(n_estimators=100)

In [82]:
clf.fit(X_train, y_train)

In [83]:
if 'Sex' in test.columns:
    dummies = pd.get_dummies(test.Sex)
    test = pd.concat([test, dummies], axis = 1).drop('Sex', axis=1)
else:
    print('The Sex column has already been dropped')

The Sex column has already been dropped


In [84]:
X_test = test.loc[:, ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'female', 'male']]

In [85]:
X_test.isna().sum()

Pclass     0
Age       86
SibSp      0
Parch      0
Fare       1
female     0
male       0
dtype: int64

In [86]:
X_train_age.isna().sum()

177

In [93]:
pd.concat([X_train_age, X_test.Age]).isna().sum()

263

In [90]:
age_series = pd.concat([X_train.Age, X_test.Age])

In [91]:
age_series.mean()

29.85479462267327

In [94]:
X_test.Age.fillna(age_series.mean(), inplace=True)

We still need to fill the missing fare

In [99]:
mean_fare = pd.concat([X_test.Fare, X_train.Fare]).mean()
mean_fare

33.29547928134557

In [100]:
X_test.Fare.fillna(mean_fare, inplace=True)

Now we are ready to predict

In [101]:
y_pred = clf.predict(X_test)

In [102]:
y_pred

array([0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [103]:
pred_series = pd.Series(y_pred)
pred_series

0      0
1      0
2      1
3      1
4      0
      ..
413    0
414    1
415    0
416    0
417    0
Length: 418, dtype: int64

In [106]:
submission = pd.concat([test.PassengerId, pred_series], axis=1)
submission.columns = ['PassengerId', 'Survived']

In [107]:
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,1
3,895,1
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [108]:
submission.to_csv('RandForestSubmission.csv',index=False)

With this prediction, got a score of 0.73684

Now let's try to improve.   
We start by adding the place of embarcation

In [118]:
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,female,male
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.2500,,S,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,C,1,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.9250,,S,1,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1000,C123,S,1,0
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.0500,,S,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",27.0,0,0,211536,13.0000,,S,0,1
887,888,1,1,"Graham, Miss. Margaret Edith",19.0,0,0,112053,30.0000,B42,S,1,0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",,1,2,W./C. 6607,23.4500,,S,1,0
889,890,1,1,"Behr, Mr. Karl Howell",26.0,0,0,111369,30.0000,C148,C,0,1


In [111]:
embarked_dummies = pd.get_dummies(train.Embarked)
embarked_dummies

Unnamed: 0,C,Q,S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
886,0,0,1
887,0,0,1
888,0,0,1
889,1,0,0


In [113]:
embarked_dummies.isna().sum()

C    0
Q    0
S    0
dtype: int64

In [121]:
train = pd.concat([train, embarked_dummies], axis=1).drop('Embarked', axis=1)

In [122]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'female', 'male', 'C', 'Q', 'S'],
      dtype='object')

In [123]:
X_train = train.loc[:, ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'female', 'male', 'C', 'Q', 'S']]

In [125]:
X_train.isna().sum()

Pclass      0
Age       177
SibSp       0
Parch       0
Fare        0
female      0
male        0
C           0
Q           0
S           0
dtype: int64

In [127]:
X_train.fillna(meanAge, inplace=True)

In [128]:
clf.fit(X_train, y_train)

In [130]:
embarked_dummies = pd.get_dummies(test.Embarked)
embarked_dummies.isna().sum()

C    0
Q    0
S    0
dtype: int64

In [132]:
test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
female           0
male             0
dtype: int64

In [134]:
test = pd.concat([test, embarked_dummies], axis=1).drop('Embarked', axis=1)

In [135]:
X_test = test.loc[:, ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'female', 'male', 'C', 'Q', 'S']].fillna(meanAge)
X_test

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,female,male,C,Q,S
0,3,34.500000,0,0,7.8292,0,1,0,1,0
1,3,47.000000,1,0,7.0000,1,0,0,0,1
2,2,62.000000,0,0,9.6875,0,1,0,1,0
3,3,27.000000,0,0,8.6625,0,1,0,0,1
4,3,22.000000,1,1,12.2875,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
413,3,29.699118,0,0,8.0500,0,1,0,0,1
414,1,39.000000,0,0,108.9000,1,0,1,0,0
415,3,38.500000,0,0,7.2500,0,1,0,0,1
416,3,29.699118,0,0,8.0500,0,1,0,0,1


In [141]:
predList = clf.predict(X_test)
predList

array([0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [144]:
def convertPredToSub(predList=predList, subTitle='submission.csv'):
    predSeries = pd.Series(predList)
    submission = pd.concat([test.PassengerId, predSeries], axis=1)
    submission.columns = ['PassengerId','Survived']
    submission.to_csv(subTitle,index=False)
    print('Submission saved!')

In [145]:
convertPredToSub()

Submission saved!


This time we got a score of 0.7488, which is slightly better.

Now let's try playing with the parameters of the random forest to improve the score.

In [146]:
clf = RandomForestClassifier(n_estimators=1000)

In [149]:
clf.fit(X_train, y_train)
predList = clf.predict(X_test)

In [148]:
convertPredToSub()

Submission saved!


Changing the nb of trees didn't have any impact on the score.

In [150]:
clf = RandomForestClassifier(n_estimators=100, max_depth=5)
clf.fit(X_train, y_train)
predList = clf.predict(X_test)

In [151]:
convertPredToSub()

Submission saved!


Still no impact on score.

In [152]:
clf = RandomForestClassifier(n_estimators=100, min_samples_split=4)
clf.fit(X_train, y_train)
predList = clf.predict(X_test)
convertPredToSub()

Submission saved!


Same score again.

In [153]:
clf = RandomForestClassifier(n_estimators=100, oob_score=True)
clf.fit(X_train, y_train)
predList = clf.predict(X_test)
convertPredToSub()

Submission saved!


Score didn't budge.