In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
import scipy, math

%matplotlib inline

In [7]:
raw_data = pd.read_csv("train.csv", low_memory=False)

In [4]:
# joining two datasets together for further data processing
datasets = {'train': train, 'test': test}
data = pd.concat(datasets)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


In [9]:
raw_data.shape

(891, 12)

In [10]:
raw_data.head().T

Unnamed: 0,0,1,2,3,4
PassengerId,1,2,3,4,5
Survived,0,1,1,1,0
Pclass,3,1,3,1,3
Name,"Braund, Mr. Owen Harris","Cumings, Mrs. John Bradley (Florence Briggs Th...","Heikkinen, Miss. Laina","Futrelle, Mrs. Jacques Heath (Lily May Peel)","Allen, Mr. William Henry"
Sex,male,female,female,female,male
Age,22,38,26,35,35
SibSp,1,1,0,1,0
Parch,0,0,0,0,0
Ticket,A/5 21171,PC 17599,STON/O2. 3101282,113803,373450
Fare,7.25,71.2833,7.925,53.1,8.05


In [11]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [12]:
raw_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [10]:
# encode categorical values
cat = [column for column in data.columns if data[column].dtypes == 'object']
cat

['Embarked', 'Sex']

In [11]:
data = pd.get_dummies(data, prefix=["Sex", "Embarked"], columns=["Sex", "Embarked"])

In [12]:
data.head().T

Unnamed: 0_level_0,train,train,train,train,train
Unnamed: 0_level_1,0,1,2,3,4
Age,22.0,38.0,26.0,35.0,35.0
Fare,7.25,71.2833,7.925,53.1,8.05
Parch,0.0,0.0,0.0,0.0,0.0
PassengerId,1.0,2.0,3.0,4.0,5.0
Pclass,3.0,1.0,3.0,1.0,3.0
SibSp,1.0,1.0,0.0,1.0,0.0
Survived,0.0,1.0,1.0,1.0,0.0
Sex_female,0.0,1.0,1.0,1.0,0.0
Sex_male,1.0,0.0,0.0,0.0,1.0
Embarked_C,0.0,1.0,0.0,0.0,0.0


In [13]:
# finding out some info about NaN values
data.isnull().any()

Age             True
Fare            True
Parch          False
PassengerId    False
Pclass         False
SibSp          False
Survived        True
Sex_female     False
Sex_male       False
Embarked_C     False
Embarked_Q     False
Embarked_S     False
dtype: bool

#### There surely are NaN's in test set where Survived feature should be in test. So, I will not take them into an account. 

In [14]:
# replacing missing values with mean
for column in data.columns:
    if pd.isna(data[column]).any() and column != "Survived":
        data[column].fillna(data[column].mean(), inplace=True)

In [15]:
data.isnull().any()

Age            False
Fare           False
Parch          False
PassengerId    False
Pclass         False
SibSp          False
Survived        True
Sex_female     False
Sex_male       False
Embarked_C     False
Embarked_Q     False
Embarked_S     False
dtype: bool

In [16]:
# splitting train data and test data
train_data = data.loc['train']
test_X = pd.DataFrame(data.loc['test'].drop(["Survived"], axis=1))

In [20]:
test_X.shape

(418, 11)

In [18]:
def split_val(a, n): return a[:n], a[n:]

In [21]:
# splitting all of the train data into train and validation sets, then into main X features and target variable
n_valid = 200 # the size of our validation set
n_trn = len(train_data) - n_valid
train_raw, valid_raw = split_val(train_data, n_trn)
train_X, valid_X = train_raw.drop(["Survived"], axis=1), valid_raw.drop(["Survived"], axis=1)
train_y, valid_y = train_raw["Survived"], valid_raw["Survived"]
train_X.shape, train_y.shape, valid_X.shape, valid_y.shape

((691, 11), (691,), (200, 11), (200,))

### Base model

In [22]:
def rmse(x, y):
    return math.sqrt(((x - y) ** 2).mean())

In [23]:
def print_score(m):
    res = [rmse(m.predict(train_X), train_y), rmse(m.predict(valid_X), valid_y),
                m.score(train_X, train_y), m.score(valid_X, valid_y)]
    if hasattr(m, 'oob_score_'): res.append(m.oob_score_)
    print(res)

In [24]:
# bare RandomForest without tuning parameters
from sklearn.ensemble import RandomForestClassifier 
m = RandomForestClassifier(n_jobs=-1)
%time m.fit(train_X, train_y)
print_score(m)



CPU times: user 36.7 ms, sys: 11.9 ms, total: 48.6 ms
Wall time: 197 ms
[0.13178063603857842, 0.41231056256176607, 0.9826338639652678, 0.83]


#### Now let's try to improve our RandomForest model and analyze it by tuning the hyperparameters such as oob_score, min_samples_leaf and n_estimators

In [45]:
m = RandomForestClassifier(n_estimators=100, n_jobs=-1, oob_score=True, min_samples_leaf=3, max_features='sqrt')
m.fit(train_X, train_y)
print_score(m)

[0.32054584415571397, 0.39370039370059057, 0.8972503617945007, 0.845, 0.8205499276410999]


In [39]:
predicted = m.predict(test_X)

In [24]:
type(train_x["PassengerId"])

pandas.core.series.Series

In [126]:
result = pd.merge(pd.DataFrame(train_x["PassengerId"]), predicted, right_index=True, left_index=True)
result.rename(columns={0: 'Survived'}, inplace=True)
result

Unnamed: 0,PassengerId,Survived
0,1,0.0
1,2,0.0
2,3,0.0
3,4,0.0
4,5,1.0
...,...,...
413,414,0.0
414,415,1.0
415,416,0.0
416,417,0.0


In [138]:
result.to_csv("Result.csv", index=False)

In [140]:
pred = pd.read_csv("Result.csv")
pred.head()

Unnamed: 0,PassengerId,Survived
0,1,0.0
1,2,0.0
2,3,0.0
3,4,0.0
4,5,1.0
