In [5]:
from sklearn import preprocessing 
from sklearn.model_selection import GridSearchCV 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import RandomForestRegressor

import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
pd.options.mode.chained_assignment = None

In [6]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
submit = pd.read_csv('gender_submission.csv')

In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [31]:
data = train.append(test)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
Age            1046 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
Fare           1308 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
dtypes: float64(3), int64(4), object(5)
memory usage: 132.9+ KB


In [32]:
data.reset_index(inplace=True, drop=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
Age            1046 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
Fare           1308 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
dtypes: float64(3), int64(4), object(5)
memory usage: 122.8+ KB


In [35]:
data['Family_Size'] = data['Parch'] + data['SibSp']
data['Family_Size']

0       1
1       1
2       0
3       1
4       0
5       0
6       0
7       4
8       2
9       1
10      2
11      0
12      0
13      6
14      0
15      0
16      5
17      0
18      1
19      0
20      0
21      0
22      0
23      0
24      4
25      6
26      0
27      5
28      0
29      0
       ..
1279    0
1280    4
1281    0
1282    1
1283    2
1284    0
1285    4
1286    1
1287    0
1288    2
1289    0
1290    0
1291    0
1292    1
1293    1
1294    0
1295    1
1296    0
1297    1
1298    2
1299    0
1300    2
1301    0
1302    1
1303    0
1304    0
1305    0
1306    0
1307    0
1308    2
Name: Family_Size, Length: 1309, dtype: int64

In [36]:
data['Title1'] = data['Name'].str.split(", ", expand=True)[1]
data['Title1'] = data['Title1'].str.split(".", expand=True)[0]
data['Title2'] = data['Title1'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','the Countess','Jonkheer','Col','Rev','Capt','Sir','Don','Dona'],
         ['Miss','Mrs','Miss','Mr','Mr','Mrs','Mrs','Mr','Mr','Mr','Mr','Mr','Mr','Mrs'])
data['Title2']

0           Mr
1          Mrs
2         Miss
3          Mrs
4           Mr
5           Mr
6           Mr
7       Master
8          Mrs
9          Mrs
10        Miss
11        Miss
12          Mr
13          Mr
14        Miss
15         Mrs
16      Master
17          Mr
18         Mrs
19         Mrs
20          Mr
21          Mr
22        Miss
23          Mr
24        Miss
25         Mrs
26          Mr
27          Mr
28        Miss
29          Mr
         ...  
1279        Mr
1280    Master
1281        Mr
1282       Mrs
1283    Master
1284        Mr
1285        Mr
1286       Mrs
1287        Mr
1288       Mrs
1289        Mr
1290        Mr
1291      Miss
1292        Mr
1293      Miss
1294        Mr
1295        Mr
1296        Mr
1297        Mr
1298        Mr
1299      Miss
1300      Miss
1301      Miss
1302       Mrs
1303      Miss
1304        Mr
1305       Mrs
1306        Mr
1307        Mr
1308    Master
Name: Title2, Length: 1309, dtype: object

In [12]:
data['Ticket']


0                A/5 21171
1                 PC 17599
2         STON/O2. 3101282
3                   113803
4                   373450
5                   330877
6                    17463
7                   349909
8                   347742
9                   237736
10                 PP 9549
11                  113783
12               A/5. 2151
13                  347082
14                  350406
15                  248706
16                  382652
17                  244373
18                  345763
19                    2649
20                  239865
21                  248698
22                  330923
23                  113788
24                  349909
25                  347077
26                    2631
27                   19950
28                  330959
29                  349216
               ...        
1279                364858
1280                349909
1281                 12749
1282              PC 17592
1283             C.A. 2673
1284            C.A. 30769
1

In [13]:
data['Ticket_info'] = data['Ticket'].apply(lambda x : x.replace(".","").replace("/","").strip().split(' ')[0] if not x.isdigit() else 'X')

In [14]:
data['Ticket_info'].unique()


array(['A5', 'PC', 'STONO2', 'X', 'PP', 'CA', 'SCParis', 'SCA4', 'A4',
       'SP', 'SOC', 'WC', 'SOTONOQ', 'WEP', 'STONO', 'C', 'SCPARIS', 'SOP',
       'Fa', 'LINE', 'FCC', 'SWPP', 'SCOW', 'PPP', 'SC', 'SCAH', 'AS',
       'SOPP', 'FC', 'SOTONO2', 'CASOTON', 'SCA3', 'STONOQ', 'AQ4', 'A',
       'LP', 'AQ3'], dtype=object)

In [15]:
data['Embarked'] = data['Embarked'].fillna('S')
data['Fare'] = data['Fare'].fillna(data['Fare'].mean())


In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 16 columns):
Age            1046 non-null float64
Cabin          295 non-null object
Embarked       1309 non-null object
Fare           1309 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
Family_Size    1309 non-null int64
Title1         1309 non-null object
Title2         1309 non-null object
Ticket_info    1309 non-null object
dtypes: float64(3), int64(5), object(8)
memory usage: 163.7+ KB


In [17]:
data.describe()

Unnamed: 0,Age,Fare,Parch,PassengerId,Pclass,SibSp,Survived,Family_Size
count,1046.0,1309.0,1309.0,1309.0,1309.0,1309.0,891.0,1309.0
mean,29.881138,33.295479,0.385027,655.0,2.294882,0.498854,0.383838,0.883881
std,14.413493,51.738879,0.86556,378.020061,0.837836,1.041658,0.486592,1.583639
min,0.17,0.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,21.0,7.8958,0.0,328.0,2.0,0.0,0.0,0.0
50%,28.0,14.4542,0.0,655.0,3.0,0.0,0.0,0.0
75%,39.0,31.275,0.0,982.0,3.0,1.0,1.0,1.0
max,80.0,512.3292,9.0,1309.0,3.0,8.0,1.0,10.0


In [18]:
data


Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Family_Size,Title1,Title2,Ticket_info
0,22.0,,S,7.2500,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171,1,Mr,Mr,A5
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599,1,Mrs,Mrs,PC
2,26.0,,S,7.9250,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282,0,Miss,Miss,STONO2
3,35.0,C123,S,53.1000,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803,1,Mrs,Mrs,X
4,35.0,,S,8.0500,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450,0,Mr,Mr,X
5,,,Q,8.4583,"Moran, Mr. James",0,6,3,male,0,0.0,330877,0,Mr,Mr,X
6,54.0,E46,S,51.8625,"McCarthy, Mr. Timothy J",0,7,1,male,0,0.0,17463,0,Mr,Mr,X
7,2.0,,S,21.0750,"Palsson, Master. Gosta Leonard",1,8,3,male,3,0.0,349909,4,Master,Master,X
8,27.0,,S,11.1333,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,9,3,female,0,1.0,347742,2,Mrs,Mrs,X
9,14.0,,C,30.0708,"Nasser, Mrs. Nicholas (Adele Achem)",0,10,2,female,1,1.0,237736,1,Mrs,Mrs,X


In [19]:
data["Cabin"] = data['Cabin'].apply(lambda x : str(x)[0] if not pd.isnull(x) else 'NoCabin')

In [20]:
data

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Family_Size,Title1,Title2,Ticket_info
0,22.0,NoCabin,S,7.2500,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171,1,Mr,Mr,A5
1,38.0,C,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599,1,Mrs,Mrs,PC
2,26.0,NoCabin,S,7.9250,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282,0,Miss,Miss,STONO2
3,35.0,C,S,53.1000,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803,1,Mrs,Mrs,X
4,35.0,NoCabin,S,8.0500,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450,0,Mr,Mr,X
5,,NoCabin,Q,8.4583,"Moran, Mr. James",0,6,3,male,0,0.0,330877,0,Mr,Mr,X
6,54.0,E,S,51.8625,"McCarthy, Mr. Timothy J",0,7,1,male,0,0.0,17463,0,Mr,Mr,X
7,2.0,NoCabin,S,21.0750,"Palsson, Master. Gosta Leonard",1,8,3,male,3,0.0,349909,4,Master,Master,X
8,27.0,NoCabin,S,11.1333,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,9,3,female,0,1.0,347742,2,Mrs,Mrs,X
9,14.0,NoCabin,C,30.0708,"Nasser, Mrs. Nicholas (Adele Achem)",0,10,2,female,1,1.0,237736,1,Mrs,Mrs,X


In [21]:
data['Sex'] = data['Sex'].astype('category').cat.codes
data['Embarked'] = data['Embarked'].astype('category').cat.codes
data['Pclass'] = data['Pclass'].astype('category').cat.codes
data['Title1'] = data['Title1'].astype('category').cat.codes
data['Title2'] = data['Title2'].astype('category').cat.codes
data['Cabin'] = data['Cabin'].astype('category').cat.codes
data['Ticket_info'] = data['Ticket_info'].astype('category').cat.codes

In [22]:
data

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Family_Size,Title1,Title2,Ticket_info
0,22.0,7,2,7.2500,"Braund, Mr. Owen Harris",0,1,2,1,1,0.0,A/5 21171,1,12,2,2
1,38.0,2,0,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,0,0,1,1.0,PC 17599,1,13,3,14
2,26.0,7,2,7.9250,"Heikkinen, Miss. Laina",0,3,2,0,0,1.0,STON/O2. 3101282,0,9,1,31
3,35.0,2,2,53.1000,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,0,0,1,1.0,113803,1,13,3,36
4,35.0,7,2,8.0500,"Allen, Mr. William Henry",0,5,2,1,0,0.0,373450,0,12,2,36
5,,7,1,8.4583,"Moran, Mr. James",0,6,2,1,0,0.0,330877,0,12,2,36
6,54.0,4,2,51.8625,"McCarthy, Mr. Timothy J",0,7,0,1,0,0.0,17463,0,12,2,36
7,2.0,7,2,21.0750,"Palsson, Master. Gosta Leonard",1,8,2,1,3,0.0,349909,4,8,0,36
8,27.0,7,2,11.1333,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,9,2,0,0,1.0,347742,2,13,3,36
9,14.0,7,0,30.0708,"Nasser, Mrs. Nicholas (Adele Achem)",0,10,1,0,1,1.0,237736,1,13,3,36


In [23]:
dataAgeNull = data[data["Age"].isnull()]
dataAgeNotNull = data[data["Age"].notnull()]
remove_outlier = dataAgeNotNull[(np.abs(dataAgeNotNull["Fare"]-dataAgeNotNull["Fare"].mean())>(4*dataAgeNotNull["Fare"].std()))|
                      (np.abs(dataAgeNotNull["Family_Size"]-dataAgeNotNull["Family_Size"].mean())>(4*dataAgeNotNull["Family_Size"].std()))                     
                     ]
rfModel_age = RandomForestRegressor(n_estimators=2000,random_state=42)
ageColumns = ['Embarked', 'Fare', 'Pclass', 'Sex', 'Family_Size', 'Title1', 'Title2','Cabin','Ticket_info']
rfModel_age.fit(remove_outlier[ageColumns], remove_outlier["Age"])

ageNullValues = rfModel_age.predict(X= dataAgeNull[ageColumns])
dataAgeNull.loc[:,"Age"] = ageNullValues
data = dataAgeNull.append(dataAgeNotNull)
data.reset_index(inplace=True, drop=True)

In [24]:
data

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Family_Size,Title1,Title2,Ticket_info
0,41.326267,7,1,8.4583,"Moran, Mr. James",0,6,2,1,0,0.0,330877,0,12,2,36
1,41.616486,7,2,13.0000,"Williams, Mr. Charles Eugene",0,18,1,1,0,1.0,244373,0,12,2,36
2,46.792625,7,0,7.2250,"Masselmani, Mrs. Fatima",0,20,2,0,0,1.0,2649,0,13,3,36
3,41.326267,7,0,7.2250,"Emir, Mr. Farred Chehab",0,27,2,1,0,0.0,2631,0,12,2,36
4,34.860886,7,1,7.8792,"O'Dwyer, Miss. Ellen ""Nellie""",0,29,2,0,0,1.0,330959,0,9,1,36
5,39.428653,7,2,7.8958,"Todoroff, Mr. Lalio",0,30,2,1,0,0.0,349216,0,12,2,36
6,49.286267,1,0,146.5208,"Spencer, Mrs. William Augustus (Marie Eugenie)",0,32,0,0,1,1.0,PC 17569,1,13,3,14
7,34.860886,7,1,7.7500,"Glynn, Miss. Mary Agatha",0,33,2,0,0,1.0,335677,0,9,1,36
8,41.326267,7,0,7.2292,"Mamee, Mr. Hanna",0,37,2,1,0,1.0,2677,0,12,2,36
9,41.326267,7,0,7.8958,"Kraeff, Mr. Theodor",0,43,2,1,0,0.0,349253,0,12,2,36


In [25]:
dataTrain = data[pd.notnull(data['Survived'])].sort_values(by=["PassengerId"])
dataTest = data[~pd.notnull(data['Survived'])].sort_values(by=["PassengerId"])

In [26]:
dataTrain = dataTrain[['Survived', 'Age', 'Embarked', 'Fare',  'Pclass', 'Sex', 'Family_Size', 'Title2','Ticket_info','Cabin']]
dataTest = dataTest[['Age', 'Embarked', 'Fare', 'Pclass', 'Sex', 'Family_Size', 'Title2','Ticket_info','Cabin']]

In [27]:
from sklearn.ensemble import RandomForestClassifier
 
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=2000,
                             min_samples_split=35,
                             min_samples_leaf=2,
                             oob_score=True,
                            max_depth = 9,
                             random_state=1,
                             n_jobs=-1) 

rf.fit(dataTrain.iloc[:, 1:], dataTrain.iloc[:, 0])
print("%.4f" % rf.oob_score_)

0.8260


In [None]:
pd.concat((pd.DataFrame(dataTrain.iloc[:, 1:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

In [30]:
param_test1= {'n_estimators':range(500,1001,100)}  
gsearch1= GridSearchCV(estimator = RandomForestClassifier(min_samples_split=50,  
                                 min_samples_leaf=2,max_features='sqrt' ,random_state=1),  
                       param_grid =param_test1, scoring='roc_auc',cv=5)  
gsearch1.fit(dataTrain.iloc[:, 1:], dataTrain.iloc[:, 0])  
gsearch1.grid_scores_,gsearch1.best_params_, gsearch1.best_score_ 



([mean: 0.87747, std: 0.02771, params: {'n_estimators': 500},
  mean: 0.87726, std: 0.02783, params: {'n_estimators': 600},
  mean: 0.87726, std: 0.02812, params: {'n_estimators': 700},
  mean: 0.87679, std: 0.02847, params: {'n_estimators': 800},
  mean: 0.87700, std: 0.02883, params: {'n_estimators': 900},
  mean: 0.87711, std: 0.02855, params: {'n_estimators': 1000}],
 {'n_estimators': 500},
 0.87747336085491334)

In [None]:
rf_res =  rf.predict(dataTest)
submit['Survived'] = rf_res
submit['Survived'] = submit['Survived'].astype(int)
submit.to_csv('submit.csv', index= False)

In [None]:
param_test2= {'max_depth':range(3,14,2), 'min_samples_split':range(50,201,20)}  
gsearch2= GridSearchCV(estimator = RandomForestClassifier(n_estimators= 500,  
                                 min_samples_leaf=2,max_features='sqrt' ,oob_score=True,random_state=1),  
   param_grid = param_test2,scoring='roc_auc',iid=False, cv=5)  
gsearch2.fit(dataTrain.iloc[:, 1:], dataTrain.iloc[:, 0])  
gsearch2.grid_scores_,gsearch2.best_params_, gsearch2.best_score_  

In [None]:
param_test3= {'min_samples_split':range(50,150,20), 'min_samples_leaf':range(2,40,5)}  
gsearch3= GridSearchCV(estimator = RandomForestClassifier(n_estimators= 500,max_depth=9,  
                                 max_features='sqrt' ,oob_score=True, random_state=1),  
   param_grid = param_test3,scoring='roc_auc',iid=False, cv=5)  
gsearch3.fit(dataTrain.iloc[:, 1:], dataTrain.iloc[:, 0])  
gsearch3.grid_scores_,gsearch2.best_params_, gsearch2.best_score_  

In [None]:
param_test4= {'max_features':range(3,20,2)}  
gsearch4= GridSearchCV(estimator = RandomForestClassifier(n_estimators= 500,max_depth=9, min_samples_split=50,  
                                 min_samples_leaf=2 ,oob_score=True, random_state=1),  
   param_grid = param_test4,scoring='roc_auc',iid=False, cv=5)  
gsearch4.fit(dataTrain.iloc[:, 1:], dataTrain.iloc[:, 0])  
gsearch4.grid_scores_,gsearch4.best_params_, gsearch4.best_score_  

In [None]:
num_folds = 7
num_repeats = 5

# Use a stratified k-fold for generating train-test splits
skf = StratifiedKFold(n_splits=num_folds, shuffle=True)

# Use a random forest classifier as second-level estimator

acc_scores = []
f1_scores = []

for i in range(num_repeats):
    for train_idx, test_idx in skf.split(X_train, y_train):
        X_train_cv = X_train[train_idx]
        X_test_cv = X_train[test_idx]

        y_train_cv = y_train[train_idx]
        y_test_cv = y_train[test_idx]

        
        clf.fit(X_train_cv, y_train_cv)
        y_pred_cv = clf.predict(X_test_cv)

        acc_scores.append(accuracy_score(y_test_cv, y_pred_cv))
        f1_scores.append(f1_score(y_test_cv, y_pred_cv))
    
acc_scores_mean = np.mean(acc_scores)
acc_scores_std = np.std(acc_scores)

f1_scores_mean = np.mean(f1_scores)
f1_scores_std = np.std(f1_scores)

print(f'CV summary for {num_repeats} repeats on {skf.n_splits} splits:')
print(f'accuracy score: {acc_scores_mean:.3f} +/- {acc_scores_std:.3f}')
print(f'f1 score:       {f1_scores_mean:.3f} +/- {f1_scores_std:.3f}')