In [361]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, ComplementNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier

In [362]:
def ml_fit_eval(X_train,y_train,X_test,y_test, model,parameters,modelname):
    
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    model = GridSearchCV(model, parameters, cv=10,verbose=2)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    print(modelname ,'Optimal Parameters :       ', model.cv_results_ ['params'] [np.argmin(model.cv_results_['rank_test_score'])])
    print(modelname ,'Cross Validation Accuracy: ', round(model.cv_results_ ['mean_test_score'] [np.argmin(model.cv_results_['rank_test_score'])]*100,2), '%', '\n')

    print(modelname ,'Accuracy:  ', round(accuracy_score(y_test, y_pred) * 100,4) , '%')
    print(modelname ,'Precision: ', round(precision_score(y_test,y_pred) *100,4), '%')
    print(modelname ,'Recall:    ', round(recall_score(y_test,y_pred) *100,4), '%')      
    print(modelname ,'F1-Score:  ', round(f1_score(y_test,y_pred),4), '\n')  

    sns.heatmap(confusion_matrix(y_test,y_pred), annot=True, annot_kws={'fontsize':14,'fontweight':'bold'},fmt='d')
    plt.title(modelname +' Confusion Matrix')
    plt.xlabel('Prediction')
    plt.ylabel('Reality')
    plt.xticks([0.5,1.5], labels=['Dead', 'Survived'])
    plt.yticks([0.5,1.5], labels=['Dead', 'Survived'], va='center')
    plt.show()

In [363]:
cwd = os.getcwd()
cwd

'/Users/joaoeira/Downloads'

In [364]:
datadir = '../Documents/GitHub/kaggle-titanic/titanic/'

trainpath = datadir + 'train.csv'
testpath = datadir + 'test.csv'
#fullpath = datadir + 'fullnamelist.csv'

In [365]:
train = pd.read_csv(trainpath)
test = pd.read_csv(testpath)

In [366]:
train['Deck']=train.Cabin.str.slice(0, 1)
train.loc[train.Deck.isna(),'Deck'] = 'U'

In [367]:
train.Sex = train.Sex.replace(['female','male'],[0,1])
train.Embarked = train.Embarked.replace(['S','C','Q'],[0,1,2])
train.Embarked = train.Embarked.fillna('S')
train.Cabin.fillna('U', inplace=True)
train['Deck'] = train.Cabin.str.slice(0,1)
train.Deck = train.Deck.replace(['U','C','E','G','D','A','B','F','T'],[0,1,2,3,4,5,6,7,8])

In [368]:
train.Deck.unique()

array([0, 1, 2, 3, 4, 5, 6, 7, 8])

In [369]:
titles = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

train['Title'] = train.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
train['Title'] = train['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr',\
                                            'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
train['Title'] = train['Title'].replace('Mlle', 'Miss')
train['Title'] = train['Title'].replace('Ms', 'Miss')
train['Title'] = train['Title'].replace('Mme', 'Mrs')
 
train['Title'] = train['Title'].map(titles)
train['Title'] = train['Title'].fillna(0)

In [370]:
for value in train.Title.unique():
    df = train[train.Title==value]
    mean = df['Age'].mean()
    std = df['Age'].std()
    is_null = df['Age'].isnull().sum()
    
    # compute random numbers between the mean, std and is_null
    rand_age = np.random.randint(mean - std, mean + std, size = is_null)

    # fill NaN values in Age column with random values generated
    age_slice = df['Age'].copy()
    age_slice[np.isnan(age_slice)] = rand_age
    df['Age'] = age_slice
    df['Age'] = df['Age'].astype(int)
    train.loc[train.Title==value,'Age'] = df.Age

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [371]:
test.Sex = test.Sex.replace(['female','male'],[0,1])
test.Embarked = test.Embarked.replace(['S','C','Q'],[0,1,2])
test = test[test.Embarked.notna()]
test.Cabin.fillna('U', inplace=True)
test['Deck'] = test.Cabin.str.slice(0,1)
test.Deck = test.Deck.replace(['U','C','E','G','D','A','B','F','T'],[0,1,2,3,4,5,6,7,8])

In [372]:
test['Title'] = test.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
test['Title'] = test['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr',\
                                            'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
test['Title'] = test['Title'].replace('Mlle', 'Miss')
test['Title'] = test['Title'].replace('Ms', 'Miss')
test['Title'] = test['Title'].replace('Mme', 'Mrs')
 
test['Title'] = test['Title'].map(titles)
test['Title'] = test['Title'].fillna(0)

In [373]:
for value in test.Title.unique():
    df = test[test.Title==value]
    mean = df['Age'].mean()
    std = df['Age'].std()
    is_null = df['Age'].isnull().sum()
    
    # compute random numbers between the mean, std and is_null
    rand_age = np.random.randint(mean - std, mean + std, size = is_null)

    # fill NaN values in Age column with random values generated
    age_slice = df['Age'].copy()
    age_slice[np.isnan(age_slice)] = rand_age
    df['Age'] = age_slice
    df['Age'] = df['Age'].astype(int)
    test.loc[test.Title==value,'Age'] = df.Age

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [390]:
test.isna().sum()

PassengerId        0
Pclass             0
Name               0
Sex                0
Age                0
SibSp              0
Parch              0
Ticket             0
Fare               0
Cabin              0
Embarked           0
Deck               0
Title              0
relatives          0
not_alone          0
Age_Class          0
Fare_Per_Person    0
Survived           0
dtype: int64

In [None]:
med_fare = train.groupby(['Pclass', 'Parch', 'SibSp']).Fare.median()[3][0][0]
# Filling the missing value in Fare with the median Fare of 3rd class alone passenger
train.Fare = train.Fare.fillna(med_fare)

In [None]:
# Passenger in the T deck is changed to A
idx = train[train.Deck == 'T'].index
train.loc[idx, 'Deck'] = 'A'

train['Deck'] = train['Deck'].replace(['A', 'B', 'C'], 'ABC')
train['Deck'] = train['Deck'].replace(['D', 'E'], 'DE')
train['Deck'] = train['Deck'].replace(['F', 'G'], 'FG')

train['Deck'].value_counts()

In [None]:
test['Deck'] = test['Deck'].replace(['A', 'B', 'C'], 'ABC')
test['Deck'] = test['Deck'].replace(['D', 'E'], 'DE')
test['Deck'] = test['Deck'].replace(['F', 'G'], 'FG')

test['Deck'].value_counts()

In [374]:
data = [train, test]
for dataset in data:
    dataset['relatives'] = dataset['SibSp'] + dataset['Parch']
    dataset.loc[dataset['relatives'] > 0, 'not_alone'] = 0
    dataset.loc[dataset['relatives'] == 0, 'not_alone'] = 1
    dataset['not_alone'] = dataset['not_alone'].astype(int)

    dataset['Fare'] = dataset['Fare'].fillna(0)
    dataset['Fare'] = dataset['Fare'].astype(int)
    
    dataset['Age'] = dataset['Age'].astype(int)
    dataset.loc[ dataset['Age'] <= 11, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 11) & (dataset['Age'] <= 18), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 18) & (dataset['Age'] <= 22), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 22) & (dataset['Age'] <= 27), 'Age'] = 3
    dataset.loc[(dataset['Age'] > 27) & (dataset['Age'] <= 33), 'Age'] = 4
    dataset.loc[(dataset['Age'] > 33) & (dataset['Age'] <= 40), 'Age'] = 5
    dataset.loc[(dataset['Age'] > 40) & (dataset['Age'] <= 66), 'Age'] = 6
    dataset.loc[ dataset['Age'] > 66, 'Age'] = 6

    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[(dataset['Fare'] > 31) & (dataset['Fare'] <= 99), 'Fare']   = 3
    dataset.loc[(dataset['Fare'] > 99) & (dataset['Fare'] <= 250), 'Fare']   = 4
    dataset.loc[ dataset['Fare'] > 250, 'Fare'] = 5
    dataset['Fare'] = dataset['Fare'].astype(int)
    
    dataset['Age_Class']= dataset['Age']* dataset['Pclass']
    
    dataset['Fare_Per_Person'] = dataset['Fare']/(dataset['relatives']+1)
    dataset['Fare_Per_Person'] = dataset['Fare_Per_Person'].astype(int)
    
    dataset['Ticket_Frequency'] = dataset.groupby('Ticket')['Ticket'].transform('count')
    
    dataset['Is_Married'] = 0
    dataset['Is_Married'].loc[dataset['Title'] == 'Mrs'] = 1

In [375]:
X_train = train.loc[:,['Pclass','Sex','SibSp','Parch','Fare','Embarked', 'Age','Title','Deck','relatives','not_alone','Age_Class','Fare_Per_Person']]
y_train = train['Survived']

X_test = test.loc[:,['Pclass','Sex','SibSp','Parch','Fare','Embarked', 'Age', 'Title','Deck','relatives','not_alone','Age_Class','Fare_Per_Person']]

In [376]:
models = {('RForest',RandomForestClassifier()):{'random_state':[0],'max_depth':[None],'max_features':[None,5],
                                                'ccp_alpha':[0],
                                                'warm_start':[True],
                                                'n_estimators': [300],
                                                'max_samples': [100]}}

for (modelname,model), parameters in models.items():
    ml_fit_eval(X_train,y_train,X_test,y_test, model,parameters,modelname)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 10 folds for each of 2 candidates, totalling 20 fits
[CV] ccp_alpha=0, max_depth=None, max_features=None, max_samples=100, n_estimators=300, random_state=0, warm_start=True 
[CV]  ccp_alpha=0, max_depth=None, max_features=None, max_samples=100, n_estimators=300, random_state=0, warm_start=True, total=   0.5s
[CV] ccp_alpha=0, max_depth=None, max_features=None, max_samples=100, n_estimators=300, random_state=0, warm_start=True 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


[CV]  ccp_alpha=0, max_depth=None, max_features=None, max_samples=100, n_estimators=300, random_state=0, warm_start=True, total=   0.9s
[CV] ccp_alpha=0, max_depth=None, max_features=None, max_samples=100, n_estimators=300, random_state=0, warm_start=True 
[CV]  ccp_alpha=0, max_depth=None, max_features=None, max_samples=100, n_estimators=300, random_state=0, warm_start=True, total=   0.6s
[CV] ccp_alpha=0, max_depth=None, max_features=None, max_samples=100, n_estimators=300, random_state=0, warm_start=True 
[CV]  ccp_alpha=0, max_depth=None, max_features=None, max_samples=100, n_estimators=300, random_state=0, warm_start=True, total=   0.5s
[CV] ccp_alpha=0, max_depth=None, max_features=None, max_samples=100, n_estimators=300, random_state=0, warm_start=True 
[CV]  ccp_alpha=0, max_depth=None, max_features=None, max_samples=100, n_estimators=300, random_state=0, warm_start=True, total=   0.8s
[CV] ccp_alpha=0, max_depth=None, max_features=None, max_samples=100, n_estimators=300, rando

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:   12.9s finished


RForest Optimal Parameters :        {'ccp_alpha': 0, 'max_depth': None, 'max_features': None, 'max_samples': 100, 'n_estimators': 300, 'random_state': 0, 'warm_start': True}
RForest Cross Validation Accuracy:  83.58 % 



ValueError: Found input variables with inconsistent numbers of samples: [251, 418]

In [388]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

rfc = RandomForestClassifier(oob_score=True,random_state=10, max_depth = None, max_features = 5, ccp_alpha = 0,
                             warm_start = True, n_estimators = 300, max_samples=100).fit(X_train,y_train)

rfc.score(X_train,y_train)

0.8627671541057368

In [389]:
print("oob score:", round(rfc.oob_score_, 4)*100, "%")

oob score: 83.69 %


In [382]:
y_pred = logit.predict(X_test)

In [383]:
y_pred.shape

(418,)

In [384]:
test['Survived']=y_pred
predictions = test.loc[:,['PassengerId','Survived']]
predictions.to_csv('pred.csv',index=False)

In [None]:
predictions