In [339]:
import numpy as np  
import pandas as pd 
import os


data = []

for dirpath, dirnames, filenames in os.walk('data\\'):
    for filename in filenames:
        data.append(os.path.join('data', filename))


gender_submission_path, test_path, train_path = data

train = pd.read_csv(train_path)

test = pd.read_csv(test_path)


In [340]:
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


#### Data check
Both test and train contain missing values 

In [341]:
train.isnull().sum()
'''
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
'''
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

#### Taking care of lost or wrong data

As of right now im only filling age columns with its median, ignoring cabin column

In [342]:
from sklearn.impute import SimpleImputer

def fill_nas(df):
    median = df['Age'].median()
    return df['Age'].fillna(median)

In [343]:
train['Age'] = fill_nas(train)
test['Age'] = fill_nas(test)

#### Feature engineering

In [344]:
def feature_engineering(df):
    df['FamSize'] = df['SibSp'] + df['Parch']
    df['IsAlone'] = (df['FamSize'] >= 1).astype(int)
    df['FarePerPerson'] = np.where(df['FamSize'] > 1, df['Fare']/df['FamSize'],  df['Fare'])
    df['NameLength'] = df['Name'].apply(len)

In [345]:
feature_engineering(train)
feature_engineering(test)

#### Encoding

In [346]:
def encode(df):
    return pd.get_dummies(df, columns=['Sex', 'Embarked'], dtype=int)

In [347]:
train = encode(train)
test = encode(test)

#### Choosing features

In [348]:
features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'FamSize', 'IsAlone', 'FarePerPerson',
            'NameLength', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S']

In [349]:
X = train[features]
y = train['Survived']

X

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,FamSize,IsAlone,FarePerPerson,NameLength,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,22.0,1,0,7.2500,1,1,7.250000,23,0,1,0,0,1
1,1,38.0,1,0,71.2833,1,1,71.283300,51,1,0,1,0,0
2,3,26.0,0,0,7.9250,0,0,7.925000,22,1,0,0,0,1
3,1,35.0,1,0,53.1000,1,1,53.100000,44,1,0,0,0,1
4,3,35.0,0,0,8.0500,0,0,8.050000,24,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,2,27.0,0,0,13.0000,0,0,13.000000,21,0,1,0,0,1
887,1,19.0,0,0,30.0000,0,0,30.000000,28,1,0,0,0,1
888,3,28.0,1,2,23.4500,3,1,7.816667,40,1,0,0,0,1
889,1,26.0,0,0,30.0000,0,0,30.000000,21,0,1,1,0,0


In [350]:
from sklearn.model_selection import train_test_split

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

#### Model choosing and validation




In [351]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix


'''param_grid = {
    'n_estimators': [100, 200],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_depth': [2, 4, 6, 8, 10],
    'max_features': [2, 4, 6, 8, 10, 12],
    'max_leaf_nodes': [5, 10, 15, 20, 25],
    'max_samples': [50, 100, 250, 500]
}

titanic_model = RandomForestClassifier(random_state=1)

grid_search = GridSearchCV(estimator=titanic_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

grid_search.fit(train_X, train_y)

print(f'Best accuracy score:\n {grid_search.best_score_}')
print(f'Best params:\n {grid_search.best_params_}')'''


titanic_model = RandomForestClassifier(max_depth=10, max_features=8, max_leaf_nodes=25, max_samples=500, 
                                       n_estimators=100, min_samples_split=2, min_samples_leaf=2, random_state=1)

titanic_model.fit(train_X, train_y)

prediction_for_val = titanic_model.predict(val_X)

cv_score = cross_val_score(titanic_model, train_X, train_y, cv=5)
titanic_val_accuracy_score = accuracy_score(y_pred=prediction_for_val, y_true=val_y)
titanic_val_confusion_matrix = confusion_matrix(y_pred=prediction_for_val, y_true=val_y)

print(f'Accuracy for random forest classifier:\n {titanic_val_accuracy_score}')
print(f'Confusion Matrix for random forest classifier:\n {titanic_val_confusion_matrix}')
print(f'CV score for random forest classifier:\n {cv_score}')

#0.8071748878923767

Accuracy for random forest classifier:
 0.7937219730941704
Confusion Matrix for random forest classifier:
 [[118  10]
 [ 36  59]]
CV score for random forest classifier:
 [0.79104478 0.8358209  0.86567164 0.84962406 0.84210526]


#### Actual prediction

In [352]:
prediction = pd.DataFrame(test['PassengerId'])

features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'FamSize', 'IsAlone', 'FarePerPerson',
            'NameLength', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S']

test = test[features]

In [353]:
prediction['Survived'] = titanic_model.predict(test)

prediction

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


### exporting results into a CSV file

In [406]:
def export_results(prediciton_df):
    
    data = []

    for dirpath, dirnames, filenames in os.walk('submissions\\'):
        for filename in filenames:
            data.append(filename.split('_')[1])
    
    files_numbers = [int(number.split('.')[0]) for number in data]
    
    new_sub_name = ''.join(['submissions\\submission_', str(max(files_numbers) + 1), '.csv'])

    prediciton_df.to_csv(path_or_buf=new_sub_name, columns=['PassengerId', 'Survived'], index=False)
    
    return new_sub_name

In [407]:
sub_list = export_results(prediction)

sub_list

'submissions\\submission_12.csv'

#### Comparing submissions

In [408]:
def get_sub_list():
    
    sub_list = []

    for dirpath, dirnames, filenames in os.walk('submissions\\'):
        for filename in filenames:
            sub_list.append(os.path.join('submissions\\', filename))
    
    return sub_list

In [409]:
get_sub_list()

['submissions\\submission_1.csv',
 'submissions\\submission_10.csv',
 'submissions\\submission_11.csv',
 'submissions\\submission_12.csv',
 'submissions\\submission_2.csv',
 'submissions\\submission_3.csv',
 'submissions\\submission_4.csv',
 'submissions\\submission_5.csv',
 'submissions\\submission_6.csv',
 'submissions\\submission_7.csv',
 'submissions\\submission_8.csv',
 'submissions\\submission_9.csv']

get a submission from the list above and use it in `compare_submissions` function

In [412]:
def comapre_submissions(sub_a, sub_b):
    sub_a = pd.read_csv(sub_a)
    sub_b = pd.read_csv(sub_b)
    
    different_predictions = 0
    
    for a, b in zip(sub_a['Survived'], sub_b['Survived']):
        if a != b: different_predictions += 1
        
    return different_predictions
    
    
comapre_submissions('submissions\\submission_12.csv', 'submissions\\submission_2.csv')

76