# Titanic Predictions 

## Get the Data

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the training CSV into a DataFrame and preview its first rows.
train_csv_path = "/kaggle/input/titanic/train.csv"
train_data = pd.read_csv(train_csv_path)
train_data.head()

In [None]:
# Read the test CSV into a DataFrame and preview its first rows.
test_csv_path = "/kaggle/input/titanic/test.csv"
test_data = pd.read_csv(test_csv_path)
test_data.head()

## Explore the Data

In [None]:
# Print column names, non-null counts and dtypes of the training frame
# (a quick way to spot columns with missing values).
train_data.info()

In [None]:
# Same summary for the test frame, to compare its missing values with the training frame's.
test_data.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
train_data.describe()

In [None]:
# One 10-bin histogram per numeric column, drawn on a large (20x15) figure.
train_data.hist(bins=10, figsize=(20,15))

In [None]:
# Scatter plot of Age (x) against the Survived label (y).
train_data.plot(kind='scatter', x='Age', y='Survived')

### Looking for Correlations

In [None]:
# Correlation of every numeric column with Survived, strongest positive first.
# The frame still holds string columns at this point; since pandas 2.0
# DataFrame.corr() raises on them instead of silently dropping them, so the
# numeric/bool columns are selected explicitly (works on old and new pandas).
corr_matrix = train_data.select_dtypes(include=['number', 'bool']).corr()
corr_matrix['Survived'].sort_values(ascending=False)

In [None]:
# FamilySize = siblings/spouses + parents/children + the passenger themself,
# added to both frames so they keep the same columns.
for frame in (train_data, test_data):
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1


In [None]:
# Re-check the frame summary to confirm the new FamilySize column is present.
train_data.info()

In [None]:
# Recompute the correlations with Survived now that FamilySize exists.
# Only numeric/bool columns are correlated: since pandas 2.0 DataFrame.corr()
# raises on string columns instead of silently dropping them.
corr_matrix = train_data.select_dtypes(include=['number', 'bool']).corr()
corr_matrix['Survived'].sort_values(ascending=False)

### Fare


In [None]:
# Replace missing fares with the median fare of the same frame.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the selected Series: that chained in-place form
# emits a warning in pandas 2.x and leaves the DataFrame unchanged once
# Copy-on-Write is enabled (the default from pandas 3.0).
train_data['Fare'] = train_data['Fare'].fillna(train_data['Fare'].median())
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].median())


In [None]:
# Summary statistics again, after the Fare imputation above.
train_data.describe()

In [None]:
# 10-bin histogram of the training fares.
train_data.Fare.hist(bins=10)


### Sex

In [None]:
# Survival labels of the female passengers; their average is the fraction that survived.
is_female = train_data['Sex'] == 'female'
women = train_data.loc[is_female, 'Survived']
rate_women = sum(women) / len(women)

print("% of women who survived:", rate_women)

In [None]:
# Survival labels of the male passengers; their average is the fraction that survived.
is_male = train_data['Sex'] == 'male'
men = train_data.loc[is_male, 'Survived']
rate_men = sum(men) / len(men)

print("% of men who survived:", rate_men)

In [None]:
# Encode Sex as a single binary indicator: 1 = female, 0 = male.
# pd.get_dummies() returns one column per category, and assigning that
# two-column frame to the single Sex column raises a ValueError in current
# pandas. NOTE(review): older pandas versions appear to have silently kept only
# the first dummy column ('female'); this mapping reproduces that encoding.
sex_codes = {'female': 1, 'male': 0}
train_data['Sex'] = train_data['Sex'].map(sex_codes)
test_data['Sex'] = test_data['Sex'].map(sex_codes)

### Age


In [None]:
# Replace missing ages with the median age of the same frame.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the selected Series: that chained in-place form
# emits a warning in pandas 2.x and leaves the DataFrame unchanged once
# Copy-on-Write is enabled (the default from pandas 3.0).
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].median())
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].median())


In [None]:
# Two derived features, added identically to both frames:
#   IsAlone   - 1 when the passenger travels without family (FamilySize == 1), else 0
#   Age*Class - product of the (still continuous) age and the ticket class
for frame in (train_data, test_data):
    frame['IsAlone'] = (frame['FamilySize'] == 1).astype('int64')

for frame in (train_data, test_data):
    frame['Age*Class'] = frame['Age'] * frame['Pclass']

In [None]:
# Fraction of survivors among passengers older than 45.
over_45 = train_data['Age'] > 45
old = train_data.loc[over_45, 'Survived']
rate_old = sum(old) / len(old)

print("% of old people who survived:", rate_old)

In [None]:
# Fraction of survivors among passengers aged 15 to 45, both ends included.
aged_15_to_45 = train_data['Age'].between(15, 45)
mid_age = train_data.loc[aged_15_to_45, 'Survived']
rate_mid = sum(mid_age) / len(mid_age)

print("% of middle aged people who survived:", rate_mid)

In [None]:
# Fraction of survivors among passengers younger than 15.
under_15 = train_data['Age'] < 15
young = train_data.loc[under_15, 'Survived']
rate_young = sum(young) / len(young)

print("% of young people who survived:", rate_young)

In [None]:
# Replace the continuous age with an ordinal band code 0-4.
# pd.cut intervals are right-closed: (0, 10], (10, 25], (25, 50], (50, 80], (80, inf).
age_bins = [0., 10., 25., 50, 80, np.inf]
age_labels = [0, 1, 2, 3, 4]
train_data['Age'] = pd.cut(train_data['Age'], bins=age_bins, labels=age_labels).astype(int)
test_data['Age'] = pd.cut(test_data['Age'], bins=age_bins, labels=age_labels).astype(int)


In [None]:
# Display the Age column to inspect the band codes assigned above.
train_data.Age

In [None]:
# Missing values: count the nulls in each column, then show the rows
# that have no port of embarkation.
null_counts = train_data.isna().sum()
print(null_counts)

train_data.loc[train_data['Embarked'].isna()]

In [None]:
# Silence pandas' chained-assignment warning for the rest of the session (default is 'warn').
pd.options.mode.chained_assignment = None
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

# LabelEncoder cannot handle missing values, so they become their own 'None' category first.
train_data['Embarked'] = train_data['Embarked'].fillna('None')
test_data['Embarked'] = test_data['Embarked'].fillna('None')

# Learn the integer codes from the training ports only, then apply the same
# mapping to both frames so a given port gets the same code everywhere.
label_encoder = LabelEncoder()
label_encoder.fit(train_data['Embarked'])
train_data['Embarked'] = label_encoder.transform(train_data['Embarked'])
test_data['Embarked'] = label_encoder.transform(test_data['Embarked'])



In [None]:
# Final look at the correlations with Survived, now including the engineered
# and encoded columns. Only numeric/bool columns are correlated: since pandas 2.0
# DataFrame.corr() raises on the remaining string columns instead of silently
# dropping them.
corr_matrix = train_data.select_dtypes(include=['number', 'bool']).corr()
corr_matrix['Survived'].sort_values(ascending=False)

In [None]:
# Metric optimised by the hyper-parameter search below.
scoring_method = "f1"

# Columns fed to the model; the same list is used for both frames so that the
# training and prediction matrices line up.
features = [
    "Pclass",
    "Sex",
    "Fare",
    "SibSp",
    "Parch",
    "FamilySize",
    "Embarked",
    "IsAlone",
]

# Target vector and feature matrices.
y = train_data["Survived"]
X = train_data[features]
X_test = test_data[features]

print(X)


# Modelling

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# GridSearchCV was used below without ever being imported (NameError).
from sklearn.model_selection import GridSearchCV

# Fixed seed so the search result and the submission are reproducible.
rf_model = RandomForestClassifier(random_state=42)

# Hyper-parameter grid explored exhaustively by the search.
rf_params = {
    'bootstrap': [True, False],
    'max_depth': [10, None],
    # 'auto' has been removed from scikit-learn and was only an alias of
    # 'sqrt' for classifiers, so the old ['auto', 'sqrt'] grid tried the same
    # setting twice. 'log2' makes the second option a genuinely different one.
    'max_features': ['sqrt', 'log2'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [100],
}

# 8-fold cross-validated search scored with `scoring_method`, run on 4 workers.
rf_gs = GridSearchCV(rf_model, rf_params, scoring=scoring_method, cv=8, n_jobs=4)

rf_gs.fit(X, y)
print(rf_gs.best_params_)
print(rf_gs.best_score_)

In [None]:
from sklearn.model_selection import cross_val_score

# The estimator passed in is the grid search itself, so each of the 5 outer
# folds re-runs the whole hyper-parameter search on its training part and is
# scored on the held-out part.
cross_val_score(estimator=rf_gs, X=X, y=y, cv=5)

In [None]:
# Predict with the fitted grid search. The original called `random_forest`,
# a name that is never defined in this notebook (NameError). With its default
# refit=True, GridSearchCV retrains the best parameter combination on all of
# (X, y), and rf_gs.predict() delegates to that refitted model.
predictions = rf_gs.predict(X_test)

# Kaggle submission format: one row per test passenger with the predicted label.
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('my_submission.csv', index=False)
print("Your pipeline submission was successfully saved!")