<a href="https://colab.research.google.com/github/jefersondaniel/machine-learning-studies/blob/master/examples/titanic/titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pandas scikit-learn kaggle



In [6]:
import io, os
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
from google.colab import auth

auth.authenticate_user()

done = False
filename = "/root/.kaggle/kaggle.json"
os.makedirs(os.path.dirname(filename), exist_ok=True)

drive_service = build('drive', 'v3')
results = drive_service.files().list(q="name = 'kaggle.json'", fields="files(id)").execute()
request = drive_service.files().get_media(fileId=results['files'][0]['id'])
downloader = MediaIoBaseDownload(io.FileIO(filename, 'wb'), request)

while not done:
    status, done = downloader.next_chunk()
    print("Download {}%.".format(int(status.progress() * 100)))

os.chmod(filename, 600)

Download 100%.


In [9]:
!kaggle competitions download -c titanic

Downloading train.csv to /content
  0% 0.00/59.8k [00:00<?, ?B/s]
100% 59.8k/59.8k [00:00<00:00, 49.4MB/s]
Downloading test.csv to /content
  0% 0.00/28.0k [00:00<?, ?B/s]
100% 28.0k/28.0k [00:00<00:00, 26.1MB/s]
Downloading gender_submission.csv to /content
  0% 0.00/3.18k [00:00<?, ?B/s]
100% 3.18k/3.18k [00:00<00:00, 6.73MB/s]


In [0]:
import pandas as pd

In [12]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
dataframes = [train_df, test_df]
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [13]:
train_df[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


In [0]:
# Convert Categorical Features

for df in dataframes:
    df['Family'] = df['SibSp'] + df['Parch'] + 1
    df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).fillna(0).astype(int)
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1}).astype(int)

In [0]:
# Create Title Feature

for df in dataframes:
    df['Title'] = df.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')
    df['Title'] = df['Title'].map({"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5})
    df['Title'] = df['Title'].fillna(0).astype(int)


In [0]:
# Convert Age Feature

pd.cut(df['Age'], 5).unique()

for df in dataframes:
    df.loc[ df['Age'] <= 16, 'Age'] = 0
    df.loc[(df['Age'] > 16) & (df['Age'] <= 32), 'Age'] = 1
    df.loc[(df['Age'] > 32) & (df['Age'] <= 48), 'Age'] = 2
    df.loc[(df['Age'] > 48) & (df['Age'] <= 64), 'Age'] = 3
    df.loc[ df['Age'] > 64, 'Age'] = 4
    df['Age'] = df['Age'].fillna(df['Age'].mode()[0]).astype(int)

In [0]:
# Convert Fare Feature

pd.cut(df['Fare'], 3).unique()

for df in dataframes:
    df.loc[ df['Fare'] <= 170, 'Fare'] = 0
    df.loc[(df['Fare'] > 170) & (df['Fare'] <= 340), 'Fare'] = 1
    df.loc[(df['Fare'] > 340), 'Fare'] = 3
    df['Fare'] = df['Fare'].fillna(df['Fare'].mode()[0]).astype(int)
    

In [0]:
# Drop Not Used Features

for df in dataframes:
    df.drop(['Ticket', 'Cabin', 'SibSp', 'Parch', 'Name'], axis=1, inplace=True)

In [23]:
# Model Training

from sklearn.ensemble import RandomForestClassifier

X_train = train_df.drop(["Survived", "PassengerId"], axis=1)
Y_train = train_df["Survived"]
X_test  = test_df.drop("PassengerId", axis=1).copy()

model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [24]:
# Evaluate on training tests

from sklearn.metrics import accuracy_score

Y_pred = model.predict(X_train)

round(accuracy_score(Y_train, Y_pred) * 100, 2)

86.42

In [25]:
# Submit results

submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': model.predict(X_test)
})

display(submission.head())

submission.to_csv('submission.csv', index=False)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [28]:
!kaggle competitions submit titanic -f submission.csv -m "random forest"

100% 2.77k/2.77k [00:02<00:00, 1.28kB/s]
Successfully submitted to Titanic: Machine Learning from Disaster