In [0]:
# linear algebra
import numpy as np 
# data processing
import pandas as pd 
# data visualization
import seaborn as sns
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import style
# Algorithms
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier

In [2]:
test_df = pd.read_csv("test.csv")
train_df = pd.read_csv("train.csv")
train_df.info()
train_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 668 entries, 0 to 667
Data columns (total 14 columns):
pclass         668 non-null int64
sex            668 non-null object
age            539 non-null float64
sibsp          668 non-null int64
parch          668 non-null int64
fare           668 non-null float64
embarked       666 non-null object
class          668 non-null object
who            668 non-null object
adult_male     668 non-null bool
deck           149 non-null object
embark_town    666 non-null object
alive          668 non-null object
alone          668 non-null bool
dtypes: bool(2), float64(2), int64(3), object(7)
memory usage: 64.1+ KB


Unnamed: 0,pclass,age,sibsp,parch,fare
count,668.0,539.0,668.0,668.0,668.0
mean,2.323353,29.537106,0.508982,0.387725,31.938678
std,0.837784,14.554157,1.103997,0.823729,48.989578
min,1.0,0.42,0.0,0.0,0.0
25%,2.0,20.0,0.0,0.0,7.8958
50%,3.0,28.0,0.0,0.0,14.4583
75%,3.0,39.0,1.0,0.0,31.275
max,3.0,80.0,8.0,6.0,512.3292


In [3]:
total = train_df.isnull().sum().sort_values(ascending=False)
percent_1 = train_df.isnull().sum()/train_df.isnull().count()*100
percent_2 = (round(percent_1, 1)).sort_values(ascending=False)
missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
missing_data.head(5)

Unnamed: 0,Total,%
deck,519,77.7
age,129,19.3
embark_town,2,0.3
embarked,2,0.3
alone,0,0.0


In [4]:
train_df.columns.values

array(['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked',
       'class', 'who', 'adult_male', 'deck', 'embark_town', 'alive',
       'alone'], dtype=object)

Preprocessing

In [0]:
train_df = train_df.drop(['embark_town'], axis=1)
train_df = train_df.drop(['parch'], axis=1)
train_df = train_df.drop(['deck'], axis=1)
train_df = train_df.drop(['class'], axis=1)
test_df = test_df.drop(['embark_town'], axis=1)
test_df = test_df.drop(['parch'], axis=1)
test_df = test_df.drop(['deck'], axis=1)
test_df = test_df.drop(['class'], axis=1)

data = [train_df, test_df]

Converting Values

In [0]:
alive = {"yes": 1, "no": 0}
alone = {True: 1, False: 0}
adult_male = {True: 1, False: 0}
who = {"man": 1, "woman": 2, "child": 0}
for dataset in data:
  dataset['alive'] = dataset['alive'].map(alive)
  dataset['alone'] = dataset['alone'].map(alone)
  dataset['adult_male'] = dataset['adult_male'].map(alone)
  dataset['who'] = dataset['who'].map(who)

In [7]:
common_value = 'S'
for dataset in data:
    dataset['embarked'] = dataset['embarked'].fillna(common_value)
train_df['embarked'].describe()

count     668
unique      3
top         S
freq      479
Name: embarked, dtype: object

In [0]:
for dataset in data:
    dataset['age'] = dataset['age'].fillna(0)
    dataset['fare'] = dataset['fare'].fillna(0)
    dataset['fare'] = dataset['fare'].astype(int)

In [0]:
genders = {"male": 0, "female": 1}
for dataset in data:
    dataset['sex'] = dataset['sex'].map(genders)

In [0]:
ports = {"S": 0, "C": 1, "Q": 2}
data = [train_df, test_df]

for dataset in data:
    dataset['embarked'] = dataset['embarked'].map(ports)

In [11]:
data = [train_df, test_df]
for dataset in data:
    dataset['age'] = dataset['age'].astype(int)
    dataset.loc[ dataset['age'] <= 11, 'age'] = 0
    dataset.loc[(dataset['age'] > 11) & (dataset['age'] <= 18), 'age'] = 1
    dataset.loc[(dataset['age'] > 18) & (dataset['age'] <= 22), 'age'] = 2
    dataset.loc[(dataset['age'] > 22) & (dataset['age'] <= 27), 'age'] = 3
    dataset.loc[(dataset['age'] > 27) & (dataset['age'] <= 33), 'age'] = 4
    dataset.loc[(dataset['age'] > 33) & (dataset['age'] <= 40), 'age'] = 5
    dataset.loc[(dataset['age'] > 40) & (dataset['age'] <= 66), 'age'] = 6
    dataset.loc[ dataset['age'] > 66, 'age'] = 6

# let's see how it's distributed 
train_df['age'].value_counts()

0    182
6    113
4     93
3     78
5     74
2     71
1     57
Name: age, dtype: int64

In [0]:
for dataset in data:
    dataset.loc[ dataset['fare'] <= 7.91, 'fare'] = 0
    dataset.loc[(dataset['fare'] > 7.91) & (dataset['fare'] <= 14.454), 'fare'] = 1
    dataset.loc[(dataset['fare'] > 14.454) & (dataset['fare'] <= 31), 'fare']   = 2
    dataset.loc[(dataset['fare'] > 31) & (dataset['fare'] <= 99), 'fare']   = 3
    dataset.loc[(dataset['fare'] > 99) & (dataset['fare'] <= 250), 'fare']   = 4
    dataset.loc[ dataset['fare'] > 250, 'fare'] = 5
    dataset['fare'] = dataset['fare'].astype(int)

In [13]:
test_df.head(200)

Unnamed: 0,pclass,sex,age,sibsp,fare,embarked,who,adult_male,alive,alone
0,3,0,2,0,0,1,1,1,1,1
1,2,0,0,1,1,0,0,0,1,0
2,3,1,0,3,2,0,0,0,0,0
3,3,1,2,0,1,0,2,0,0,1
4,3,1,4,0,1,0,2,0,1,1
...,...,...,...,...,...,...,...,...,...,...
195,1,1,5,0,4,0,2,0,1,1
196,1,1,5,1,4,0,2,0,1,0
197,2,0,5,0,1,0,1,1,0,1
198,2,0,2,1,1,0,1,1,0,0


In [0]:
X_train = train_df.drop("alive", axis=1)
X_test = test_df.drop("alive", axis = 1)
Y_train = train_df["alive"]

In [15]:
test_df.head()

Unnamed: 0,pclass,sex,age,sibsp,fare,embarked,who,adult_male,alive,alone
0,3,0,2,0,0,1,1,1,1,1
1,2,0,0,1,1,0,0,0,1,0
2,3,1,0,3,2,0,0,0,0,0
3,3,1,2,0,1,0,2,0,0,1
4,3,1,4,0,1,0,2,0,1,1


In [16]:
sgd = linear_model.SGDClassifier(max_iter=5, tol=None)
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)

sgd.score(X_train, Y_train)

acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
acc_sgd

80.69

In [0]:
# save to file
train_df.to_csv(r'titanic_train.csv', index=None, header=True)
X_test.to_csv(r'titanic_test.csv', index=None, header=True)