# Simple tutorial using only numerical values


In [1]:
import numpy as np
import pandas as pd

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

## Loading data

In [2]:
# Read the train and test CSV files into DataFrames (train first, then test).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/titanic/train.csv",
        "/kaggle/input/titanic/test.csv",
    )
)

# Preview the first five rows of the training data.
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Number of rows in the training data.
train.shape[0]

891

In [4]:
# Report the number of missing values per column, first for train, then for test.
print('[train dataset null check]', train.isnull().sum(), '==========================', sep='\n')

print('[test dataset null check]', test.isnull().sum(), sep='\n')

# Candidate columns to drop: ['Name', 'Age', 'Ticket', 'Cabin']
# (they contain null or alphanumerical values).

[train dataset null check]
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
[test dataset null check]
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [5]:
# Map categorical text columns to integer codes.
def _encode_labels(column, mapping):
    """Translate the values of `column` through `mapping`.

    Parameters
    ----------
    column : pandas.Series
        Column holding the raw labels (or codes, if already encoded).
    mapping : dict
        Raw label -> integer code.

    Returns
    -------
    pandas.Series
        The encoded column. Values that are not keys of `mapping`
        (including missing values) come back as NaN. A column that is
        already numeric is returned unchanged, so re-running this cell
        does not wipe previously encoded values to NaN.
    """
    if pd.api.types.is_numeric_dtype(column):
        # Already encoded by an earlier run of this cell; nothing to do.
        return column
    return column.map(mapping)


sex_mapping = {"male": 0, "female": 1}
train['Sex'] = _encode_labels(train['Sex'], sex_mapping)
test['Sex'] = _encode_labels(test['Sex'], sex_mapping)

embarked_mapping = {"S": 1, "C": 2, "Q": 3}
train['Embarked'] = _encode_labels(train['Embarked'], embarked_mapping)
test['Embarked'] = _encode_labels(test['Embarked'], embarked_mapping)

test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,,3
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0,,1
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,,3
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,,1


In [6]:
# Mean fare of the test set, rounded to 4 decimals; used later to fill the empty Fare cell.
test_fare_mean = round(test.loc[:, "Fare"].mean(), ndigits=4)
test_fare_mean

35.6272

In [7]:
from sklearn.model_selection import train_test_split

# Fill missing values: Embarked gets code 3 (the codes in use are 1, 2, 3),
# and the missing test Fare gets the test-set mean fare.
train["Embarked"] = train["Embarked"].fillna(value=3)
test["Embarked"] = test["Embarked"].fillna(value=3)
test["Fare"] = test["Fare"].fillna(value=test_fare_mean)

# Columns removed from both frames, and columns removed from train only.
drop_columns = ['Name', 'Age', 'Ticket', 'Cabin']
train_drop_columns = ['Survived', 'PassengerId']

# The train features lose the target and the id; the test frame keeps PassengerId.
train_simple = train.drop(columns=drop_columns + train_drop_columns)
test_simple = test.drop(columns=drop_columns)

# Hold out 22% of the training rows for validation.
target = train["Survived"]
x_train, x_val, y_train, y_val = train_test_split(
    train_simple,
    target,
    test_size=0.22,
    random_state=0,
)

In [8]:
# First five rows of the reduced training features.
train_simple.head(n=5)

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,Embarked
0,3,0,1,0,7.25,1.0
1,1,1,1,0,71.2833,2.0
2,3,1,0,0,7.925,1.0
3,1,1,1,0,53.1,1.0
4,3,0,0,0,8.05,1.0


In [9]:
# Missing-value count per column of the reduced training features.
print(train_simple.isna().sum())

Pclass      0
Sex         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64


In [10]:
# First five rows of the reduced test frame (PassengerId is still present).
test_simple.head(n=5)

Unnamed: 0,PassengerId,Pclass,Sex,SibSp,Parch,Fare,Embarked
0,892,3,0,0,0,7.8292,3
1,893,3,1,1,0,7.0,1
2,894,2,0,0,0,9.6875,3
3,895,3,0,0,0,8.6625,1
4,896,3,1,1,1,12.2875,1


In [11]:
# Missing-value count per column of the reduced test frame.
print(test_simple.isna().sum())

PassengerId    0
Pclass         0
Sex            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64


In [12]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fit on the training split, then predict the held-out validation split.
# NOTE(review): warnings are globally ignored at the top of the notebook, so a
# solver convergence warning (if any) would not be shown here; confirm the fit
# converges with the default settings.
logreg = LogisticRegression()
logreg.fit(x_train, y_train)
y_pred = logreg.predict(x_val)

# accuracy_score is documented as (y_true, y_pred), so the true labels go first.
# Accuracy is symmetric, so the reported percentage is unchanged.
acc_logreg = round(accuracy_score(y_val, y_pred) * 100, 2)
print(acc_logreg)

80.71


In [13]:
# Perceptron
from sklearn.linear_model import Perceptron

# Fit on the training split, then predict the held-out validation split.
perceptron = Perceptron()
perceptron.fit(x_train, y_train)
y_pred = perceptron.predict(x_val)

# accuracy_score is documented as (y_true, y_pred), so the true labels go first.
# Accuracy is symmetric, so the reported percentage is unchanged.
acc_perceptron = round(accuracy_score(y_val, y_pred) * 100, 2)
print(acc_perceptron)

69.54


In [14]:
# Re-check the test frame right before building the submission.
test_simple.head(n=5)

Unnamed: 0,PassengerId,Pclass,Sex,SibSp,Parch,Fare,Embarked
0,892,3,0,0,0,7.8292,3
1,893,3,1,1,0,7.0,1
2,894,2,0,0,0,9.6875,3
3,895,3,0,0,0,8.6625,1
4,896,3,1,1,1,12.2875,1


In [15]:
# SUBMISSION
# Predict with the logistic-regression model on every test column except the id.
model = logreg
ids = test_simple.loc[:, 'PassengerId']
predictions = model.predict(test_simple.drop(columns='PassengerId'))

# Pair each passenger id with its predicted label.
output = pd.DataFrame(dict(PassengerId=ids, Survived=predictions))
output

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [16]:
# Write the predictions to disk without the DataFrame index column.
output.to_csv(path_or_buf='submission.csv', index=False)