# ML Model

In [1]:
import torch, numpy as np, pandas as pd
from torch import tensor
from fastai.data.transforms import RandomSplitter
import zipfile

zipName = 'titanic.zip'

zf = zipfile.ZipFile(zipName) 
df_gs = pd.read_csv(zf.open('gender_submission.csv'))
df_train = pd.read_csv(zf.open('train.csv'))
df_test = pd.read_csv(zf.open('test.csv'))

In [2]:
modes = df_train.mode().iloc[0]
df_train.fillna(modes, inplace=True)

In [3]:
df_train['logfare'] = np.log(df_train['Fare']+1)

In [4]:
df_train = pd.get_dummies(df_train, columns=['Sex','Pclass','Embarked'])

added_cols = [ 'Sex_female', 'Sex_male',
               'Pclass_1', 'Pclass_2', 'Pclass_3', 
               'Embarked_C', 'Embarked_Q', 'Embarked_S']

In [5]:
indep_cols = ['Age','SibSp','Parch','logfare'] + added_cols

t_dep = tensor(df_train.Survived)
t_indep = tensor(df_train[indep_cols].values, dtype=torch.float)
vals, indices = t_indep.max(dim=0)
t_indep = t_indep / vals

In [6]:
trn_split, val_split = RandomSplitter(seed=42)(df_train)
trn_indep, val_indep = t_indep[trn_split], t_indep[val_split]
trn_dep, val_dep = t_dep[trn_split], t_dep[val_split]
len(trn_indep),len(val_indep)

(713, 178)

In [7]:
# Instructions for Linear Model Machine Learning

'Gather: \
- dependent variables, independent variables \
- use pd.get_dummies(df, columns=[]) to create dummy columns \
- use tensor(df_train[indep_cols].values, dtype=torch.float) to get tensor \
- import from fastai.data.transforms import RandomSplitter \
-- use trn_split, val_split = RandomSplitter()(df) to get splits \
-- split test, train by using t_dep/indep[trn/val_split] \
-- index column dims with special value None trn/val_dep = trn/val_dep[:, None]'

def init_coeffs():
    'calculates random set of coefficients based on shape of \
    independent variables tensor'
    return (torch.rand(n_coeff)-0.5).requires_grad_()

def calc_preds(coeffs, indeps): 
    'calculates predictions based on coefficients * independent variables'
    #return (indeps*coeffs).sum(axis=1)
    #return indeps@coeffs
    return torch.sigmoid(indeps@coeffs)

def calc_loss(coeffs, indeps, deps): 
    'calculates the mean absolute difference loss'
    return torch.abs(calc_preds(coeffs, indeps)-deps).mean()

def update_coeffs(coeffs, lr):
    'Multiples the coefficients by the learning rate and resets \
     the gradients to zero'
    coeffs.sub_(coeffs.grad * lr)
    coeffs.grad.zero_()
    
def one_epoch(coeffs, lr):
    'Calculates the loss and indicates that a gradient is needed'
    loss = calc_loss(coeffs, trn_indep, trn_dep)
    loss.backward()
    #Disable gradient calculation
    with torch.no_grad(): update_coeffs(coeffs, lr)
    print(f"{loss:.3f}", end=";")
    
def train_model(epochs=30, lr=0.01):
    coeffs = init_coeffs()
    for i in range(epochs): one_epoch(coeffs, lr=lr)
    return coeffs

def show_coeffs(): return dict(zip(indep_cols, coeffs.requires_grad_(False)))

def acc(coeffs): return (val_dep.bool()==(calc_preds(coeffs, val_indep)>0.5)).float().mean()

In [8]:
torch.manual_seed(442)

n_coeff = t_indep.shape[1]
coeffs = init_coeffs()

model = train_model(lr=100);

0.468;0.324;0.301;0.209;0.201;0.199;0.198;0.197;0.196;0.196;0.196;0.195;0.195;0.195;0.195;0.195;0.195;0.195;0.194;0.194;0.194;0.194;0.194;0.194;0.194;0.194;0.194;0.194;0.194;0.194;

In [10]:
acc(model)

tensor(0.8258)

In [11]:
show_coeffs()

{'Age': tensor(-0.4629),
 'SibSp': tensor(0.1386),
 'Parch': tensor(0.2409),
 'logfare': tensor(-0.2262),
 'Sex_female': tensor(-0.2632),
 'Sex_male': tensor(-0.3147),
 'Pclass_1': tensor(0.4876),
 'Pclass_2': tensor(0.3136),
 'Pclass_3': tensor(0.2799),
 'Embarked_C': tensor(-0.4392),
 'Embarked_Q': tensor(0.2103),
 'Embarked_S': tensor(0.3625)}

In [19]:
acc(model)

tensor(0.8258)

In [13]:
df_test['Fare'] = df_test.Fare.fillna(0)
df_test.fillna(modes, inplace=True)
df_test['logfare'] = np.log(df_test['Fare']+1)
df_test = pd.get_dummies(df_test, columns=["Sex","Pclass","Embarked"])

tst_indep = tensor(df_test[indep_cols].values, dtype=torch.float)
tst_indep = tst_indep / vals

In [20]:
df_test['Survived'] = (calc_preds(model,tst_indep)>0.5).int()

In [15]:
sub_df = df_test[['PassengerId','Survived']]
sub_df.to_csv('sub.csv', index=False)

In [16]:
!head sub.csv

PassengerId,Survived
892,0
893,1
894,0
895,1
896,1
897,1
898,0
899,1
900,0
