In [1]:
!pwd

import os

/notebooks/_personalProjects/Titanic


In [2]:
import torch, numpy as np, pandas as pd
import zipfile
import seaborn as sns

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.width', 140)

In [3]:
zipName = 'titanic.zip'

zf = zipfile.ZipFile(zipName) 
df_gs = pd.read_csv(zf.open('gender_submission.csv'))
df_train = pd.read_csv(zf.open('train.csv'))
df_test = pd.read_csv(zf.open('test.csv'))

In [4]:
df_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.38,2.31,29.7,0.52,0.38,32.2
std,257.35,0.49,0.84,14.53,1.1,0.81,49.69
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.12,0.0,0.0,7.91
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.45
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.33


In [5]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.28,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.92,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
df_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
modes = df_train.mode().iloc[0]

In [8]:
df_train.fillna(modes, inplace=True)

In [9]:
df_train.isna().sum()
#isna and isnull seem to be the same

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [10]:
df_train.describe(exclude='object')

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.38,2.31,28.57,0.52,0.38,32.2
std,257.35,0.49,0.84,13.2,1.1,0.81,49.69
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.91
50%,446.0,0.0,3.0,24.0,0.0,0.0,14.45
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.33


In [11]:
df_train['logfare'] = np.log(df_train['Fare']+1)

In [12]:
pclasses = sorted(df_train.Pclass.unique())
pclasses

[1, 2, 3]

In [13]:
df_train = pd.get_dummies(df_train, columns=['Sex','Pclass','Embarked'])
df_train.columns

Index(['PassengerId', 'Survived', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'logfare', 'Sex_female', 'Sex_male',
       'Pclass_1', 'Pclass_2', 'Pclass_3', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [14]:
added_cols = [ 'Sex_female', 'Sex_male',
               'Pclass_1', 'Pclass_2', 'Pclass_3', 
               'Embarked_C', 'Embarked_Q', 'Embarked_S']
df_train[added_cols].head()

Unnamed: 0,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S
0,0,1,0,0,1,0,0,1
1,1,0,1,0,0,1,0,0
2,1,0,0,0,1,0,0,1
3,1,0,1,0,0,0,0,1
4,0,1,0,0,1,0,0,1


In [15]:
from torch import tensor

t_dep = tensor(df_train.Survived)

In [16]:
indep_cols = ['Age','SibSp','Parch','logfare'] + added_cols

t_indep = tensor(df_train[indep_cols].values, dtype=torch.float)
t_indep

tensor([[22.,  1.,  0.,  ...,  0.,  0.,  1.],
        [38.,  1.,  0.,  ...,  1.,  0.,  0.],
        [26.,  0.,  0.,  ...,  0.,  0.,  1.],
        ...,
        [24.,  1.,  2.,  ...,  0.,  0.,  1.],
        [26.,  0.,  0.,  ...,  1.,  0.,  0.],
        [32.,  0.,  0.,  ...,  0.,  1.,  0.]])

In [17]:
t_indep.shape

torch.Size([891, 12])

In [18]:
n_coeff = t_indep.shape[1]
coeffs = torch.rand(n_coeff)-0.5
coeffs

tensor([-0.3029,  0.4010,  0.1138,  0.3556,  0.3826,  0.0865, -0.1954, -0.3790,
         0.1617, -0.2695, -0.2680, -0.0842])

In [19]:
t_indep*coeffs

tensor([[ -6.6636,   0.4010,   0.0000,  ...,  -0.0000,  -0.0000,  -0.0842],
        [-11.5098,   0.4010,   0.0000,  ...,  -0.2695,  -0.0000,  -0.0000],
        [ -7.8751,   0.0000,   0.0000,  ...,  -0.0000,  -0.0000,  -0.0842],
        ...,
        [ -7.2693,   0.4010,   0.2276,  ...,  -0.0000,  -0.0000,  -0.0842],
        [ -7.8751,   0.0000,   0.0000,  ...,  -0.2695,  -0.0000,  -0.0000],
        [ -9.6925,   0.0000,   0.0000,  ...,  -0.0000,  -0.2680,  -0.0000]])

In [20]:
vals, indices = t_indep.max(dim=0)
t_indep = t_indep / vals

In [21]:
t_indep*coeffs

tensor([[-0.0833,  0.0501,  0.0000,  ..., -0.0000, -0.0000, -0.0842],
        [-0.1439,  0.0501,  0.0000,  ..., -0.2695, -0.0000, -0.0000],
        [-0.0984,  0.0000,  0.0000,  ..., -0.0000, -0.0000, -0.0842],
        ...,
        [-0.0909,  0.0501,  0.0379,  ..., -0.0000, -0.0000, -0.0842],
        [-0.0984,  0.0000,  0.0000,  ..., -0.2695, -0.0000, -0.0000],
        [-0.1212,  0.0000,  0.0000,  ..., -0.0000, -0.2680, -0.0000]])

In [22]:
vals

tensor([80.0000,  8.0000,  6.0000,  6.2409,  1.0000,  1.0000,  1.0000,  1.0000,
         1.0000,  1.0000,  1.0000,  1.0000])

In [23]:
preds = (t_indep*coeffs).sum(axis=1)
preds[:10]

tensor([ 0.2510,  0.0679,  0.4864,  0.2481,  0.1570,  0.0174, -0.1714,  0.5020,
         0.5380, -0.0730])

In [24]:
loss = torch.abs(preds-t_dep).mean()
loss

tensor(0.4945)

In [25]:
coeffs.requires_grad_()

tensor([-0.3029,  0.4010,  0.1138,  0.3556,  0.3826,  0.0865, -0.1954, -0.3790,
         0.1617, -0.2695, -0.2680, -0.0842], requires_grad=True)

In [26]:
loss = calc_loss(coeffs, t_indep, t_dep)
loss

NameError: name 'calc_loss' is not defined

In [None]:
loss.backward()

In [None]:
coeffs.grad

In [None]:
loss = calc_loss(coeffs, t_indep, t_dep)
loss.backward()
with torch.no_grad():
    coeffs.sub_(coeffs.grad * 0.1)
    coeffs.grad.zero_()
    print(calc_loss(coeffs, t_indep, t_dep))

In [None]:
from fastai.data.transforms import RandomSplitter

trn_split, val_split = RandomSplitter()(df_train)

In [None]:
trn_indep, val_indep = t_indep[trn_split], t_indep[val_split]
trn_dep, val_dep = t_dep[trn_split], t_dep[val_split]
len(trn_indep), len(val_indep)

In [None]:
# Instructions for Linear Model Machine Learning

'Gather: \
- dependent variables, independent variables \
- use pd.get_dummies(df, columns=[]) to create dummy columns \
- use tensor(df_train[indep_cols].values, dtype=torch.float) to get tensor \
- import from fastai.data.transforms import RandomSplitter \
-- use trn_split, val_split = RandomSplitter()(df) to get splits \
-- split test, train by using t_dep/indep[trn/val_split] \
-- index column dims with special value None trn/val_dep = trn/val_dep[:, None]


def calc_coeffs(t_indep):
    'calculates random set of coefficients based on shape of \
    independent variables tensor'
    n_coeff = t_indep.shape[1]
    return (torch.rand(n_coeff,1)*0.1).requires_grad_()

def calc_preds(coeffs, indeps): 
    'calculates predictions based on coefficients * independent variables'
    #return (indeps*coeffs).sum(axis=1)
    #return indeps@coeffs
    return torch.sigmoid(indeps@coeffs)

def calc_loss(coeffs, indeps, deps): 
    'calculates the mean absolute difference loss'
    return torch.abs(calc_preds(coeffs, indeps)-deps).mean()

def update_coeffs(coeffs, lr):
    'Multiples the coefficients by the learning rate and resets \
     the gradients to zero'
    coeffs.sub_(coeffs.grad * lr)
    coeffs.grad.zero_()
    
def one_epoch(coeffs, lr):
    'Calculates the loss and indicates that a gradient is needed'
    loss = calc_loss(coeffs, trn_indep, trn_dep)
    loss.backward()
    #Disable gradient calculation
    with torch.no_grad(): update_coeffs(coeffs, lr)
    print(f"{loss:.3f}", end=";")
    
def train_model(epochs=30, lr=0.01, t_indep):
    coeffs = calc_coeffs(t_indep)
    for i in range(epochs): one_epoch(coeffs, lr=lr)
    return coeffs

def show_coeffs(): return dict(zip(indep_cols, coeffs.requires_grad_(False)))

def acc(coeffs): return (val_dep.bool()==(calc_preds(coeffs, val_indep)>0.5)).float().mean()

In [None]:
coeffs = train_model(18, lr=0.2)

In [None]:
show_coeffs()

In [None]:
preds = calc_preds(coeffs, val_indep)

In [None]:
results = val_dep.bool()==(preds>0.5)
results[:16]

In [None]:
results.float().mean()

In [None]:
acc(coeffs)