In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F          # adds some efficiency
from torch.utils.data import DataLoader  # lets us load data in batches
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:


class TabularModel(nn.Module):
    """Feed-forward network for mixed categorical / continuous tabular data.

    Every categorical column is passed through its own ``nn.Embedding``; the
    continuous columns are batch-normalised; both parts are concatenated and
    sent through a stack of ``Linear -> ReLU -> BatchNorm1d`` blocks followed
    by one final ``Linear`` layer that produces ``out_sz`` raw outputs.

    Args:
        emb_szs: iterable of ``(num_categories, embedding_dim)`` pairs, one per
            categorical column, in the same order as the columns of ``x_cat``.
        n_cont: number of continuous input columns.
        out_sz: number of output units.
        layers: iterable with the width of each hidden layer. May be empty, in
            which case the inputs are mapped straight to the output layer.
        p: dropout probability stored in ``self.emb_drop``. Note that
            ``forward`` does not currently apply this dropout, and no dropout
            is inserted between the hidden layers.
    """

    def __init__(self, emb_szs, n_cont, out_sz, layers, p=0.5):
        # Call the parent __init__
        super().__init__()

        # Embedding, dropout and normalisation modules.
        self.embeds = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in emb_szs])
        self.emb_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)

        # Width of the vector entering the first hidden layer:
        # total embedding width plus the continuous columns.
        n_emb = sum(nf for _, nf in emb_szs)
        n_in = n_emb + n_cont

        # Build the hidden stack: Linear -> activation -> norm per entry.
        layerlist = []
        for width in layers:
            layerlist.append(nn.Linear(n_in, width))
            layerlist.append(nn.ReLU(inplace=True))
            layerlist.append(nn.BatchNorm1d(width))
            n_in = width

        # Output layer. Using the running width n_in (instead of layers[-1])
        # keeps this valid when `layers` is empty.
        layerlist.append(nn.Linear(n_in, out_sz))

        self.layers = nn.Sequential(*layerlist)

    def forward(self, x_cat, x_cont):
        """Run the network.

        Args:
            x_cat: integer tensor indexed as ``x_cat[:, i]`` to obtain the
                codes of the i-th categorical column.
            x_cont: float tensor with ``n_cont`` columns.

        Returns:
            Tensor with ``out_sz`` columns of raw (un-normalised) outputs.
        """
        # Look up the embedding of each categorical column and join them.
        embeddings = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeds)]
        x = torch.cat(embeddings, 1)
        # The embedding dropout (self.emb_drop) is intentionally not applied here.

        # Batch-normalise the continuous columns and append them.
        x_cont = self.bn_cont(x_cont)
        x = torch.cat([x, x_cont], 1)

        # Hidden layers and output layer.
        return self.layers(x)

In [3]:
# Load the Titanic test features (test.csv) and the sample submission file.
# NOTE(review): the following cell reads both files again, so this cell is
# redundant; the absolute Windows paths also tie the notebook to one machine.
real_data=pd.read_csv('C:/Users/Public/Documents/titanic/test.csv')
gender_data=pd.read_csv('C:/Users/Public/Documents/titanic/gender_submission.csv')

In [4]:
# Load the training set, the sample submission and the test set.
train_data = pd.read_csv('C:/Users/Public/Documents/titanic/train.csv')
gender_data = pd.read_csv('C:/Users/Public/Documents/titanic/gender_submission.csv')
real_data = pd.read_csv('C:/Users/Public/Documents/titanic/test.csv')

# Drop the columns that are not used as features. PassengerId is kept on the
# test frame because it is needed later to build the submission file.
train_data = train_data.drop(columns=['Cabin', 'PassengerId', 'Ticket', 'Name', 'Embarked'])
real_data = real_data.drop(columns=['Cabin', 'Ticket', 'Name', 'Embarked'])

# Pool of observed ages taken from train and test together.
TR = pd.concat([train_data, real_data])
nulT = train_data['Age'].isnull().sum()
nulR = real_data['Age'].isnull().sum()
ag = TR['Age'].dropna().values

# Impute the missing test ages by sampling from the observed ages.
# A dedicated, seeded generator makes the imputation (and therefore the
# predictions) reproducible without touching the global `random` state.
age_rng = random.Random(42)
fillR = [age_rng.choice(ag) for _ in range(nulR)]
real_data.loc[real_data['Age'].isnull(), 'Age'] = fillR


In [5]:
# Mean fare of the test set (pandas skips NaN entries by default).
real_data['Fare'].mean()

35.6271884892086

In [6]:
# Replace missing fares with the mean fare. DataFrame.fillna returns a new
# object, so the result has to be assigned back; only the Fare column is
# filled so the fare mean cannot leak into unrelated columns.
real_data['Fare'] = real_data['Fare'].fillna(real_data['Fare'].mean())

# Fold the Parch value 9 into 6.
# NOTE(review): presumably 9 does not occur in the training data and this
# keeps the category count in line with the saved embeddings — confirm.
real_data['Parch'] = real_data['Parch'].replace(9, 6)



In [7]:
# Column roles: categorical features, continuous features and the target.
cat_cols = ['Sex', 'Pclass', 'Parch', 'SibSp']
cont_cols = ['Fare', 'Age']
y_col = ['Survived']

# Switch every categorical feature of the test frame to the category dtype.
for name in cat_cols:
    real_data[name] = real_data[name].astype('category')

# Integer codes of the category levels, one array per categorical column.
sx2, pc2, pa2, sb2 = (real_data[name].cat.codes.values for name in cat_cols)

# Categorical input: one row per passenger, one int64 code per column.
cats_real = torch.tensor(np.stack([sx2, pc2, pa2, sb2], axis=1), dtype=torch.int64)

# Continuous input: one row per passenger, columns in cont_cols order.
conts_real = torch.tensor(
    np.stack([real_data[name] for name in cont_cols], axis=1),
    dtype=torch.float,
)


In [8]:
# Number of levels of each categorical column, in cat_cols order.
cat_szs = list(map(lambda name: len(real_data[name].cat.categories), cat_cols))

# Pair each level count with its embedding width: half the count rounded up,
# capped at 50.
emb_szs = list(zip(cat_szs, (min(50, (n_levels + 1) // 2) for n_levels in cat_szs)))
emb_szs

[(2, 1), (3, 2), (7, 4), (7, 4)]

In [9]:
# Rebuild the network so it matches the saved checkpoint.
# TabularModel's signature is (emb_szs, n_cont, out_sz, layers, p): the number
# of continuous inputs comes from the data, and the output size is 2 (the
# prediction step takes the arg-max over these two output units).
model2 = TabularModel(emb_szs, conts_real.shape[1], 2, [6,3], p=0.4)

# map_location='cpu' lets a checkpoint saved on a GPU load on a CPU-only machine.
model2.load_state_dict(torch.load('MyFirstTita.pt', map_location='cpu'));

# Evaluation mode: BatchNorm uses its running statistics, dropout is disabled.
model2.eval()

TabularModel(
  (embeds): ModuleList(
    (0): Embedding(2, 1)
    (1): Embedding(3, 2)
    (2): Embedding(7, 4)
    (3): Embedding(7, 4)
  )
  (emb_drop): Dropout(p=0.4, inplace=False)
  (bn_cont): BatchNorm1d(2, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=13, out_features=6, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(6, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Linear(in_features=6, out_features=3, bias=True)
    (4): ReLU(inplace=True)
    (5): BatchNorm1d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): Linear(in_features=3, out_features=2, bias=True)
  )
)

In [112]:


# Forward pass with gradient tracking disabled; y_val holds the raw network
# outputs, one row per row of the test inputs.
with torch.no_grad():
    y_val = model2(cats_real, conts_real)
    

In [113]:
# Predicted class = index of the largest output in each row.
# (argmax replaces torch.max(...)[1] on the deprecated `.data` attribute.)
predicted = y_val.argmax(dim=1)

# Convert to NumPy before building the frame: handing a 1-D torch tensor to
# pd.DataFrame can produce cells that hold 0-dim tensors instead of integers.
predict_data = pd.DataFrame(predicted.detach().cpu().numpy())

In [117]:
# Put the passenger ids next to the predictions. pd.concat(axis=1) aligns rows
# on the index, so this relies on real_data and predict_data sharing the same
# index — presumably both 0..n-1; confirm.
Sub = pd.concat([real_data['PassengerId'], predict_data], axis=1)

In [121]:
# Name the two columns for the submission file header.
Sub.columns = ['PassengerId', 'Survived']

In [122]:
# Write the submission CSV without the index column.
# NOTE(review): absolute, machine-specific output path.
Sub.to_csv(r'C:\Users\Public\Documents\SubTit.csv', index = False)

In [124]:
# Display the submission frame as a final visual check.
Sub

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
