In [1]:
from src import dataset as ds
import seaborn as sns
import numpy as np
import pandas as pd
import os
data_dir = os.getcwd() + "/data/"

In [3]:
import torch
import torchvision
import torchvision.transforms as transforms

In [6]:
train = pd.read_csv(data_dir + 'data_float.csv').drop(columns=["Unnamed: 0"])

In [7]:
y = train['is_churn']
x = train.drop(columns=['is_churn']).set_index('msno')

In [8]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
  x, y, test_size=0.2)

In [77]:
from torch.utils.data import Dataset, DataLoader
class Data(Dataset):
    def __init__(self, x_train, y_train):
        # need to convert float64 to float32 else 
        # will get the following error
        # RuntimeError: expected scalar type Double but found Float
        #self.x = torch.from_numpy(x_train.astype(np.float32))
        
        self.x = torch.tensor(x_train.values)
        
        self.y = torch.tensor(y_train.values)
        
        
        # need to convert float64 to Long else 
        # will get the following error
        # RuntimeError: expected scalar type Long but found Float
        #self.y = torch.from_numpy(y_train).type(torch.LongTensor)
        self.len = self.x.shape[0]

    def __getitem__(self, index):
        return self.x[index], self.y[index]
    def __len__(self):
        return self.len

In [78]:
torchdata = Data(x_train, y_train)

In [172]:
torchdata[0]

(tensor([ 3.4444e+00, -3.7763e-04,  1.0741e+00,  1.9904e-02,  7.0370e-01,
         -6.1208e-03,  5.5556e-01,  2.7567e-02,  1.5963e+01, -4.0753e-02,
          1.9926e+01, -1.5656e-02,  4.5361e+03, -6.6927e+00,  9.0000e-01,
          1.5000e+01,  2.2000e+01,  2.0000e+00,  3.0000e+00,  2.0141e+07,
          4.0000e+01,  3.0000e+01,  1.4900e+02,  1.4900e+02,  2.0000e+00,
          1.0000e+00,  2.0170e+07,  2.0170e+07,  1.0000e+00,  0.0000e+00],
        dtype=torch.float64),
 tensor(0))

In [109]:
batch_size = 64
dataloader = DataLoader(torchdata, batch_size=batch_size, 
                         shuffle=True, num_workers=0)

In [173]:
import torch.nn as nn
# number of features (len of X cols)
input_dim = 30
# number of hidden layers
hidden_layers = 15
# number of classes (unique of y)
output_dim = 2


class Network(nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.linear1 = nn.Linear(30, 30)
        self.linear2 = nn.Linear(30, 15)
        self.linear3 = nn.Linear(15, 15)
        self.linear4 = nn.Linear(15, 2)
    def forward(self, x):
        #print(x.shape)
        x = torch.relu(self.linear1(x))
        x = torch.relu(self.linear2(x))
        x = torch.relu(self.linear3(x))
        x = torch.sigmoid(self.linear4(x))
        return x

In [174]:
net = Network()

In [175]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), lr=0.0002)

In [176]:
epochs = 20
for epoch in range(epochs):
    running_loss = 0.0
    for i, data in enumerate(dataloader, 0):
        inputs, labels = data
        inputs = inputs.to(torch.float32)

        
        # set optimizer to zero grad to remove previous epoch gradients
        optimizer.zero_grad()
        # forward propagation
        outputs = net(inputs)
        
        #print(outputs)
        #print(labels)
        loss = criterion(outputs, labels)
        # backward propagation
        loss.backward()
        # optimize
        optimizer.step()
        running_loss += loss.item()
    # display statistics
    print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.5f}')

[1, 12137] loss: 2.44645
[2, 12137] loss: 2.44645
[3, 12137] loss: 2.44645


KeyboardInterrupt: 

In [140]:
testdata = Data(x_test, y_test)
testloader = DataLoader(testdata, batch_size=batch_size, 
                        shuffle=False, num_workers=0)

In [141]:
correct, total = 0, 0
# no need to calculate gradients during inference
with torch.no_grad():
    for data in testloader:
        inputs, labels = data
        inputs = inputs.to(torch.float32)
        # calculate output by running through the network
        outputs = net(inputs)
        # get the predictions
        __, predicted = torch.max(outputs.data, 1)
        # update results
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print(f'Accuracy of the network on the {len(testdata)} test data: {100 * correct // total} %')

Accuracy of the network on the 194192 test data: 90 %
