In [111]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.model_selection import train_test_split

import torch.multiprocessing
torch.multiprocessing.set_sharing_strategy('file_system')

import matplotlib.pyplot as plt


In [2]:
class MyDataset(torch.utils.data.Dataset):

  def __init__(self,df_x, df_y):
    self.x_train=torch.tensor(df_x,dtype=torch.float32)
    self.y_train=torch.tensor(df_y,dtype=torch.float32)

  def __len__(self):
    return len(self.y_train)
  
  def __getitem__(self,idx):
    return self.x_train[idx],self.y_train[idx] 

In [109]:
train_df = pd.read_csv("data/train_month_3_with_target.csv")

train_df = train_df.dropna(axis = 1)

y=train_df
y = y[["target"]]
X = train_df.drop(["target", "client_id"], axis = 1)
# X = X.drop(["customer_since_all","customer_since_bank","customer_birth_date", "customer_children","customer_relationship"], axis = 1) # For now
for col in ["customer_since_all","customer_since_bank","customer_birth_date", "customer_children","customer_relationship"]:
    try:
        X = X.drop(col, axis = 1)
    except:
        pass

X_train, X_val, y_train, y_val = train_test_split( X, y, test_size=0.33, random_state=42)

training_set=MyDataset(X_train.values, y_train.values)
validation_set=MyDataset(X_val.values, y_val.values)

training_loader = torch.utils.data.DataLoader(training_set, batch_size=4, shuffle=True, num_workers=2)
validation_loader = torch.utils.data.DataLoader(validation_set, batch_size=4, shuffle=False, num_workers=2)

learning_rate = 0.01

In [130]:
class Net(nn.Module):

    def __init__(self, input_size):
        super(Net, self).__init__()
        # # An affine operation: y = Wx + b de tipo todos contra todos
        self.Layer_1 = nn.Linear(input_size, 1)
        # self.Layer_2 = nn.Linear(20, 15)
        
        
        # # Define sigmoid activation and softmax output 
        self.Function = nn.Sigmoid()
        # self.linear = torch.nn.Linear(input_size, 1)
        

    def forward(self, inputs):
        return self.Layer_1(self.Function(inputs))

model = Net(X_train.shape[1])
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

In [46]:
## Testing the loss

for i, data in enumerate(training_loader):
    # Every data instance is an input + label pair
    inputs, labels = data
    outputs = model(inputs)
    # print(outputs.view(5), labels)
    loss = criterion(outputs.squeeze(), labels)

    


In [131]:
def train_one_epoch(epoch_index): #, tb_writer):
    running_loss = 0.
    last_loss = 0.

    # Here, we use enumerate(training_loader) instead of
    # iter(training_loader) so that we can track the batch
    # index and do some intra-epoch reporting
    for i, data in enumerate(training_loader):
        # Every data instance is an input + label pair
        inputs, labels = data

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Make predictions for this batch
        outputs = model(inputs)

        # Compute the loss and its gradients
        loss = criterion(outputs.squeeze(), labels)
        
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()
        if i % 1000 == 999:
            last_loss = running_loss / 1000 # loss per batch
            # print('  batch {} loss: {}'.format(i + 1, last_loss))
            tb_x = epoch_index * len(training_loader) + i + 1
            # tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0.

    return last_loss

In [132]:

# from torch.utils.tensorboard import SummaryWriter

# Initializing in a separate cell so we can easily add more epochs to the same run
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
# writer = SummaryWriter('runs/fashion_trainer_{}'.format(timestamp))
epoch_number = 0

EPOCHS = 5

best_vloss = 1_000_000.

for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))

    # Make sure gradient tracking is on, and do a pass over the data
    model.train(True)
    avg_loss = train_one_epoch(epoch_number) #, writer)

    # We don't need gradients on to do reporting
    model.train(False)

    running_vloss = 0.0
    for i, vdata in enumerate(validation_loader):
        vinputs, vlabels = vdata
        voutputs = model(vinputs)
        vloss = criterion(voutputs.squeeze(), vlabels)
        running_vloss += vloss

    avg_vloss = running_vloss / (i + 1)
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))

    # Log the running loss averaged per batch
    # for both training and validation
    # writer.add_scalars('Training vs. Validation Loss',
    #                 { 'Training' : avg_loss, 'Validation' : avg_vloss },
    #                 epoch_number + 1)
    # writer.flush()

    # Track best performance, and save the model's state
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        print('model_{}_{}'.format(timestamp, epoch_number))

    epoch_number += 1

test_df = pd.read_csv("data/test_month_3.csv")

df_pred = pd.DataFrame()
df_pred["ID"] = test_df["client_id"]

test_df = test_df.dropna(axis = 1)

X_test = test_df.drop(["client_id"], axis = 1)
# X = X.drop(["customer_since_all","customer_since_bank","customer_birth_date", "customer_children","customer_relationship"], axis = 1) # For now
for col in ["customer_since_all","customer_since_bank","customer_birth_date", "customer_children","customer_relationship"]:
    try:
        X_test = X_test.drop(col, axis = 1)
    except:
        pass

new_data = torch.tensor(X_test.values).type(torch.FloatTensor)
old_data = torch.tensor(X.values).type(torch.FloatTensor)
with torch.no_grad():
    prediction = model(new_data)
    train_data = model(old_data)

df_pred["PROB"] = prediction.squeeze().data.detach().numpy()
y["PROB"] = train_data.squeeze().data.detach().numpy()

df_pred.to_csv(f"{timestamp}_attempt.csv", index = None)
y.to_csv(f"{timestamp}_train.csv", index = None)

EPOCH 1:
LOSS train 0.028775623555459706 valid 0.02901441603899002
model_20220329_120348_0
EPOCH 2:
LOSS train 0.03520748628765637 valid 0.0400262288749218
EPOCH 3:
LOSS train 0.034260290757730474 valid 0.029625317081809044
EPOCH 4:
LOSS train 0.035832407835012646 valid 0.035774219781160355
EPOCH 5:
LOSS train 0.0330128401989316 valid 0.02934136614203453


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y["PROB"] = train_data.squeeze().data.detach().numpy()
