In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.model_selection import train_test_split

import torch.multiprocessing
torch.multiprocessing.set_sharing_strategy('file_system')

import matplotlib.pyplot as plt


In [29]:
class MyDataset(torch.utils.data.Dataset):

  def __init__(self,df_x, df_y):
    self.x_train=torch.tensor(df_x,dtype=torch.float32)
    self.y_train=torch.tensor(df_y,dtype=torch.float32)

  def __len__(self):
    return len(self.y_train)
  
  def __getitem__(self,idx):
    return self.x_train[idx],self.y_train[idx] 

In [14]:
train_df = pd.read_csv("data/weatherAUS.csv")

train_df.drop(['RISK_MM', 'Date'], axis=1, inplace=True)

# train_df = train_df.dropna(axis = 1)

In [16]:


X = train_df.drop(['RainTomorrow'], axis=1)
y = train_df['RainTomorrow']



In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [21]:
for df1 in [X_train, X_test]:
    for col in [var for var in train_df.columns if train_df[var].dtype!='O']:
        col_median=X_train[col].median()
        df1[col].fillna(col_median, inplace=True)      

In [22]:
for df2 in [X_train, X_test]:
    df2['WindGustDir'].fillna(X_train['WindGustDir'].mode()[0], inplace=True)
    df2['WindDir9am'].fillna(X_train['WindDir9am'].mode()[0], inplace=True)
    df2['WindDir3pm'].fillna(X_train['WindDir3pm'].mode()[0], inplace=True)
    df2['RainToday'].fillna(X_train['RainToday'].mode()[0], inplace=True)

In [23]:
def max_value(df3, variable, top):
    return np.where(df3[variable]>top, top, df3[variable])

for df3 in [X_train, X_test]:
    df3['Rainfall'] = max_value(df3, 'Rainfall', 3.2)
    df3['Evaporation'] = max_value(df3, 'Evaporation', 21.8)
    df3['WindSpeed9am'] = max_value(df3, 'WindSpeed9am', 55)
    df3['WindSpeed3pm'] = max_value(df3, 'WindSpeed3pm', 57)



In [24]:
import category_encoders as ce

encoder = ce.BinaryEncoder(cols=['RainToday'])

X_train = encoder.fit_transform(X_train)

X_test = encoder.transform(X_test)

In [25]:


X_train = pd.concat([X_train[[var for var in train_df.columns if train_df[var].dtype!='O']], X_train[['RainToday_0', 'RainToday_1']],
                     pd.get_dummies(X_train.Location), 
                     pd.get_dummies(X_train.WindGustDir),
                     pd.get_dummies(X_train.WindDir9am),
                     pd.get_dummies(X_train.WindDir3pm)], axis=1)



In [27]:


X_test = pd.concat([X_test[[var for var in train_df.columns if train_df[var].dtype!='O']], X_test[['RainToday_0', 'RainToday_1']],
                     pd.get_dummies(X_test.Location), 
                     pd.get_dummies(X_test.WindGustDir),
                     pd.get_dummies(X_test.WindDir9am),
                     pd.get_dummies(X_test.WindDir3pm)], axis=1)



In [45]:
# train_df = pd.read_csv("data/weatherAUS.csv")

# train_df = train_df.drop(['RISK_MM'], axis=1, inplace=True)
# train_df = train_df.dropna(axis = 1)


# y=train_df
# y = y[["target"]]
# X = train_df.drop(["target", "client_id"], axis = 1)
# # X = X.drop(["customer_since_all","customer_since_bank","customer_birth_date", "customer_children","customer_relationship"], axis = 1) # For now
# for col in ["customer_since_all","customer_since_bank","customer_birth_date", "customer_children","customer_relationship"]:
#     try:
#         X = X.drop(col, axis = 1)
#     except:
#         pass

# X_train, X_val, y_train, y_val = train_test_split( X, y, test_size=0.33, random_state=42)

training_set=MyDataset(X_train.values, y_train)
validation_set=MyDataset(X_test.values, y_test)

training_loader = torch.utils.data.DataLoader(training_set, batch_size=4, shuffle=True, num_workers=2)
validation_loader = torch.utils.data.DataLoader(validation_set, batch_size=4, shuffle=False, num_workers=2)

learning_rate = 0.01

In [44]:
def funx(f):
    if f == "No":
        return 0
    return 1

y_test = [funx(x) for x in y_test]

In [63]:
class Net(nn.Module):

    def __init__(self, input_size):
        super(Net, self).__init__()
        # # An affine operation: y = Wx + b de tipo todos contra todos
        self.Layer_1 = nn.Linear(input_size, 1)
        # self.Layer_2 = nn.Linear(20, 15)
        
        
        # # Define sigmoid activation and softmax output 
        self.Function = nn.Sigmoid()
        # self.linear = torch.nn.Linear(input_size, 1)
        

    def forward(self, inputs):
        return self.Layer_1(self.Function(inputs))

# class Net(nn.Module):

#     def __init__(self, input_size):
#         super(Net, self).__init__()
#         self.linear = torch.nn.Linear(input_size, 1)
        
#     def forward(self, x):
#         outputs = torch.sigmoid(self.linear(x))
#         return outputs

# model = Net(X_train.shape[1])
# criterion = nn.BCELoss()
# optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

model = Net(X_train.shape[1])
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

In [46]:
## Testing the loss

for i, data in enumerate(training_loader):
    # Every data instance is an input + label pair
    inputs, labels = data
    outputs = model(inputs)
    # print(outputs.view(5), labels)
    loss = criterion(outputs.squeeze(), labels)

    


In [64]:
def train_one_epoch(epoch_index): #, tb_writer):
    running_loss = 0.
    last_loss = 0.

    # Here, we use enumerate(training_loader) instead of
    # iter(training_loader) so that we can track the batch
    # index and do some intra-epoch reporting
    for i, data in enumerate(training_loader):
        # Every data instance is an input + label pair
        inputs, labels = data

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Make predictions for this batch
        outputs = model(inputs)

        # Compute the loss and its gradients
        loss = criterion(outputs.squeeze(), labels)
        
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()
        if i % 1000 == 999:
            last_loss = running_loss / 1000 # loss per batch
            # print('  batch {} loss: {}'.format(i + 1, last_loss))
            tb_x = epoch_index * len(training_loader) + i + 1
            # tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0.

    return last_loss

In [65]:

# from torch.utils.tensorboard import SummaryWriter

# Initializing in a separate cell so we can easily add more epochs to the same run
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
# writer = SummaryWriter('runs/fashion_trainer_{}'.format(timestamp))
epoch_number = 0

EPOCHS = 5

best_vloss = 1_000_000.

for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))

    # Make sure gradient tracking is on, and do a pass over the data
    model.train(True)
    avg_loss = train_one_epoch(epoch_number) #, writer)

    # We don't need gradients on to do reporting
    model.train(False)

    running_vloss = 0.0
    for i, vdata in enumerate(validation_loader):
        vinputs, vlabels = vdata
        voutputs = model(vinputs)
        vloss = criterion(voutputs.squeeze(), vlabels)
        running_vloss += vloss

    avg_vloss = running_vloss / (i + 1)
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))

    # Log the running loss averaged per batch
    # for both training and validation
    # writer.add_scalars('Training vs. Validation Loss',
    #                 { 'Training' : avg_loss, 'Validation' : avg_vloss },
    #                 epoch_number + 1)
    # writer.flush()

    # Track best performance, and save the model's state
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        print('model_{}_{}'.format(timestamp, epoch_number))

    epoch_number += 1

new_data = torch.tensor(X_train.values).type(torch.FloatTensor)

with torch.no_grad():
    prediction = model(new_data)

df_pred = pd.DataFrame(y_train, columns=["target"])
df_pred["PROB"] = prediction.squeeze().data.detach().numpy()
df_pred.to_csv(f"{timestamp}_weather.csv", index = None)

EPOCH 1:
LOSS train 0.16972414526168722 valid 0.15502497553825378
model_20220329_115931_0
EPOCH 2:
LOSS train 0.16649042403465136 valid 0.14755696058273315
model_20220329_115931_1
EPOCH 3:
LOSS train 0.17200410632684363 valid 0.14455649256706238
model_20220329_115931_2
EPOCH 4:
LOSS train 0.1598510255033616 valid 0.1593029648065567
EPOCH 5:
LOSS train 0.17051410242007115 valid 0.1456998735666275


In [66]:
y_train

[0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
