In [33]:
import torch
from torch import nn
from torch import _nnpack_available
from torch.optim.swa_utils import SWALR
from torch.utils.data import TensorDataset, Dataset, DataLoader

import os

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn import preprocessing

import numpy as np
import matplotlib.pyplot as plt

In [34]:
# Load the raw CSV files
original_database = pd.read_csv('data/jm1.csv')
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

# Stack the training file on top of the original database, with a fresh 0..n-1 index
data = pd.concat([train_data, original_database], axis=0, ignore_index=True)
# pop() removes 'id' from train_data in place; `data` was built above, so it
# still carries the 'id' column at this point.
data_id = train_data.pop('id')

# Name of the label column
label_name = 'defects'
# Encode the boolean label as 0 / 1 (any value other than False/True becomes NaN)
data[label_name] = data[label_name].map({False: 0, True: 1})

# Replace "throw-away" string values with the 'NaN' marker
def replace_with_nan(element):
    """Return ``"NaN"`` for strings that are not numbers, else the element itself.

    A string counts as a number when ``float()`` can parse it, so values such
    as ``"1.2"`` or ``"-3"`` are kept while placeholders such as ``"?"`` (and
    any other non-numeric text) are replaced. Non-string elements are returned
    unchanged.

    Parameters
    ----------
    element : object
        A single cell value.

    Returns
    -------
    object
        ``"NaN"`` if ``element`` is a non-numeric string, otherwise ``element``.
    """
    if isinstance(element, str):
        try:
            float(element)
        except ValueError:
            # Not parseable as a number: mark it so a later float cast yields NaN
            return "NaN"
    return element

# Columns that may contain missing / placeholder values
fix_cols = [
    "uniq_Op",
    "uniq_Opnd",
    "total_Op",
    "total_Opnd",
    "branchCount",
]


def fix_columns(df, cols):
    """Return a deep copy of ``df`` with ``cols`` cleaned and cast to float.

    Each listed column is passed element by element through
    ``replace_with_nan`` and the result is converted to the float dtype.
    The input dataframe is not modified.
    """
    cleaned = df.copy(deep=True)
    for column in cols:
        sanitized = cleaned[column].apply(replace_with_nan)
        cleaned[column] = sanitized.astype("float")
    return cleaned

# Dataframe with the placeholder values replaced by NaN
data_fixed = fix_columns(data, fix_cols)

# Drop the 'id' column BEFORE removing rows with nulls. Rows concatenated from
# original_database have NaN in 'id' when that file has no such column
# (presumably the case -- verify against data/jm1.csv), and calling dropna()
# while 'id' is still present would then discard every one of those rows.
data_fixed_drop = data_fixed.drop(labels='id', axis=1)
data_fixed_drop = data_fixed_drop.dropna()  # Remove rows with missing values

# Attributes that are removed from the dataset
drop_cols = ["v(g)", "ev(g)", "l", "d", "i", "e", "t"]


def drop_columns(df, cols):
    """Return a new dataframe equal to ``df`` without the columns in ``cols``.

    ``df`` itself is left untouched; a missing column name raises ``KeyError``.
    """
    return df.copy(deep=True).drop(columns=cols)
# Cleaned dataframe with the unwanted attributes removed
data_fixed_drop = drop_columns(data_fixed_drop, drop_cols)

# Name of the target column
targets = 'defects'

# 70 % / 30 % split with a fixed random state
train_set, test_set = train_test_split(
    data_fixed_drop,
    test_size=0.3,
    random_state=30,
)

# Labels of each split
train_y = train_set[targets].copy()
test_y = test_set[targets].copy()

# Features of each split (everything except the target column)
train_X = train_set.drop(columns=targets)
test_X = test_set.drop(columns=[targets])

# Add mean-based features derived from some of the attributes
def add_feat(X):
    """Return a copy of ``X`` with three extra mean columns appended.

    - ``mean_bnv``: mean of ``n``, ``v`` and ``b``
    - ``mean_uniqOpOpend``: mean of ``uniq_Op`` and ``uniq_Opnd``
    - ``mean_totOpOpend``: mean of ``total_Op`` and ``total_Opnd``

    The input dataframe is not modified.
    """
    return X.copy().assign(
        mean_bnv=(X['n'] + X['v'] + X['b']) / 3,
        mean_uniqOpOpend=(X['uniq_Op'] + X['uniq_Opnd']) / 2,
        mean_totOpOpend=(X['total_Op'] + X['total_Opnd']) / 2,
    )
# Add the engineered mean features to every split that is fed to the network.
# The validation split (test_X) must receive them as well; otherwise it ends up
# with three fewer columns than the training split.
train_X = add_feat(train_X)
test_X = add_feat(test_X)
test_data = add_feat(test_data)

# Rescale the attributes; the result is a dataframe of floats
def scale(df, scaler=None):
    """Robust-scale ``df`` and return the result as a new dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale.
    scaler : fitted scaler, optional
        An already fitted scaler whose ``transform`` is applied to ``df``.
        When omitted, a new ``RobustScaler`` is fitted on ``df`` itself
        (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        Scaled values with the same column names as ``df`` and a fresh
        default index (the original index is not carried over).
    """
    if scaler is None:
        scaler = preprocessing.RobustScaler().fit(df)
    scaled_values = scaler.transform(df)
    return pd.DataFrame(scaled_values, columns=df.columns)

# Fit the scaling statistics on the training split only and reuse them for the
# validation split, so both are expressed on the same scale and no validation
# statistics leak into preprocessing.
feature_scaler = preprocessing.RobustScaler().fit(train_X)
train_X = scale(train_X, feature_scaler)
test_X = scale(test_X, feature_scaler)
# NOTE(review): nothing in this section removes 'id' or the drop_cols columns
# from test_data, so it does not share train_X's column set and cannot reuse
# feature_scaler as-is. It is still scaled on its own statistics here -- align
# its columns with train_X and switch to feature_scaler before using it for
# inference.
test_data = scale(test_data)

# Mini-batch size shared by both dataloaders
BATCH_SIZE = 128

# Training split: float features and integer labels -> dataset -> dataloader
train_X_tensor = torch.FloatTensor(train_X.to_numpy())
train_y_tensor = torch.LongTensor(train_y.to_numpy())
train_dataset = TensorDataset(train_X_tensor, train_y_tensor)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)

# Validation split: float features and integer labels -> dataset -> dataloader
test_X_tensor = torch.FloatTensor(test_X.to_numpy())
test_y_tensor = torch.LongTensor(test_y.to_numpy())
val_dataset = TensorDataset(test_X_tensor, test_y_tensor)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)


In [35]:
class NeuralNetwork(nn.Module):
    """Fully connected classifier with a single hidden ReLU layer.

    Parameters
    ----------
    in_features : int, default 17
        Number of input attributes per sample.
    hidden_units : int, default 128
        Width of the hidden layer.
    num_classes : int, default 2
        Number of output logits.

    The defaults reproduce the previously hard-coded 17 -> 128 -> 2
    architecture, so ``NeuralNetwork()`` builds the same network as before.
    """

    def __init__(self, in_features=17, hidden_units=128, num_classes=2):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(in_features, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, num_classes),
        )

    def forward(self, x):
        """Return the raw (unnormalised) class logits for the batch ``x``."""
        logits = self.layers(x)
        return logits

In [36]:
def train_loop(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over ``dataloader``.

    Puts ``model`` in training mode and, for every batch, computes the loss,
    back-propagates, applies an optimizer step and clears the gradients.
    The loss is printed on every 100th batch, starting with the first one.
    """
    model.train()
    total_samples = len(dataloader.dataset)

    for step, (features, labels) in enumerate(dataloader):
        # Forward pass and loss
        batch_loss = loss_fn(model(features), labels)

        # Backward pass and parameter update
        batch_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Progress report only on batches 0, 100, 200, ...
        if step % 100 != 0:
            continue
        seen = (step + 1) * len(features)
        print(f"loss: {batch_loss.item():>7f} [{seen:>5d}/{total_samples:>5d}]")

def test_loop(dataloader, model, loss_fn):
    # Modo teste
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            prediction = model(X)
            test_loss += loss_fn(prediction, y).item()
            correct += (prediction.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {100*correct:>5f}%, Average loss: {test_loss:>8f}\n")

In [37]:
# Optimisation hyper-parameters
learning_rate = 1e-3
epochs = 50
loss_fn = nn.CrossEntropyLoss()

# Seed torch's RNG before the network is built so that the weight
# initialisation is the same on every fresh run of the notebook.
torch.manual_seed(30)

model = NeuralNetwork()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# Stochastic Weight Averaging: swa_model holds the running average of `model`
swa_model = torch.optim.swa_utils.AveragedModel(model)
# First (1-based) epoch from which SWA is applied
swa_start = 38
# Learning-rate schedule for the SWA phase, annealing towards swa_lr
swa_scheduler = SWALR(optimizer, swa_lr=0.05)

In [None]:
for t in range(epochs):
    print(f"Epoch {t+1}\n--------------")
    swa_active = (t + 1) >= swa_start
    if swa_active:
        print("SWA Starting.....")

    # Always optimise the base model: `optimizer` was built from
    # model.parameters(), whereas swa_model wraps a separate copy that the
    # optimizer cannot update.
    train_loop(train_dataloader, model, loss_fn, optimizer)

    if swa_active:
        # Fold the current weights into the running average and advance the
        # SWA learning-rate schedule, then validate the averaged model.
        swa_model.update_parameters(model)
        swa_scheduler.step()
        test_loop(val_dataloader, swa_model, loss_fn)
    else:
        test_loop(val_dataloader, model, loss_fn)

# Recompute batch-norm statistics for the averaged weights. This does nothing
# for a network without batch-norm layers, but keeps the SWA recipe complete
# should such layers be added.
torch.optim.swa_utils.update_bn(train_dataloader, swa_model)
print("Done!")