In [74]:
import torch
from torch import nn
from torch import _nnpack_available
from torch.optim.swa_utils import SWALR
from torch.utils.data import TensorDataset, Dataset, DataLoader
from torch.optim.lr_scheduler import CosineAnnealingLR
import os

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn import preprocessing

import numpy as np
import matplotlib.pyplot as plt

In [75]:
# Lendo os arquivos de base de dados
original_database = pd.read_csv('data/jm1.csv')
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

data = pd.concat([train_data, original_database], axis=0, ignore_index=True)
train_data
data_id = train_data.pop('id')

# Coluna de rótulos
label_name = 'defects'
# Transformação do rótulo para 0 e 1
data[label_name] = data[label_name].map({False: 0, True: 1})

# Função para substituir valores "descartáveis" por 'NaN'
def replace_with_nan(element):
    if type(element) == str and not element.isalnum():
        element = "NaN"
    return element

# Colunas que podem ter valores faltantes
fix_cols = ["uniq_Op", "uniq_Opnd", "total_Op", "total_Opnd", "branchCount"]

def fix_columns(df, cols):
    # Cópia do dataframe
    df_new = df.copy(deep = True)
    # Aplicar a função 'replace_with_nan' para cara elemento de uma coluna
    for col in cols:
        df_new[col] = df_new[col].apply(replace_with_nan).astype("float")
    return df_new

# Dataframe ajustado por 'replace_with_nan'
data_fixed = fix_columns(data, fix_cols)

data_fixed_drop = data_fixed.dropna() # Remover valores nulos
data_fixed_drop = data_fixed_drop.drop(labels='id',axis=1) # Remover coluna 'id'

# Remover alguns atributos
drop_cols = ["v(g)", "ev(g)", "l", "d", "i", "e", "t"] 
def drop_columns(df, cols):
    df_new = df.copy(deep=True)
    df_new = df_new.drop(labels=cols, axis=1)
    return df_new
# Dataframe ajustado e com menos atributos
data_fixed_drop = drop_columns(data_fixed_drop, drop_cols)

targets = 'defects'
train_set, test_set = train_test_split(data_fixed_drop,  
                                                    test_size=0.3,
                                                    random_state=30)
train_y = train_set['defects'].copy()
test_y = test_set['defects'].copy()
train_X = train_set.drop(labels='defects', axis=1)
test_X = test_set.drop(['defects'], axis=1)

# Adicionando features de média para alguns atributos
def add_feat(X):
    df=X.copy()
    df['mean_bnv']         = (df['n'] + df['v'] + df['b']) /3;
    df['mean_uniqOpOpend'] = (df['uniq_Op'] + df['uniq_Opnd']) /2;
    df['mean_totOpOpend']  = (df['total_Op'] + df['total_Opnd']) /2;
    return df
train_X = add_feat(train_X)
test_X = add_feat(test_X)
test_data = add_feat(test_data)

# Faz o ajuste de escala e transforma todos os atributos para float
def scale(df):
    scaler = preprocessing.RobustScaler()
    robust_df = scaler.fit_transform(df)
    robust_df = pd.DataFrame(robust_df, columns =df.columns)
    return robust_df

train_X = scale(train_X)
test_data =  scale(test_data)
test_X = scale(test_X)

# Transformando a base de treino em tensor float
train_X_tensor = torch.FloatTensor(train_X.values)
train_y_tensor = torch.LongTensor(train_y.values)
# Transformando a base de treino em tensor float
test_X_tensor = torch.FloatTensor(test_X.values)
test_y_tensor = torch.LongTensor(test_y.values)
# Datasets
train_dataset = TensorDataset(train_X_tensor, train_y_tensor)
val_dataset = TensorDataset(test_X_tensor, test_y_tensor)
# DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=64)
val_dataloader = DataLoader(val_dataset, batch_size=64)


In [76]:
train_X

Unnamed: 0,loc,iv(g),n,v,b,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,mean_bnv,mean_uniqOpOpend,mean_totOpOpend
0,0.068966,0.000000,0.558140,0.634018,0.6250,0.105263,1.0,0.50,0.0,-0.250,0.846154,0.509804,0.742857,-0.25,0.623332,0.50,0.593023
1,0.068966,0.333333,0.395349,0.392939,0.3750,0.157895,0.0,0.75,0.0,-0.125,0.461538,0.490196,0.400000,0.25,0.393432,0.30,0.441860
2,7.206897,8.000000,6.872093,8.533254,8.3125,6.421053,44.0,1.50,0.0,1.500,3.615385,6.666667,7.257143,8.75,8.308724,3.00,6.895349
3,-0.103448,0.666667,-0.116279,-0.107039,-0.1250,0.105263,0.0,-0.25,0.0,-0.250,-0.153846,-0.137255,-0.114286,0.25,-0.110451,-0.15,-0.139535
4,-0.172414,-0.333333,0.000000,0.001793,0.0000,0.000000,0.0,-0.25,0.0,0.000,-0.384615,0.098039,-0.114286,0.00,0.000000,-0.20,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71229,-0.379310,0.333333,-0.593023,-0.497289,-0.5000,-0.736842,0.0,-0.50,0.0,-1.500,-0.923077,-0.588235,-0.571429,-0.25,-0.515796,-1.15,-0.593023
71230,3.482759,3.000000,-0.593023,-0.497289,-0.5000,-0.736842,0.0,-0.50,0.0,-1.500,-0.923077,-0.588235,-0.571429,3.75,-0.515796,-1.15,-0.593023
71231,1.655172,3.000000,2.011628,2.175254,2.1250,2.052632,0.0,2.25,0.0,1.000,1.538462,2.000000,2.057143,2.00,2.157340,1.45,2.011628
71232,-0.517241,-0.333333,-0.418605,-0.386536,-0.3750,-0.736842,0.0,-0.25,0.0,-0.625,-0.615385,-0.411765,-0.400000,-0.50,-0.394663,-0.60,-0.418605


In [77]:
test_X

Unnamed: 0,loc,iv(g),n,v,b,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,mean_bnv,mean_uniqOpOpend,mean_totOpOpend
0,-0.517241,-0.333333,-0.458824,-0.424258,-0.4375,-0.578947,0.0,-0.25,0.0,-0.750,-0.538462,-0.450980,-0.470588,-0.50,-0.431683,-0.65,-0.470588
1,0.068966,1.333333,1.164706,1.205913,1.1250,0.263158,0.0,0.25,0.0,0.125,1.307692,1.000000,1.411765,0.75,1.196126,0.90,1.152941
2,-0.379310,-0.333333,-0.164706,-0.168551,-0.1875,-0.315789,0.0,0.25,0.0,-0.125,-0.230769,-0.176471,-0.147059,-0.50,-0.170217,-0.20,-0.176471
3,-0.413793,0.000000,-0.352941,-0.326511,-0.3125,-0.368421,0.0,-0.25,0.0,-0.250,-0.538462,-0.313725,-0.382353,-0.25,-0.332743,-0.45,-0.352941
4,-0.034483,0.333333,-0.152941,-0.172211,-0.1875,0.157895,0.0,0.00,0.0,-0.250,-0.230769,-0.137255,-0.176471,0.00,-0.171478,-0.25,-0.164706
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30524,1.482759,1.666667,1.388235,1.497379,1.4375,2.000000,4.0,1.50,0.0,0.375,1.000000,1.333333,1.470588,1.25,1.476813,0.80,1.376471
30525,-0.275862,0.000000,-0.058824,-0.052913,-0.0625,-0.157895,0.0,-0.25,0.0,0.250,0.000000,-0.098039,0.000000,0.25,-0.056185,0.10,-0.070588
30526,-0.517241,-0.333333,-0.352941,-0.339485,-0.3750,-0.473684,0.0,-0.50,0.0,-0.250,-0.615385,-0.333333,-0.441176,-0.50,-0.343705,-0.50,-0.388235
30527,0.275862,0.666667,0.423529,0.423024,0.3750,0.526316,0.0,2.75,0.0,0.250,0.461538,0.431373,0.529412,1.00,0.420354,0.40,0.458824


In [78]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(17, 128),
            nn.ReLU(),
            nn.Linear(128, 2)
        )        

    def forward(self, x):
        logits = self.layers(x)
        return logits

In [81]:
def train_loop(dataloader, model, swa_start, swa_model, scheduler, swa_scheduler, epoch, loss_fn, optimizer):
    # Modo treino
    model.train()
    # Tamanho do dataloader
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        # Compute predicition and loss
        prediction = model(X)
        loss = loss_fn(prediction, y)
        #Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if epoch >= swa_start :
            swa_model.update_parameters(model)
            swa_scheduler.step()
        else:
            scheduler.step()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1)*len(X)
            print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
            

def test_loop(dataloader, model, loss_fn):
    # Modo teste
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            prediction = model(X)
            test_loss += loss_fn(prediction, y).item()
            correct += (prediction.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {100*correct:>5f}%, Average loss: {test_loss:>8f}\n")

In [84]:
learning_rate = 5e-3
epochs = 100
loss_fn = nn.CrossEntropyLoss()
model = NeuralNetwork()
swa_start = 75
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
scheduler = CosineAnnealingLR(optimizer, T_max=100)
swa_model = torch.optim.swa_utils.AveragedModel(model)
swa_scheduler = torch.optim.swa_utils.SWALR(optimizer, \
                            anneal_strategy="linear", anneal_epochs=5, swa_lr=0.05)

for t in range(epochs):
    print(f"Epoch {t+1}\n--------------")
    train_loop(dataloader=train_dataloader, 
                model=model, 
                swa_start=0.75*epochs, 
                swa_model=swa_model, 
                scheduler=scheduler,
                swa_scheduler=swa_scheduler,
                epoch=t+1,
                loss_fn=loss_fn,
                optimizer=optimizer)
    test_loop(val_dataloader, model, loss_fn)
print("Done!")

Epoch 1
--------------
loss: 0.749115 [   64/71234]
loss: 0.623503 [ 6464/71234]
loss: 0.543396 [12864/71234]
loss: 0.500230 [19264/71234]
loss: 0.539079 [25664/71234]
loss: 0.602098 [32064/71234]
loss: 0.573343 [38464/71234]
loss: 0.523526 [44864/71234]
loss: 0.462964 [51264/71234]
loss: 0.489601 [57664/71234]
loss: 0.617571 [64064/71234]
loss: 0.479067 [70464/71234]
Test Error: 
 Accuracy: 79.488355%, Average loss: 0.479623

Epoch 2
--------------
loss: 0.491179 [   64/71234]
loss: 0.453836 [ 6464/71234]
loss: 0.388855 [12864/71234]
loss: 0.386886 [19264/71234]
loss: 0.466208 [25664/71234]
loss: 0.602798 [32064/71234]
loss: 0.557305 [38464/71234]
loss: 0.487310 [44864/71234]
loss: 0.422746 [51264/71234]
loss: 0.438472 [57664/71234]
loss: 0.568598 [64064/71234]
loss: 0.465579 [70464/71234]
Test Error: 
 Accuracy: 80.733073%, Average loss: 0.465248

Epoch 3
--------------
loss: 0.521109 [   64/71234]
loss: 0.429287 [ 6464/71234]
loss: 0.363859 [12864/71234]
loss: 0.354979 [19264/71234]