In [118]:
import os

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn import preprocessing

import numpy as np
import matplotlib.pyplot as plt

import torch
from torch.utils.data import TensorDataset, Dataset, DataLoader
from torchvision import transforms, utils

In [68]:
# Lendo os arquivos de base de dados
original_database = pd.read_csv('data/jm1.csv')
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

In [69]:
print('Duplicated Lines:')
train_data.duplicated().sum(), test_data.duplicated().sum()

Duplicated Lines:


(0, 0)

In [70]:
data = pd.concat([train_data, original_database], axis=0, ignore_index=True)

In [71]:
train_data
data_id = train_data.pop('id')

In [72]:
original_database

Unnamed: 0,loc,v(g),ev(g),iv(g),n,v,l,d,i,e,...,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,defects
0,1.1,1.4,1.4,1.4,1.3,1.30,1.30,1.30,1.30,1.30,...,2,2,2,2,1.2,1.2,1.2,1.2,1.4,False
1,1.0,1.0,1.0,1.0,1.0,1.00,1.00,1.00,1.00,1.00,...,1,1,1,1,1,1,1,1,1,True
2,72.0,7.0,1.0,6.0,198.0,1134.13,0.05,20.31,55.85,23029.10,...,51,10,8,1,17,36,112,86,13,True
3,190.0,3.0,1.0,3.0,600.0,4348.76,0.06,17.06,254.87,74202.67,...,129,29,28,2,17,135,329,271,5,True
4,37.0,4.0,1.0,4.0,126.0,599.12,0.06,17.19,34.86,10297.30,...,28,1,6,0,11,16,76,50,7,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10880,18.0,4.0,1.0,4.0,52.0,241.48,0.14,7.33,32.93,1770.86,...,13,0,2,0,10,15,30,22,7,False
10881,9.0,2.0,1.0,2.0,30.0,129.66,0.12,8.25,15.72,1069.68,...,5,0,2,0,12,8,19,11,3,False
10882,42.0,4.0,1.0,2.0,103.0,519.57,0.04,26.40,19.68,13716.72,...,29,1,10,0,18,15,59,44,7,False
10883,10.0,1.0,1.0,1.0,36.0,147.15,0.12,8.44,17.44,1241.57,...,6,0,2,0,9,8,21,15,1,False


In [73]:
data

Unnamed: 0,id,loc,v(g),ev(g),iv(g),n,v,l,d,i,...,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,defects
0,0.0,22.0,3.0,1.0,2.0,60.0,278.63,0.06,19.56,14.25,...,17,1,1,0,16.0,9.0,38.0,22.0,5.0,False
1,1.0,14.0,2.0,1.0,2.0,32.0,151.27,0.14,7.00,21.11,...,11,0,1,0,11.0,11.0,18.0,14.0,3.0,False
2,2.0,11.0,2.0,1.0,2.0,45.0,197.65,0.11,8.05,22.76,...,8,0,1,0,12.0,11.0,28.0,17.0,3.0,False
3,3.0,8.0,1.0,1.0,1.0,23.0,94.01,0.19,5.25,17.86,...,4,0,2,0,8.0,6.0,16.0,7.0,1.0,True
4,4.0,11.0,2.0,1.0,2.0,17.0,60.94,0.18,5.63,12.44,...,7,0,2,0,7.0,6.0,10.0,10.0,3.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112643,,18.0,4.0,1.0,4.0,52.0,241.48,0.14,7.33,32.93,...,13,0,2,0,10,15,30,22,7,False
112644,,9.0,2.0,1.0,2.0,30.0,129.66,0.12,8.25,15.72,...,5,0,2,0,12,8,19,11,3,False
112645,,42.0,4.0,1.0,2.0,103.0,519.57,0.04,26.40,19.68,...,29,1,10,0,18,15,59,44,7,False
112646,,10.0,1.0,1.0,1.0,36.0,147.15,0.12,8.44,17.44,...,6,0,2,0,9,8,21,15,1,False


In [74]:
# Coluna de rótulos
label_name = 'defects'
# Transformação do rótulo para 0 e 1
data[label_name] = data[label_name].map({False: 0, True: 1})

In [75]:
# Função para substituir valores "descartáveis" por 'NaN'
def replace_with_nan(element):
    if type(element) == str and not element.isalnum():
        element = "NaN"
    return element

In [76]:
# Colunas que podem ter valores faltantes
fix_cols = ["uniq_Op", "uniq_Opnd", "total_Op", "total_Opnd", "branchCount"]

def fix_columns(df, cols):
    # Cópia do dataframe
    df_new = df.copy(deep = True)
    # Aplicar a função 'replace_with_nan' para cara elemento de uma coluna
    for col in cols:
        df_new[col] = df_new[col].apply(replace_with_nan).astype("float")
    return df_new

# Dataframe ajustado por 'replace_with_nan'
data_fixed = fix_columns(data, fix_cols)
data_fixed

Unnamed: 0,id,loc,v(g),ev(g),iv(g),n,v,l,d,i,...,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,defects
0,0.0,22.0,3.0,1.0,2.0,60.0,278.63,0.06,19.56,14.25,...,17,1,1,0,16.0,9.0,38.0,22.0,5.0,0
1,1.0,14.0,2.0,1.0,2.0,32.0,151.27,0.14,7.00,21.11,...,11,0,1,0,11.0,11.0,18.0,14.0,3.0,0
2,2.0,11.0,2.0,1.0,2.0,45.0,197.65,0.11,8.05,22.76,...,8,0,1,0,12.0,11.0,28.0,17.0,3.0,0
3,3.0,8.0,1.0,1.0,1.0,23.0,94.01,0.19,5.25,17.86,...,4,0,2,0,8.0,6.0,16.0,7.0,1.0,1
4,4.0,11.0,2.0,1.0,2.0,17.0,60.94,0.18,5.63,12.44,...,7,0,2,0,7.0,6.0,10.0,10.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112643,,18.0,4.0,1.0,4.0,52.0,241.48,0.14,7.33,32.93,...,13,0,2,0,10.0,15.0,30.0,22.0,7.0,0
112644,,9.0,2.0,1.0,2.0,30.0,129.66,0.12,8.25,15.72,...,5,0,2,0,12.0,8.0,19.0,11.0,3.0,0
112645,,42.0,4.0,1.0,2.0,103.0,519.57,0.04,26.40,19.68,...,29,1,10,0,18.0,15.0,59.0,44.0,7.0,0
112646,,10.0,1.0,1.0,1.0,36.0,147.15,0.12,8.44,17.44,...,6,0,2,0,9.0,8.0,21.0,15.0,1.0,0


In [77]:
data_fixed_drop = data_fixed.dropna() # Remover valores nulos
data_fixed_drop = data_fixed_drop.drop(labels='id',axis=1) # Remover coluna 'id'
print(data_fixed_drop.isna().sum()) # Verificar os nulos

loc                  0
v(g)                 0
ev(g)                0
iv(g)                0
n                    0
v                    0
l                    0
d                    0
i                    0
e                    0
b                    0
t                    0
lOCode               0
lOComment            0
lOBlank              0
locCodeAndComment    0
uniq_Op              0
uniq_Opnd            0
total_Op             0
total_Opnd           0
branchCount          0
defects              0
dtype: int64


In [86]:
# Remover alguns atributos
drop_cols = ["v(g)", "ev(g)", "l", "d", "i", "e", "t"] 
def drop_columns(df, cols):
    df_new = df.copy(deep=True)
    df_new = df_new.drop(labels=cols, axis=1)
    return df_new
# Dataframe ajustado e com menos atributos
data_fixed_drop = drop_columns(data_fixed_drop, drop_cols)
data_fixed_drop.info()

<class 'pandas.core.frame.DataFrame'>
Index: 101763 entries, 0 to 101762
Data columns (total 15 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   loc                101763 non-null  float64
 1   iv(g)              101763 non-null  float64
 2   n                  101763 non-null  float64
 3   v                  101763 non-null  float64
 4   b                  101763 non-null  float64
 5   lOCode             101763 non-null  int64  
 6   lOComment          101763 non-null  int64  
 7   lOBlank            101763 non-null  int64  
 8   locCodeAndComment  101763 non-null  int64  
 9   uniq_Op            101763 non-null  object 
 10  uniq_Opnd          101763 non-null  object 
 11  total_Op           101763 non-null  object 
 12  total_Opnd         101763 non-null  object 
 13  branchCount        101763 non-null  object 
 14  defects            101763 non-null  int64  
dtypes: float64(5), int64(5), object(5)
memory usage: 12.4+ M

[pandas.Dataframe.apply](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.apply.html): Aplica uma função ao longo de um eixo do dataframe.

`df.apply(function, axis=0)`

`axis`: {0 or 'index', 1 or 'column'}, default 0

- 0: Aplica a função para cara coluna
- 1: Aplica a função para cara 

`df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B'])`

``df.apply(np.sqrt)``

Retirada de valores nulos e da coluna 'id'

In [112]:
data_fixed_drop.info()

<class 'pandas.core.frame.DataFrame'>
Index: 101763 entries, 0 to 101762
Data columns (total 15 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   loc                101763 non-null  float64
 1   iv(g)              101763 non-null  float64
 2   n                  101763 non-null  float64
 3   v                  101763 non-null  float64
 4   b                  101763 non-null  float64
 5   lOCode             101763 non-null  int64  
 6   lOComment          101763 non-null  int64  
 7   lOBlank            101763 non-null  int64  
 8   locCodeAndComment  101763 non-null  int64  
 9   uniq_Op            101763 non-null  object 
 10  uniq_Opnd          101763 non-null  object 
 11  total_Op           101763 non-null  object 
 12  total_Opnd         101763 non-null  object 
 13  branchCount        101763 non-null  object 
 14  defects            101763 non-null  int64  
dtypes: float64(5), int64(5), object(5)
memory usage: 12.4+ M

In [99]:
targets = 'defects'
train_set, test_set = train_test_split(data_fixed_drop,  
                                                    test_size=0.3,
                                                    random_state=30)
train_y = train_set['defects'].copy()
test_y = test_set['defects'].copy()
train_X = train_set.drop(labels='defects', axis=1)
test_X = test_set.drop(['defects'], axis=1)


Pré-processamento de escala dos atributos

In [84]:
def add_feat(X):
    df=X.copy()
    df['mean_bnv']         = (df['n'] + df['v'] + df['b']) /3;
    df['mean_uniqOpOpend'] = (df['uniq_Op'] + df['uniq_Opnd']) /2;
    df['mean_totOpOpend']  = (df['total_Op'] + df['total_Opnd']) /2;
    return df
train_X = add_feat(train_X)
test_data = add_feat(test_data)

In [113]:
# Faz o ajuste de escala e transforma todos os atributos para float
def scale(df):
    scaler = preprocessing.RobustScaler()
    robust_df = scaler.fit_transform(df)
    robust_df = pd.DataFrame(robust_df, columns =df.columns)
    return robust_df
train_X = scale(train_X)
test_data =  scale(test_data)
test_X = scale(test_X)
test_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30529 entries, 0 to 30528
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   loc                30529 non-null  float64
 1   iv(g)              30529 non-null  float64
 2   n                  30529 non-null  float64
 3   v                  30529 non-null  float64
 4   b                  30529 non-null  float64
 5   lOCode             30529 non-null  float64
 6   lOComment          30529 non-null  float64
 7   lOBlank            30529 non-null  float64
 8   locCodeAndComment  30529 non-null  float64
 9   uniq_Op            30529 non-null  float64
 10  uniq_Opnd          30529 non-null  float64
 11  total_Op           30529 non-null  float64
 12  total_Opnd         30529 non-null  float64
 13  branchCount        30529 non-null  float64
dtypes: float64(14)
memory usage: 3.3 MB


In [111]:
test_X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30529 entries, 58168 to 6806
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   loc                30529 non-null  float64
 1   iv(g)              30529 non-null  float64
 2   n                  30529 non-null  float64
 3   v                  30529 non-null  float64
 4   b                  30529 non-null  float64
 5   lOCode             30529 non-null  int64  
 6   lOComment          30529 non-null  int64  
 7   lOBlank            30529 non-null  int64  
 8   locCodeAndComment  30529 non-null  int64  
 9   uniq_Op            30529 non-null  object 
 10  uniq_Opnd          30529 non-null  object 
 11  total_Op           30529 non-null  object 
 12  total_Opnd         30529 non-null  object 
 13  branchCount        30529 non-null  object 
dtypes: float64(5), int64(4), object(5)
memory usage: 3.5+ MB


Tentando transformar em tensor

In [109]:
# Depois
train_X_tensor = torch.FloatTensor(train_X.values)
train_y_tensor = torch.FloatTensor(train_y.values)

In [114]:
test_X_tensor = torch.FloatTensor(test_X.values)
test_y_tensor = torch.FloatTensor(test_y.values)

In [119]:
train_dataset = TensorDataset(train_X_tensor, train_y_tensor)
val_dataset = TensorDataset(test_X_tensor, test_y_tensor)
train_dataloader = DataLoader(train_dataset, batch_size=128)
val_dataloader = DataLoader(val_dataset, batch_size=128)

# Referências


- [Kaggle: Software Defect Prediction](https://www.kaggle.com/datasets/semustafacevik/software-defect-prediction/code)

- [PS S3 E23 Feature Selection and Ensemble](https://www.kaggle.com/code/liudacheldieva/ps-s3-e23-feature-selection-and-ensemble)

- [Solving sklearn datasets with PyTorch](https://www.kaggle.com/code/glebbuzin/solving-sklearn-datasets-with-pytorch)

- [Binary Classification with a Software Defects Data](https://www.kaggle.com/code/imessam/binary-classification-with-a-software-defects-data)

- [Playground Series - Season 3, Episode 23](https://www.kaggle.com/code/jpedrou/playground-series-season-3-episode-23/notebook)