In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from tqdm import tqdm
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [None]:
# `sys` is not imported anywhere above, so import it here before touching sys.path;
# otherwise this cell raises NameError on a fresh kernel.
import sys

# Add the parent directory to the import path so the project-local `src` package resolves.
sys.path.append("../")
import src.data_utils as d_u
import src.feats_generation as f_g
import src.eval_utils as e_u

In [None]:
# Load the preprocessed table and separate the regression target from the features.
df = d_u.get_preprocess_data()

y = df["bugs"]
X = df.drop(columns=["commit_message", "bugs"])

# Sentence embeddings of the commit messages are appended column-wise to the tabular features.
msg_embs = f_g.pretrained_model_sentence_emb(df["commit_message"].values)
X = np.concatenate([X.to_numpy(), msg_embs], axis=1)

# Fixed random_state keeps the train/test partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [None]:
from sklearn.neural_network import MLPRegressor

# Hyper-parameters of the scikit-learn baseline, spelled out one per line.
best_config = dict(
    activation="relu",
    hidden_layer_sizes=(300, 100),
    learning_rate="constant",
    max_iter=10000,
    random_state=42,
    solver="sgd",
)

# fit() returns the estimator itself, so construction and training can be chained.
model_sk = MLPRegressor(**best_config).fit(X_train, y_train)
preds = model_sk.predict(X_test)

# Report test MSE first, then R^2.
for metric in (mean_squared_error, r2_score):
    print(metric(y_test, preds))

1.0778253074522304
0.5805107049565632


In [None]:
# Re-bind the split arrays as float32 tensors for PyTorch.
# Features are already NumPy arrays; targets are pandas Series and go through to_numpy() first.
X_train, X_test = (
    torch.from_numpy(features.astype(np.float32)) for features in (X_train, X_test)
)
y_train, y_test = (
    torch.from_numpy(target.to_numpy().astype(np.float32)) for target in (y_train, y_test)
)

In [None]:
# Turn the 1-D targets into column vectors so they line up with the model's (N, 1) output.
y_train, y_test = y_train.view(-1, 1), y_test.view(-1, 1)

In [None]:
# Number of input columns. Index the shape directly instead of unpacking into `_`:
# assigning `_` in an IPython session stops it from tracking the last cell output.
n_features = X_train.shape[1]

In [None]:
input_size = n_features
output_size = 1

# Seed PyTorch's RNG so the random weight initialisation (and therefore the reported
# metrics) is repeatable across kernel restarts.
torch.manual_seed(42)

# Two hidden layers (300 and 100 units) with ReLU, followed by a linear regression head.
model = nn.Sequential(
    nn.Linear(input_size, 300),
    nn.ReLU(),
    nn.Linear(300, 100),
    nn.ReLU(),
    nn.Linear(100, output_size),  # use the declared output width instead of a literal
)

criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, weight_decay=0.0001)

num_epochs = 12000

model.train()

# Full-batch gradient descent: every step uses the whole training set.
for epoch in tqdm(range(num_epochs)):
    optimizer.zero_grad()
    y_predicted = model(X_train)
    loss = criterion(y_predicted, y_train)
    loss.backward()
    optimizer.step()

100%|██████████| 12000/12000 [00:41<00:00, 288.02it/s]


In [None]:
model.eval()

# Evaluation needs no gradients: run the forward pass and the loss under no_grad so no
# autograd graph is built and the resulting tensors carry no grad_fn.
with torch.no_grad():
    y_pred = model(X_test)
    print(criterion(y_pred, y_test))

tensor(1.0467, grad_fn=<MseLossBackward0>)


In [None]:
# R^2 on the test split; tensors are detached and flattened to NumPy arrays for scikit-learn.
r2_score(
    y_test.detach().squeeze().numpy(),
    y_pred.detach().squeeze().numpy(),
)

0.5609809924712574

# Попробуем теперь подход с эмбеддингами для категориальных признаков

In [None]:
# Read the raw commits table and discard the hash column in a single chained expression.
df = pd.read_csv("../data/raw/АВСОФТ_тест_ML_приложение.csv").drop(columns=["commit_hash"])

In [None]:
# Preview the first five rows of the raw table.
df.head(5)

Unnamed: 0,repository_name,commit_date,commit_author,commit_message,bugs
0,mlm,2020-06-12T14:26:12,Victor,объяснения предсказаний с помощью lime,4
1,mlm,2020-06-12T11:53:10,Victor,Поставил lime вместо eli5,3
2,mlm,2020-06-11T15:19:03,Victor,Скрипт для создния пакетов обновления,4
3,mlm,2020-06-11T10:42:29,Victor,Сортировка весов по убыванию,3
4,mlm,2020-06-11T09:25:19,Victor,не надо конвертировать массив до операции по с...,7


In [None]:
# Replace repository names with integer codes.
repo_encoder = LabelEncoder()
df["repository_name"] = repo_encoder.fit_transform(df["repository_name"])

In [None]:
# Replace author names with integer codes.
author_encoder = LabelEncoder()
df["commit_author"] = author_encoder.fit_transform(df["commit_author"])

In [None]:
# How many commits each encoded author has.
df["commit_author"].value_counts()

8    134
5     67
9     48
3     42
7      9
4      9
2      4
6      2
1      2
0      2
Name: commit_author, dtype: int64

In [None]:
def to_4(x):
    """Collapse an encoded author id into one of four buckets (0-3).

    Code 8 maps to 3; codes 5, 9 and 3 map to 2; codes 7 and 4 map to 1;
    every other value maps to 0.
    """
    buckets = (
        (3, (8,)),
        (2, (5, 9, 3)),
        (1, (7, 4)),
    )
    # First bucket whose code tuple contains x wins; unmatched values fall through to 0.
    return next((bucket for bucket, codes in buckets if x in codes), 0)

# Overwrite the author codes with their bucket.
# NOTE: not idempotent — re-running this cell re-maps the already bucketed values.
df["commit_author"] = df["commit_author"].apply(to_4)

In [None]:
# Bucket sizes after collapsing the authors.
df["commit_author"].value_counts()

2    157
3    134
1     18
0     10
Name: commit_author, dtype: int64

In [None]:
# https://forums.fast.ai/t/size-of-embedding-for-categorical-variables/42608
def compute_embed_dim(n_cat: int) -> int:
    """Return the embedding width for a categorical variable with ``n_cat`` levels.

    Uses the rule ``round(1.6 * n_cat ** 0.56)`` and caps the result at 600.
    """
    max_dim = 600
    suggested = round(1.6 * n_cat**0.56)
    return suggested if suggested < max_dim else max_dim

In [None]:
# Display the frame after label encoding and author bucketing (the notebook elides the middle rows).
df

Unnamed: 0,repository_name,commit_date,commit_author,commit_message,bugs
0,3,2020-06-12T14:26:12,3,объяснения предсказаний с помощью lime,4
1,3,2020-06-12T11:53:10,3,Поставил lime вместо eli5,3
2,3,2020-06-11T15:19:03,3,Скрипт для создния пакетов обновления,4
3,3,2020-06-11T10:42:29,3,Сортировка весов по убыванию,3
4,3,2020-06-11T09:25:19,3,не надо конвертировать массив до операции по с...,7
...,...,...,...,...,...
314,4,2020-03-23T12:54:35,2,setting hostname,1
315,4,2020-03-23T12:30:52,2,heralding default dockerfile,2
316,4,2020-03-23T11:27:40,2,unnesserity /data path permissions setting,3
317,4,2020-03-23T11:20:50,2,added trapconductor,1


In [None]:
# For each categorical column store (number of distinct codes, embedding width).
embeds_info = {}
for col in ["repository_name", "commit_author"]:
    n_categories = len(df[col].unique())
    embeds_info[col] = (n_categories, compute_embed_dim(n_categories))

In [None]:
# Inspect the (cardinality, embedding width) pair computed for each categorical column.
embeds_info

{'repository_name': (6, 4), 'commit_author': (4, 3)}

In [None]:
# Parse the timestamp strings so weekday and hour can be read from them.
df["commit_date"] = pd.to_datetime(df["commit_date"])

# "day": 1 when the commit falls on Monday-Friday (weekday 0-4), otherwise 0.
df["day"] = df["commit_date"].apply(lambda ts: int(ts.weekday() < 5))

# "hour": 1 when the commit hour lies in 8..18 inclusive, otherwise 0.
df["hour"] = df["commit_date"].apply(lambda ts: int(8 <= ts.hour <= 18))

In [None]:
# Check the two new binary columns on the first five rows.
df.head(5)

Unnamed: 0,repository_name,commit_date,commit_author,commit_message,bugs,day,hour
0,3,2020-06-12 14:26:12,3,объяснения предсказаний с помощью lime,4,1,1
1,3,2020-06-12 11:53:10,3,Поставил lime вместо eli5,3,1,1
2,3,2020-06-11 15:19:03,3,Скрипт для создния пакетов обновления,4,1,1
3,3,2020-06-11 10:42:29,3,Сортировка весов по убыванию,3,1,1
4,3,2020-06-11 09:25:19,3,не надо конвертировать массив до операции по с...,7,1,1


In [None]:
# Columns that go through embedding tables versus everything that is fed in directly.
categorical_cols = ["repository_name", "commit_author"]

y = df["bugs"]
X_idx_tab_emb = df[categorical_cols]
X_cont = df.drop(
    columns=["commit_message", "bugs", "repository_name", "commit_author", "commit_date"]
)

In [None]:
# Remaining non-categorical columns, first five rows.
X_cont.head(5)

Unnamed: 0,day,hour
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1


In [None]:
# The split is done only to learn which row labels land in each partition; the same labels
# are then used to slice every feature block consistently.
# Keep just the two feature partitions by slicing instead of unpacking into `_`:
# assigning `_` in an IPython session stops it from tracking the last cell output.
temp_train, temp_test = train_test_split(X_cont, y, random_state=42)[:2]

train_idxs, test_idxs = temp_train.index, temp_test.index

In [None]:
# Embed the commit messages and append them column-wise to the continuous features.
# NOTE: X_cont becomes a NumPy array here, so this cell cannot be re-run on its own.
msg_embs = f_g.pretrained_model_sentence_emb(df["commit_message"].values)

X_cont = np.concatenate([X_cont.to_numpy(), msg_embs], axis=1)

In [None]:
# Slice the continuous feature matrix with the row labels from the split.
X_cont_train = X_cont[train_idxs]
X_cont_test = X_cont[test_idxs]

In [None]:
# Select the targets for each partition by row label.
y_train = y.loc[train_idxs]
y_test = y.loc[test_idxs]

In [None]:
# Same label-based selection for the categorical code columns.
X_idx_tab_emb_train, X_idx_tab_emb_test = (
    X_idx_tab_emb.loc[row_labels] for row_labels in (train_idxs, test_idxs)
)

In [None]:
# Sanity-check the training matrix dimensions as (rows, columns).
X_cont_train.shape

(239, 514)

In [None]:
# Categorical codes are converted without a float cast: the model uses them as embedding indices.
X_idx_tab_emb_train, X_idx_tab_emb_test = (
    torch.from_numpy(codes.to_numpy()) for codes in (X_idx_tab_emb_train, X_idx_tab_emb_test)
)

# Continuous features become float32 tensors.
X_cont_train, X_cont_test = (
    torch.from_numpy(features.astype(np.float32)) for features in (X_cont_train, X_cont_test)
)

# Targets become float32 column vectors of shape (N, 1).
y_train, y_test = (
    torch.from_numpy(target.to_numpy().astype(np.float32)).view(target.shape[0], 1)
    for target in (y_train, y_test)
)

In [None]:
class MyModel(nn.Module):
    """MLP regressor that embeds two categorical columns and joins them with continuous features.

    Args:
        embedding_sizes: mapping with the keys "repository_name" and "commit_author";
            each value is a ``(num_categories, embedding_dim)`` pair.
        n_cont: number of continuous input columns passed to ``forward`` as ``x_cont``.
            Defaults to 514, the width that used to be hard-coded.
    """

    def __init__(self, embedding_sizes, n_cont=514):
        super().__init__()
        # Column i of ``x_cat`` is looked up in embedding table i, so this order is fixed.
        cat_columns = ("repository_name", "commit_author")
        self.embs = nn.ModuleList([
            nn.Embedding(embedding_sizes[col][0], embedding_sizes[col][1])
            for col in cat_columns
        ])

        # Derive the first layer's input width from the actual embedding dims rather than
        # a literal, so a change in cardinality or embedding size cannot break the shapes.
        emb_width = sum(emb.embedding_dim for emb in self.embs)
        self.lin1 = nn.Linear(emb_width + n_cont, 300)
        self.lin2 = nn.Linear(300, 100)
        self.lin3 = nn.Linear(100, 1)

    def forward(self, x_cat, x_cont):
        """Return one prediction per row.

        Args:
            x_cat: integer tensor; column i holds the codes for embedding table i.
            x_cont: float tensor with ``n_cont`` columns.
        """
        x = [emb(x_cat[:, i]) for i, emb in enumerate(self.embs)]
        x = torch.cat(x, 1)
        x = torch.cat([x, x_cont], 1)
        x = F.relu(self.lin1(x))
        x = F.relu(self.lin2(x))
        return self.lin3(x)


In [None]:
# Seed PyTorch's RNG before building the model so the random initial weights
# (and the metrics reported below) are repeatable across kernel restarts.
torch.manual_seed(42)
model = MyModel(embeds_info)

In [None]:
num_epochs = 15000

criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, weight_decay=0.0001)

model.train()

# Full-batch training: each step runs forward and backward over the whole training set.
for epoch in tqdm(range(num_epochs)):
    y_predicted = model(X_idx_tab_emb_train, X_cont_train)
    loss = criterion(y_predicted, y_train)

    # Clear stale gradients right before computing fresh ones, then update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

100%|██████████| 15000/15000 [01:05<00:00, 229.55it/s]


In [None]:
model.eval()

# Evaluation needs no gradients: run the forward pass and the loss under no_grad so no
# autograd graph is built and the resulting tensors carry no grad_fn.
with torch.no_grad():
    y_pred = model(X_idx_tab_emb_test, X_cont_test)
    print(criterion(y_pred, y_test))

tensor(1.0684, grad_fn=<MseLossBackward0>)
