In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/emodel/best_model.pt
/kaggle/input/mmodel/issues_test.csv


In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split

import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

MAX_LEN = 250
TRAIN_BATCH_SIZE = 64
VALID_BATCH_SIZE = 64
EPOCHS = 4
LEARNING_RATE = 1e-06 * 5
# https://stackoverflow.com/questions/65082243/dropout-argument-input-position-1-must-be-tensor-not-str-when-using-bert
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', return_dict=False)
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.title = dataframe['data']
        self.targets = self.data.target_list
        self.max_len = max_len

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        title = str(self.title[index])
        title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [3]:
import pandas as pd
import numpy as np

def add_collum(src_name, col_name, frame):
  frame[col_name] = np.where(frame[src_name] == col_name, 1, 0)
  return frame

def preprocessing(csv_name, row_data):
    df = pd.read_csv(csv_name)
    
     
    
    for item in df.repo.unique():
        df = add_collum('repo', item, df)
    for item in df.label.unique():
        df = add_collum('label', item, df)
    df['data'] = df.title + ' ' + df.body

    #ndf = df.drop(['title', 'body', 'repo', 'label'], axis=1)
    ndf = df.drop(['title', 'body'], axis=1)
    ndf['target_list'] = ndf[['bug', 'feature', 'question',
                          'facebook/react', 'tensorflow/tensorflow',
                          'microsoft/vscode', 'bitcoin/bitcoin',
                          'opencv/opencv']].values.tolist()
    ndf = ndf[ndf['label'] == row_data[0]]
    ndf = ndf[ndf['repo'] == row_data[1]] 
    #ndf = ndf.reset_index(drop=True)
    
    df2 = ndf.drop(['bug', 'feature', 'question',
                          'facebook/react', 'tensorflow/tensorflow',
                          'microsoft/vscode', 'bitcoin/bitcoin',
                          'opencv/opencv', 'repo', 'label'], axis=1)
    df2 = df2.reset_index(drop=True)
    
    return df2

In [4]:
def get_validation_dataset(file_path, row_data):
    
    valid_dataset = preprocessing(file_path, row_data)
    
    
    validation_set = CustomDataset(valid_dataset, tokenizer, MAX_LEN)
    test_params = {'batch_size': len(valid_dataset),
                    'shuffle': False,
                    'num_workers': 0
                    }

    validation_loader = DataLoader(validation_set, **test_params)
    
    return validation_loader

def get_result(model, file_path, row_name):
    
    validation_loader = get_validation_dataset(file_path, row_name)
    model.eval()
    acc = 0
    prf = None
    with torch.no_grad():
          for batch_idx, data in enumerate(validation_loader, 0):
                ids = data['ids'].to(device, dtype = torch.long)
                mask = data['mask'].to(device, dtype = torch.long)
                token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
                targets = data['targets'].to(device, dtype = torch.float)
                outputs = model(ids, mask, token_type_ids)
                #print(outputs)
                #print(targets)
                out = torch.tensor([[1 if i > 0 else 0 for i in line] for line in outputs])
                exp = targets.detach().cpu()
                #print(accuracy_score(exp, out))
                acc = accuracy_score(out, exp)
                prf = precision_recall_fscore_support(out, exp, average='micro')
    return acc, prf



In [5]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased', return_dict=False)
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 8)
    
    def forward(self, ids, mask, token_type_ids):
        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

def load_ckp(checkpoint_fpath, model, optimizer):
    """
    checkpoint_path: path to save checkpoint
    model: model that we want to load checkpoint parameters into
    optimizer: optimizer we defined in previous training
    """
    # load check point
    checkpoint = torch.load(checkpoint_fpath)
    # initialize state_dict from checkpoint to model
    model.load_state_dict(checkpoint['state_dict'])
    # initialize optimizer from checkpoint to optimizer
    optimizer.load_state_dict(checkpoint['optimizer'])
    # initialize valid_loss_min from checkpoint to valid_loss_min
    valid_loss_min = checkpoint['valid_loss_min']
    # return model, optimizer, epoch value, min validation loss
    return model, optimizer, checkpoint['epoch'], valid_loss_min

model = BERTClass()
model = torch.nn.DataParallel(model, device_ids = [0,1]).to(device)
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)
model, optimizer, start_epoch, valid_loss_min = load_ckp('/kaggle/input/emodel/best_model.pt', model, optimizer)
# model.to(device)
#model = torch.nn.DataParallel(model, device_ids = [0,1]).to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

  checkpoint = torch.load(checkpoint_fpath)


In [9]:
file_path = '/kaggle/input/mmodel/issues_test.csv'
labels = ['bug', 'feature', 'question']
repos = ['facebook/react', 'tensorflow/tensorflow', 'microsoft/vscode',
         'bitcoin/bitcoin', 'opencv/opencv']

for label in labels:
    for repo in repos:
        row_data = [label, repo]
        result = get_result(model, file_path, row_data)
        print(f'{label} & {repo} & {result[0] * 100 :.0f}\% & {result[1][0]:.4f}  & {result[1][1]:.4f}  & {result[1][2]:.4f} \\\\ \\hline')
        pass    
    pass

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


bug & facebook/react & 95\% & 0.9700  & 0.9749  & 0.9724 \\ \hline


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


bug & tensorflow/tensorflow & 90\% & 0.9450  & 0.9545  & 0.9497 \\ \hline


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


bug & microsoft/vscode & 57\% & 0.7800  & 0.8211  & 0.8000 \\ \hline


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


bug & bitcoin/bitcoin & 60\% & 0.7950  & 0.8281  & 0.8112 \\ \hline


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


bug & opencv/opencv & 69\% & 0.8450  & 0.8756  & 0.8601 \\ \hline


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


feature & facebook/react & 78\% & 0.8900  & 0.9082  & 0.8990 \\ \hline


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


feature & tensorflow/tensorflow & 80\% & 0.9000  & 0.9091  & 0.9045 \\ \hline


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


feature & microsoft/vscode & 81\% & 0.8900  & 0.9223  & 0.9059 \\ \hline


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


feature & bitcoin/bitcoin & 81\% & 0.9050  & 0.9235  & 0.9141 \\ \hline


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


feature & opencv/opencv & 80\% & 0.9000  & 0.9091  & 0.9045 \\ \hline


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


question & facebook/react & 58\% & 0.7900  & 0.8144  & 0.8020 \\ \hline


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


question & tensorflow/tensorflow & 85\% & 0.9150  & 0.9482  & 0.9313 \\ \hline


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


question & microsoft/vscode & 71\% & 0.8400  & 0.8660  & 0.8528 \\ \hline


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


question & bitcoin/bitcoin & 48\% & 0.7300  & 0.7644  & 0.7468 \\ \hline


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


question & opencv/opencv & 60\% & 0.8000  & 0.8466  & 0.8226 \\ \hline
