<a href="https://colab.research.google.com/github/kargaranamir/issue-tagger/blob/main/TFIDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Data Download

In [None]:
# Download and unpack the dataset. Written so the cell can be re-run safely.
!wget https://machinehack-be.s3.amazonaws.com/predict_github_issues_embold_sponsored_hackathon/Embold_Participant%27s_Dataset.zip -O data.zip
# -o overwrites existing files without prompting; an interactive prompt would hang the cell on a re-run.
!unzip -o ./data.zip 
# Drop the ./data folder left by an earlier run; otherwise mv would nest the new folder inside it.
! rm -rf ./data
! mv ./Embold_Participant\'s_Dataset ./data
# These two files are not read by the loading cell, so remove them.
! rm -rf ./data/sample\ submission.csv
! rm -rf ./data/embold_test.json

## Import Libraries

In [75]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC



import re

from sklearn.model_selection import PredefinedSplit, RandomizedSearchCV
from sklearn.pipeline import Pipeline

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import string

from tqdm.notebook import tqdm
tqdm.pandas()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load Data

In [5]:
# Merge the two training files into a single frame.
data_small_df = pd.read_json('./data/embold_train.json').reset_index(drop=True)
data_large_df = pd.read_json('./data/embold_train_extra.json').reset_index(drop=True)
# pd.concat replaces DataFrame.append (removed in pandas 2.0). ignore_index=True
# gives the merged frame one unique 0..n-1 index instead of two overlapping ranges.
data_df = pd.concat([data_small_df, data_large_df], ignore_index=True)
# The model input is the issue title followed by its body.
data_df['text'] = data_df['title'] + ' ' + data_df['body']
# Number of whitespace-separated tokens in the raw text.
data_df['text_length'] = data_df['text'].str.split().str.len()

In [6]:
# Preview the first rows of the merged data
data_df.head()

Unnamed: 0,title,body,label,text,text_length
0,y-zoom piano roll,a y-zoom on the piano roll would be useful.,1,y-zoom piano roll a y-zoom on the piano roll w...,12
1,buggy behavior in selection,! screenshot from 2016-02-23 21 27 40 https:/...,0,buggy behavior in selection ! screenshot from ...,13
2,auto update feature,"hi,\r \r great job so far, @saenzramiro ! : \r...",1,"auto update feature hi,\r \r great job so far,...",35
3,filter out noisy endpoints in logs,i think we should stop logging requests to:\r ...,1,filter out noisy endpoints in logs i think we ...,23
4,enable pid on / pid off alarm actions for ardu...,expected behavior\r alarm actions pid on and p...,0,enable pid on / pid off alarm actions for ardu...,291


In [16]:
# English stop words from the NLTK stopwords corpus; clean_text filters tokens against this list.
stopwords_list = stopwords.words('english')

def clean_text(text, lowercase=True, stop_words=True, links=True, numbers=True):
    """Normalize one raw issue text before vectorization.

    Steps, in order: remove literal backslash-r sequences, optionally
    lowercase, remove square-bracketed spans, optionally remove URLs, remove
    angle-bracket tags, strip ASCII punctuation, turn line breaks into a
    space, optionally remove tokens containing a digit, optionally remove
    stop words.

    Args:
        text: The string to clean.
        lowercase: Lowercase the text when True.
        stop_words: When True, drop tokens found in the module-level
            ``stopwords_list`` and rejoin the rest with single spaces.
        links: Remove ``http(s)://...`` and ``www. ...`` URLs when True.
        numbers: Remove every token that contains at least one digit when True.

    Returns:
        The cleaned string.
    """
    # Removes the two-character sequence backslash + "r", not a real carriage return.
    text = text.replace("\\r", "")
    if lowercase:
        text = text.lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d) out of Python's own
    # string-escape handling, which warns about them in non-raw literals.
    text = re.sub(r'\[.*?\]', '', text)
    if links:
        # Must run before punctuation stripping, which would break the URL apart.
        text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace line breaks with a space. Deleting them outright would glue the
    # last word of one line to the first word of the next.
    text = re.sub(r'[\r\n]+', ' ', text)
    if numbers:
        text = re.sub(r'\w*\d\w*', '', text)
    if stop_words:
        # Set membership is O(1) per token; the list would be scanned for every word.
        blocked = set(stopwords_list)
        text = " ".join(word for word in text.split() if word not in blocked)
    return text

In [17]:
# Clean every document, showing a tqdm progress bar while it runs.
data_df['text_clean'] = data_df['text'].progress_apply(clean_text)

  0%|          | 0/450000 [00:00<?, ?it/s]

### Split Data

In [19]:
# Features are the cleaned documents; targets are the integer-encoded labels.
X = data_df['text_clean'].to_numpy()

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data_df['label'])

In [22]:
# 60/20/20 train/validation/test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# The validation split has to be active: X_val is vectorized in the TFIDF section,
# so leaving this line commented out raises NameError on a fresh kernel.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)  # 0.25 x 0.8 = 0.2


## TFIDF

In [64]:
# Fit a unigram TF-IDF vectorizer, capped at 12000 features, on the training split.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    min_df=1,
    max_features=12000,
)
train_data_features = vectorizer.fit_transform(X_train)
print(train_data_features.shape)

(270000, 12000)


In [65]:
# Vectorize the validation and test splits with the vectorizer fitted on the training split.
eval_data_features, test_data_features = (
    vectorizer.transform(split) for split in (X_val, X_test)
)

In [66]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it. list() keeps the plain-list display of the sample.
list(vectorizer.get_feature_names_out()[200:210])



['aegir',
 'aem',
 'aes',
 'aesthetic',
 'af',
 'afaict',
 'afaik',
 'affect',
 'affected',
 'affecting']

## Model

In [68]:
def analysis(labels, predictions):
    """Print a classification report, a confusion matrix and the accuracy.

    Args:
        labels: True class labels.
        predictions: Predicted class labels, compared against ``labels``.
    """
    class_names = ['Bug', 'Feature', 'Question']

    report = classification_report(labels, predictions, target_names=class_names)
    print("Report Classification: \n", report)

    matrix = confusion_matrix(labels, predictions)
    print("Matrix Confusion: \n", matrix)

    accuracy = accuracy_score(labels, predictions)
    print("Accuracy: \n", accuracy)

### Logistic Regression

In [73]:
### logistic regression

# Grid-search the regularization strength C on the training features.
grid_values = dict(penalty=['l2'], C=[1, 10])
clf = GridSearchCV(
    LogisticRegression(random_state=0, solver='liblinear', max_iter=1000, multi_class='ovr'),
    param_grid=grid_values,
    scoring='f1_micro',
)
clf.fit(train_data_features, y_train)

print("-------LR-------:")
print("Best parameters set found on development set:")
print(clf.best_params_)
print(clf.best_estimator_)

# Evaluate on the held-out test split.
Y_test_pred = clf.predict(test_data_features)
analysis(y_test, Y_test_pred)


-------LR-------:
Best parameters set found on development set:
{'C': 1, 'penalty': 'l2'}
LogisticRegression(C=1, max_iter=1000, multi_class='ovr', random_state=0,
                   solver='liblinear')
Report Classification: 
               precision    recall  f1-score   support

         Bug       0.78      0.83      0.80     40165
     Feature       0.78      0.83      0.80     41329
    Question       0.63      0.22      0.32      8506

    accuracy                           0.77     90000
   macro avg       0.73      0.62      0.64     90000
weighted avg       0.76      0.77      0.76     90000

Matrix Confusion: 
 [[33357  6332   476]
 [ 6527 34173   629]
 [ 3119  3536  1851]]
Accuracy: 
 0.7709


### KNN

In [None]:
### knn

# Search k = 1..30 with both neighbor-weighting options.
k_range = list(range(1, 31))
weight_options = ["uniform", "distance"]
param_grid = {'n_neighbors': k_range, 'weights': weight_options}

knn = KNeighborsClassifier()
clf = GridSearchCV(knn, param_grid, scoring='f1_micro')
clf.fit(train_data_features, y_train)

print("-------KNN-------:")
print("Best parameters set found on development set:")
print(clf.best_params_)
print(clf.best_estimator_)

# Evaluate on the held-out test split.
Y_test_pred = clf.predict(test_data_features)
analysis(y_test, Y_test_pred)


### SVM 

In [None]:
### svm

# Grid-search C for an RBF-kernel SVM on the training features.
tuned_parameters = [dict(kernel=['rbf'], C=[1, 10])]
clf = GridSearchCV(
    SVC(),
    tuned_parameters,
    scoring='f1_micro',
)
clf.fit(train_data_features, y_train)

print("-------SVM-------:")
print("Best parameters set found on development set:")
print(clf.best_params_)

# Evaluate on the held-out test split.
Y_test_pred = clf.predict(test_data_features)
analysis(y_test, Y_test_pred)
