<a href="https://colab.research.google.com/github/kargaranamir/issue-tagger/blob/main/TFIDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data and Start

## Data Download

In [None]:
! wget https://machinehack-be.s3.amazonaws.com/predict_github_issues_embold_sponsored_hackathon/Embold_Participant%27s_Dataset.zip -O data.zip
! unzip ./data.zip 
! mv ./Embold_Participant\'s_Dataset ./data
! rm -rf ./data/sample\ submission.csv
! rm -rf ./data/embold_test.json

## Import Libraries

In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC



import re

from sklearn.model_selection import PredefinedSplit, RandomizedSearchCV
from sklearn.pipeline import Pipeline

import nltk

from nltk.corpus import stopwords
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import string

from tqdm.notebook import tqdm
tqdm.pandas()


In [None]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

## Load Data

In [6]:
# merge data
data_small_df = pd.read_json('./data/embold_train.json').reset_index(drop=True)
data_large_df = pd.read_json('./data/embold_train_extra.json').reset_index(drop=True)
data_df = data_small_df.append(data_large_df)
data_df['text'] = data_df['title']+' '+data_df['body']
data_df['text_length'] = data_df['text'].apply(lambda text_input: len(text_input.split()))

In [7]:
# heaf of data
data_df.head()

Unnamed: 0,title,body,label,text,text_length
0,y-zoom piano roll,a y-zoom on the piano roll would be useful.,1,y-zoom piano roll a y-zoom on the piano roll w...,12
1,buggy behavior in selection,! screenshot from 2016-02-23 21 27 40 https:/...,0,buggy behavior in selection ! screenshot from ...,13
2,auto update feature,"hi,\r \r great job so far, @saenzramiro ! : \r...",1,"auto update feature hi,\r \r great job so far,...",35
3,filter out noisy endpoints in logs,i think we should stop logging requests to:\r ...,1,filter out noisy endpoints in logs i think we ...,23
4,enable pid on / pid off alarm actions for ardu...,expected behavior\r alarm actions pid on and p...,0,enable pid on / pid off alarm actions for ardu...,291


In [24]:
stopwords_list = stopwords.words('english')

def lemmatize(text):
    list_pos = 0
    cleaned_str = ''
    lmtzr = WordNetLemmatizer()
    
    tagged_words = pos_tag(text)
    for word in tagged_words:
        if 'v' in word[1].lower():
            lemma = lmtzr.lemmatize(word[0], pos='v')
        else:
            lemma = lmtzr.lemmatize(word[0], pos='n')
        if list_pos == 0:
            cleaned_str = lemma
        else:
            cleaned_str = cleaned_str + ' ' + lemma
        list_pos += 1
    return cleaned_str


def clean_text(text, lowercase=True, stop_words=True, lemmatization=True):
    text = text.replace("\\r", "")
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)

    if lowercase:
        text = text.lower()

    text = re.sub(r'\bid\b', 'i would', text) #start abbreviation
    text = re.sub(r'\bive\b', 'i have', text)
    text = re.sub(r'\bim\b', 'i am', text)
    text = re.sub(r'\bcant\b', 'can not', text)
    text = re.sub(r'\bdont\b', 'do not', text)
    text = re.sub(r'\bwont\b', 'will not', text)
    text = re.sub(r'\bthats\b', 'that is', text) #end abbreviation

    text = re.sub('https?://\S+|www\.\S+', '', text) #remove links    
    text = re.sub(r'[^\x00-\x7F]+',' ', text) #remove non-ascii

    if lemmatization:
        word_list = nltk.word_tokenize(text)
        text = lemmatize(word_list)

    if stop_words:
        text = " ".join([word for word in text.split() if word not in stopwords_list])

    text = re.sub('\w*\d\w*', '', text) #numbers in words
    text = re.sub('[0-9]+', '', text) # delete numbers
    return text

In [28]:
# example
clean_text("Alex goes to google and see a bad person")

'Alex go google see bad person'

In [29]:
data_df['text_clean_lowercase_stopwords_lemmatize'] = data_df['text'].progress_apply(lambda text: clean_text(text))
data_df['text_clean_lowercase_stopwords'] = data_df['text'].progress_apply(lambda text: clean_text(text, lemmatization=False))
data_df['text_clean_lowercase'] = data_df['text'].progress_apply(lambda text: clean_text(text, lemmatization=False, stop_words=False))
data_df['text_clean'] = data_df['text'].progress_apply(lambda text: clean_text(text, lemmatization=False, stop_words=False, lowercase=False))


  0%|          | 0/450000 [00:00<?, ?it/s]

  0%|          | 0/450000 [00:00<?, ?it/s]

  0%|          | 0/450000 [00:00<?, ?it/s]

  0%|          | 0/450000 [00:00<?, ?it/s]

# Methods

In [30]:
def analysis(labels, predictions):
    print("Report Classification: \n", classification_report(labels, predictions, target_names=['Bug', 'Feature', 'Question']))
    print("Matrix Confusion: \n", confusion_matrix(labels, predictions))
    print("Accuracy: \n", accuracy_score(labels,predictions))

## Lower Case + Stop Words + Lemmatization

### Split Data

In [31]:
label_encoder = LabelEncoder()

X = data_df['text_clean_lowercase_stopwords_lemmatize'].values
y = label_encoder.fit_transform(data_df['label'])

In [32]:
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.125, random_state=1) # 0.125 x 0.8 = 0.1


### TFIDF

In [33]:
vectorizer = TfidfVectorizer(min_df=1, ngram_range = (1,1), max_features=12000)
train_data_features = vectorizer.fit_transform(X_train)
print(train_data_features.shape)

(315000, 12000)


In [34]:
val_data_features = vectorizer.transform(X_val)
test_data_features = vectorizer.transform(X_test)
vectorizer.get_feature_names()[200:210]



['aframe',
 'afternoon',
 'afterwards',
 'ag',
 'agda',
 'age',
 'agency',
 'agenda',
 'agent',
 'agg']

### Model

#### Logistic Regression

In [36]:
### logistic regression

clf = LogisticRegression(C=1, penalty = 'l2', random_state=0, solver='liblinear', max_iter=1000, multi_class='ovr')

clf.fit(train_data_features, y_train)

print("-------LR-------:")
print("-------C=1-------")
print("-------Validation-------:")
Y_val_pred = clf.predict(val_data_features)
analysis(y_val, Y_val_pred)

print("-------Test-------:")
Y_test_pred = clf.predict(test_data_features)
analysis(y_test, Y_test_pred)


-------LR-------:
-------C=1-------
-------Validation-------:
Report Classification: 
               precision    recall  f1-score   support

         Bug       0.77      0.82      0.79     20121
     Feature       0.76      0.83      0.79     20684
    Question       0.64      0.21      0.31      4195

    accuracy                           0.76     45000
   macro avg       0.72      0.62      0.63     45000
weighted avg       0.76      0.76      0.75     45000

Matrix Confusion: 
 [[16423  3478   220]
 [ 3330 17079   275]
 [ 1491  1839   865]]
Accuracy: 
 0.7637111111111111
-------Test-------:
Report Classification: 
               precision    recall  f1-score   support

         Bug       0.77      0.82      0.79     40165
     Feature       0.76      0.82      0.79     41329
    Question       0.63      0.20      0.31      8506

    accuracy                           0.76     90000
   macro avg       0.72      0.61      0.63     90000
weighted avg       0.75      0.76      0.75   

#### SVM 

In [37]:
### svm 
clf = SVC(C=1, kernel= 'rbf', max_iter = 1000)
clf.fit(train_data_features, y_train)

print("-------SVM-------:")
print("-------kernel=rbf-------")
print("-------Validation-------:")
Y_val_pred = clf.predict(val_data_features)
analysis(y_val, Y_val_pred)

print("-------Test-------:")
Y_test_pred = clf.predict(test_data_features)
analysis(y_test, Y_test_pred)




-------SVM-------:
-------kernel=rbf-------
-------Validation-------:
Report Classification: 
               precision    recall  f1-score   support

         Bug       0.57      0.67      0.61     20121
     Feature       0.62      0.51      0.56     20684
    Question       0.21      0.22      0.21      4195

    accuracy                           0.55     45000
   macro avg       0.47      0.46      0.46     45000
weighted avg       0.56      0.55      0.55     45000

Matrix Confusion: 
 [[13433  5179  1509]
 [ 8267 10516  1901]
 [ 2000  1281   914]]
Accuracy: 
 0.5525111111111111
-------Test-------:
Report Classification: 
               precision    recall  f1-score   support

         Bug       0.57      0.67      0.61     40165
     Feature       0.62      0.51      0.56     41329
    Question       0.21      0.22      0.22      8506

    accuracy                           0.55     90000
   macro avg       0.47      0.47      0.46     90000
weighted avg       0.56      0.55     

In [38]:
### svm 
clf = SVC(C=1, kernel= 'linear', max_iter = 1000)
clf.fit(train_data_features, y_train)

print("-------SVM-------:")
print("-------kernel=linear-------")
print("-------Validation-------:")
Y_val_pred = clf.predict(val_data_features)
analysis(y_val, Y_val_pred)

print("-------Test-------:")
Y_test_pred = clf.predict(test_data_features)
analysis(y_test, Y_test_pred)




-------SVM-------:
-------kernel=linear-------
-------Validation-------:
Report Classification: 
               precision    recall  f1-score   support

         Bug       0.54      0.57      0.56     20121
     Feature       0.57      0.46      0.51     20684
    Question       0.16      0.27      0.20      4195

    accuracy                           0.49     45000
   macro avg       0.42      0.43      0.42     45000
weighted avg       0.52      0.49      0.50     45000

Matrix Confusion: 
 [[11535  5899  2687]
 [ 8101  9445  3138]
 [ 1781  1302  1112]]
Accuracy: 
 0.49093333333333333
-------Test-------:
Report Classification: 
               precision    recall  f1-score   support

         Bug       0.54      0.57      0.56     40165
     Feature       0.56      0.45      0.50     41329
    Question       0.16      0.26      0.20      8506

    accuracy                           0.49     90000
   macro avg       0.42      0.43      0.42     90000
weighted avg       0.51      0.49 

In [40]:
### svm 
clf = SVC(C=1, kernel= 'poly', max_iter = 1000)
clf.fit(train_data_features, y_train)

print("-------SVM-------:")
print("-------kernel=poly-------")
print("-------Validation-------:")
Y_val_pred = clf.predict(val_data_features)
analysis(y_val, Y_val_pred)

print("-------Test-------:")
Y_test_pred = clf.predict(test_data_features)
analysis(y_test, Y_test_pred)




-------SVM-------:
-------kernel=poly-------
-------Validation-------:
Report Classification: 
               precision    recall  f1-score   support

         Bug       0.47      0.92      0.63     20121
     Feature       0.70      0.16      0.26     20684
    Question       0.22      0.06      0.10      4195

    accuracy                           0.49     45000
   macro avg       0.46      0.38      0.33     45000
weighted avg       0.55      0.49      0.41     45000

Matrix Confusion: 
 [[18532  1166   423]
 [16885  3292   507]
 [ 3660   276   259]]
Accuracy: 
 0.49073333333333335
-------Test-------:
Report Classification: 
               precision    recall  f1-score   support

         Bug       0.47      0.92      0.62     40165
     Feature       0.69      0.16      0.26     41329
    Question       0.19      0.06      0.09      8506

    accuracy                           0.49     90000
   macro avg       0.45      0.38      0.32     90000
weighted avg       0.55      0.49   

## Lower Case + Stop Words

### Split Data

In [41]:
label_encoder = LabelEncoder()

X = data_df['text_clean_lowercase_stopwords'].values
y = label_encoder.fit_transform(data_df['label'])

In [42]:
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.125, random_state=1) # 0.125 x 0.8 = 0.1


### TFIDF

In [43]:
vectorizer = TfidfVectorizer(min_df=1, ngram_range = (1,1), max_features=12000)
train_data_features = vectorizer.fit_transform(X_train)
print(train_data_features.shape)

(315000, 12000)


In [44]:
val_data_features = vectorizer.transform(X_val)
test_data_features = vectorizer.transform(X_test)
vectorizer.get_feature_names()[200:210]



['af',
 'afaict',
 'afaik',
 'affect',
 'affected',
 'affecting',
 'affects',
 'affiliate',
 'affiliated',
 'affiliation']

### Model

#### Logistic Regression

In [45]:
### logistic regression

clf = LogisticRegression(C=1, penalty = 'l2', random_state=0, solver='liblinear', max_iter=1000, multi_class='ovr')

clf.fit(train_data_features, y_train)

print("-------LR-------:")
print("-------C=1-------")
print("-------Validation-------:")
Y_val_pred = clf.predict(val_data_features)
analysis(y_val, Y_val_pred)

print("-------Test-------:")
Y_test_pred = clf.predict(test_data_features)
analysis(y_test, Y_test_pred)


-------LR-------:
-------C=1-------
-------Validation-------:
Report Classification: 
               precision    recall  f1-score   support

         Bug       0.78      0.83      0.80     20121
     Feature       0.78      0.83      0.80     20684
    Question       0.64      0.22      0.33      4195

    accuracy                           0.77     45000
   macro avg       0.73      0.63      0.65     45000
weighted avg       0.76      0.77      0.76     45000

Matrix Confusion: 
 [[16662  3213   246]
 [ 3198 17199   287]
 [ 1491  1772   932]]
Accuracy: 
 0.7731777777777777
-------Test-------:
Report Classification: 
               precision    recall  f1-score   support

         Bug       0.78      0.83      0.80     40165
     Feature       0.77      0.83      0.80     41329
    Question       0.62      0.21      0.32      8506

    accuracy                           0.77     90000
   macro avg       0.72      0.62      0.64     90000
weighted avg       0.76      0.77      0.75   

#### SVM 

In [46]:
### svm 
clf = SVC(C=1, kernel= 'rbf', max_iter = 1000)
clf.fit(train_data_features, y_train)

print("-------SVM-------:")
print("-------kernel=rbf-------")
print("-------Validation-------:")
Y_val_pred = clf.predict(val_data_features)
analysis(y_val, Y_val_pred)

print("-------Test-------:")
Y_test_pred = clf.predict(test_data_features)
analysis(y_test, Y_test_pred)




-------SVM-------:
-------kernel=rbf-------
-------Validation-------:
Report Classification: 
               precision    recall  f1-score   support

         Bug       0.57      0.69      0.62     20121
     Feature       0.63      0.51      0.57     20684
    Question       0.20      0.16      0.18      4195

    accuracy                           0.56     45000
   macro avg       0.47      0.46      0.46     45000
weighted avg       0.56      0.56      0.56     45000

Matrix Confusion: 
 [[13980  4774  1367]
 [ 8693 10644  1347]
 [ 2063  1454   678]]
Accuracy: 
 0.5622666666666667
-------Test-------:
Report Classification: 
               precision    recall  f1-score   support

         Bug       0.56      0.69      0.62     40165
     Feature       0.63      0.51      0.57     41329
    Question       0.21      0.17      0.19      8506

    accuracy                           0.56     90000
   macro avg       0.47      0.46      0.46     90000
weighted avg       0.56      0.56     

## Lower Case

### Split Data

In [47]:
label_encoder = LabelEncoder()

X = data_df['text_clean_lowercase'].values
y = label_encoder.fit_transform(data_df['label'])

In [48]:
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.125, random_state=1) # 0.125 x 0.8 = 0.1


### TFIDF

In [49]:
vectorizer = TfidfVectorizer(min_df=1, ngram_range = (1,1), max_features=12000)
train_data_features = vectorizer.fit_transform(X_train)
print(train_data_features.shape)

(315000, 12000)


In [50]:
val_data_features = vectorizer.transform(X_val)
test_data_features = vectorizer.transform(X_test)
vectorizer.get_feature_names()[200:210]



['afaict',
 'afaik',
 'affect',
 'affected',
 'affecting',
 'affects',
 'affiliate',
 'affiliated',
 'affiliation',
 'affinity']

### Model

#### Logistic Regression

In [51]:
### logistic regression

clf = LogisticRegression(C=1, penalty = 'l2', random_state=0, solver='liblinear', max_iter=1000, multi_class='ovr')

clf.fit(train_data_features, y_train)

print("-------LR-------:")
print("-------C=1-------")
print("-------Validation-------:")
Y_val_pred = clf.predict(val_data_features)
analysis(y_val, Y_val_pred)

print("-------Test-------:")
Y_test_pred = clf.predict(test_data_features)
analysis(y_test, Y_test_pred)


-------LR-------:
-------C=1-------
-------Validation-------:
Report Classification: 
               precision    recall  f1-score   support

         Bug       0.80      0.84      0.82     20121
     Feature       0.79      0.84      0.81     20684
    Question       0.66      0.29      0.40      4195

    accuracy                           0.79     45000
   macro avg       0.75      0.66      0.68     45000
weighted avg       0.78      0.79      0.78     45000

Matrix Confusion: 
 [[16932  2912   277]
 [ 2998 17338   348]
 [ 1364  1629  1202]]
Accuracy: 
 0.7882666666666667
-------Test-------:
Report Classification: 
               precision    recall  f1-score   support

         Bug       0.79      0.84      0.82     40165
     Feature       0.79      0.83      0.81     41329
    Question       0.65      0.29      0.40      8506

    accuracy                           0.79     90000
   macro avg       0.75      0.66      0.68     90000
weighted avg       0.78      0.79      0.78   

#### SVM 

In [52]:
### svm 
clf = SVC(C=1, kernel= 'rbf', max_iter = 1000)
clf.fit(train_data_features, y_train)

print("-------SVM-------:")
print("-------kernel=rbf-------")
print("-------Validation-------:")
Y_val_pred = clf.predict(val_data_features)
analysis(y_val, Y_val_pred)

print("-------Test-------:")
Y_test_pred = clf.predict(test_data_features)
analysis(y_test, Y_test_pred)




-------SVM-------:
-------kernel=rbf-------
-------Validation-------:
Report Classification: 
               precision    recall  f1-score   support

         Bug       0.48      0.90      0.63     20121
     Feature       0.82      0.12      0.21     20684
    Question       0.24      0.24      0.24      4195

    accuracy                           0.48     45000
   macro avg       0.51      0.42      0.36     45000
weighted avg       0.61      0.48      0.40     45000

Matrix Confusion: 
 [[18156   383  1582]
 [16494  2546  1644]
 [ 3000   178  1017]]
Accuracy: 
 0.48264444444444443
-------Test-------:
Report Classification: 
               precision    recall  f1-score   support

         Bug       0.48      0.90      0.63     40165
     Feature       0.81      0.12      0.21     41329
    Question       0.23      0.24      0.23      8506

    accuracy                           0.48     90000
   macro avg       0.51      0.42      0.36     90000
weighted avg       0.61      0.48    

## Simple Preprocessing

### Split Data

In [53]:
label_encoder = LabelEncoder()

X = data_df['text_clean'].values
y = label_encoder.fit_transform(data_df['label'])

In [54]:
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.125, random_state=1) # 0.125 x 0.8 = 0.1


### TFIDF

In [55]:
vectorizer = TfidfVectorizer(min_df=1, ngram_range = (1,1), max_features=12000)
train_data_features = vectorizer.fit_transform(X_train)
print(train_data_features.shape)

(315000, 12000)


In [56]:
val_data_features = vectorizer.transform(X_val)
test_data_features = vectorizer.transform(X_test)
vectorizer.get_feature_names()[200:210]



['afaict',
 'afaik',
 'affect',
 'affected',
 'affecting',
 'affects',
 'affiliate',
 'affiliated',
 'affiliation',
 'affinity']

### Model

#### Logistic Regression

In [57]:
### logistic regression

clf = LogisticRegression(C=1, penalty = 'l2', random_state=0, solver='liblinear', max_iter=1000, multi_class='ovr')

clf.fit(train_data_features, y_train)

print("-------LR-------:")
print("-------C=1-------")
print("-------Validation-------:")
Y_val_pred = clf.predict(val_data_features)
analysis(y_val, Y_val_pred)

print("-------Test-------:")
Y_test_pred = clf.predict(test_data_features)
analysis(y_test, Y_test_pred)


-------LR-------:
-------C=1-------
-------Validation-------:
Report Classification: 
               precision    recall  f1-score   support

         Bug       0.80      0.84      0.82     20121
     Feature       0.79      0.84      0.81     20684
    Question       0.66      0.29      0.40      4195

    accuracy                           0.79     45000
   macro avg       0.75      0.66      0.68     45000
weighted avg       0.78      0.79      0.78     45000

Matrix Confusion: 
 [[16932  2912   277]
 [ 2998 17338   348]
 [ 1364  1629  1202]]
Accuracy: 
 0.7882666666666667
-------Test-------:
Report Classification: 
               precision    recall  f1-score   support

         Bug       0.79      0.84      0.82     40165
     Feature       0.79      0.83      0.81     41329
    Question       0.65      0.29      0.40      8506

    accuracy                           0.79     90000
   macro avg       0.75      0.66      0.68     90000
weighted avg       0.78      0.79      0.78   

#### SVM 

In [58]:
### svm 
clf = SVC(C=1, kernel= 'rbf', max_iter = 1000)
clf.fit(train_data_features, y_train)

print("-------SVM-------:")
print("-------kernel=rbf-------")
print("-------Validation-------:")
Y_val_pred = clf.predict(val_data_features)
analysis(y_val, Y_val_pred)

print("-------Test-------:")
Y_test_pred = clf.predict(test_data_features)
analysis(y_test, Y_test_pred)




-------SVM-------:
-------kernel=rbf-------
-------Validation-------:
Report Classification: 
               precision    recall  f1-score   support

         Bug       0.48      0.90      0.63     20121
     Feature       0.82      0.12      0.21     20684
    Question       0.24      0.24      0.24      4195

    accuracy                           0.48     45000
   macro avg       0.51      0.42      0.36     45000
weighted avg       0.61      0.48      0.40     45000

Matrix Confusion: 
 [[18156   383  1582]
 [16494  2546  1644]
 [ 3000   178  1017]]
Accuracy: 
 0.48264444444444443
-------Test-------:
Report Classification: 
               precision    recall  f1-score   support

         Bug       0.48      0.90      0.63     40165
     Feature       0.81      0.12      0.21     41329
    Question       0.23      0.24      0.23      8506

    accuracy                           0.48     90000
   macro avg       0.51      0.42      0.36     90000
weighted avg       0.61      0.48    

## Final Models with more max runs

In [60]:
label_encoder = LabelEncoder()

X = data_df['text_clean_lowercase_stopwords'].values
y = label_encoder.fit_transform(data_df['label'])

X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.125, random_state=1) # 0.125 x 0.8 = 0.1
vectorizer = TfidfVectorizer(min_df=1, ngram_range = (1,1), max_features=12000)
train_data_features = vectorizer.fit_transform(X_train)
print(train_data_features.shape)
val_data_features = vectorizer.transform(X_val)
test_data_features = vectorizer.transform(X_test)

#### SVM 

In [65]:
### svm 
clf = SVC(C=1, kernel= 'rbf', max_iter = 10000)
clf.fit(train_data_features, y_train)

print("-------SVM-------:")
print("-------kernel=rbf-------")
print("-------Validation-------:")
Y_val_pred = clf.predict(val_data_features)
analysis(y_val, Y_val_pred)

print("-------Test-------:")
Y_test_pred = clf.predict(test_data_features)
analysis(y_test, Y_test_pred)




-------SVM-------:
-------kernel=rbf-------
-------Validation-------:
Report Classification: 
               precision    recall  f1-score   support

         Bug       0.66      0.67      0.66     20121
     Feature       0.66      0.66      0.66     20684
    Question       0.32      0.29      0.31      4195

    accuracy                           0.63     45000
   macro avg       0.55      0.54      0.54     45000
weighted avg       0.63      0.63      0.63     45000

Matrix Confusion: 
 [[13566  5494  1061]
 [ 5597 13572  1515]
 [ 1530  1433  1232]]
Accuracy: 
 0.6304444444444445
-------Test-------:
Report Classification: 
               precision    recall  f1-score   support

         Bug       0.65      0.67      0.66     40165
     Feature       0.66      0.65      0.66     41329
    Question       0.31      0.29      0.30      8506

    accuracy                           0.62     90000
   macro avg       0.54      0.54      0.54     90000
weighted avg       0.62      0.62     

In [66]:
label_encoder = LabelEncoder()

X = data_df['text_clean_lowercase'].values
y = label_encoder.fit_transform(data_df['label'])

X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.125, random_state=1) # 0.125 x 0.8 = 0.1

vectorizer = TfidfVectorizer(min_df=1, ngram_range = (1,1), max_features=12000)
train_data_features = vectorizer.fit_transform(X_train)

val_data_features = vectorizer.transform(X_val)
test_data_features = vectorizer.transform(X_test)

#### Logistic Regression

In [67]:
### logistic regression

clf = LogisticRegression(C=1, penalty = 'l2', random_state=0, solver='liblinear', max_iter=1000, multi_class='ovr')

clf.fit(train_data_features, y_train)

print("-------LR-------:")
print("-------C=1-------")
print("-------Validation-------:")
Y_val_pred = clf.predict(val_data_features)
analysis(y_val, Y_val_pred)

print("-------Test-------:")
Y_test_pred = clf.predict(test_data_features)
analysis(y_test, Y_test_pred)


-------LR-------:
-------C=1-------
-------Validation-------:
Report Classification: 
               precision    recall  f1-score   support

         Bug       0.80      0.84      0.82     20121
     Feature       0.79      0.84      0.81     20684
    Question       0.66      0.29      0.40      4195

    accuracy                           0.79     45000
   macro avg       0.75      0.66      0.68     45000
weighted avg       0.78      0.79      0.78     45000

Matrix Confusion: 
 [[16932  2912   277]
 [ 2998 17338   348]
 [ 1364  1629  1202]]
Accuracy: 
 0.7882666666666667
-------Test-------:
Report Classification: 
               precision    recall  f1-score   support

         Bug       0.79      0.84      0.82     40165
     Feature       0.79      0.83      0.81     41329
    Question       0.65      0.29      0.40      8506

    accuracy                           0.79     90000
   macro avg       0.75      0.66      0.68     90000
weighted avg       0.78      0.79      0.78   