### Importing Necessary Libraries

In [1]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import string as s
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
%matplotlib inline
from sklearn.feature_extraction.text  import TfidfVectorizer
from sklearn.metrics  import f1_score, accuracy_score, multilabel_confusion_matrix, confusion_matrix, recall_score, precision_score
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import sklearn
from keras.utils import to_categorical
import itertools

Using TensorFlow backend.


### Data Pre-Processing

In [2]:
train_df = pd.read_json("embold_train.json").reset_index(drop=True)
train_df.head()

Unnamed: 0,title,body,label
0,y-zoom piano roll,a y-zoom on the piano roll would be useful.,1
1,buggy behavior in selection,! screenshot from 2016-02-23 21 27 40 https:/...,0
2,auto update feature,"hi,\r \r great job so far, @saenzramiro ! : \r...",1
3,filter out noisy endpoints in logs,i think we should stop logging requests to:\r ...,1
4,enable pid on / pid off alarm actions for ardu...,expected behavior\r alarm actions pid on and p...,0


In [3]:
test_df = pd.read_json("embold_test.json").reset_index(drop=True)
test_df.head()

Unnamed: 0,title,body
0,config question path-specific environment var...,issue description or question\r \r hey @artemg...
1,crash indien vol,de simulator crasht als hij vol zit
2,unable to mine rocks,"sarkasmo starting today, when i hit enter act..."
3,not all whitelists are processed,create following rules... order of creation is...
4,add ctx menu for idafree 70 and idafree 5,"associated with .dll, .dll_, .exe, .exe_, .sc,..."


In [4]:
train_ex_df = pd.read_json("embold_train_extra.json")
train_ex_df.head()

Unnamed: 0,title,body,label
0,use a 8bit typeface,since this is meant to emulate some old arcade...,1
1,implement wireless m-bus binding,_from chris.pa...@googlemail.com https://cod...,1
2,add multilang support for timeago.js,currently it is only en . \r required to add ...,1
3,scaleway - seg-fault on shutdown,tbr irc creates a new scaleway instance with...,0
4,sistema de pintura: no se guardar los nuevos p...,este sp ya estaba asignado a un carro y se enc...,0


In [5]:
# Mapping news_id with category_id
final_df = pd.concat([train_df, train_ex_df], ignore_index=True)
final_df.head()

Unnamed: 0,title,body,label
0,y-zoom piano roll,a y-zoom on the piano roll would be useful.,1
1,buggy behavior in selection,! screenshot from 2016-02-23 21 27 40 https:/...,0
2,auto update feature,"hi,\r \r great job so far, @saenzramiro ! : \r...",1
3,filter out noisy endpoints in logs,i think we should stop logging requests to:\r ...,1
4,enable pid on / pid off alarm actions for ardu...,expected behavior\r alarm actions pid on and p...,0


In [6]:
# Printing Sample Title
final_df.iloc[7][0]

'proposal  loadtranslation   to lazy load scopewithtranslation'

In [7]:
# Printing Sample Body
final_df.iloc[7][1]

"php\\r public function loadtranslation  \\r {\\r     return $this->load  \\r         'translations' => function  relation $query  {\\r             if  $this->usefallback    {\\r                 $locale = $this->locale  ;\\r                 $countryfallbacklocale = $this->getfallbacklocale $locale ; // e.g. de-de => de\\r                 $locales = array_unique  $locale, $countryfallbacklocale, $this->getfallbacklocale    ;\\r \\r                 return $query->wherein $this->gettranslationstable  .'.'.$this->getlocalekey  , $locales ;\\r             }\\r \\r             return $query->where $this->gettranslationstable  .'.'.$this->getlocalekey  , $this->locale   ;\\r         },\\r       ;\\r }\\r    \\r \\r or maybe you could do this\\r \\r    php\\r public function loadtranslation  \\r {\\r     $query = $this->newquerywithoutrelationships  ->withtranslation  ;\\r     $query->eagerloadrelations  $this  ;\\r \\r     return $this;\\r }\\r"

In [8]:
# Checking for missing snippets/titles/descriptions
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450000 entries, 0 to 449999
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   title   450000 non-null  object
 1   body    450000 non-null  object
 2   label   450000 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 10.3+ MB


In [9]:
# Check for duplicates
final_df.drop_duplicates(keep='first').count()

title    450000
body     450000
label    450000
dtype: int64

We can observer here, there are no duplicates in the train data

In [10]:
categories = ['Bug','Feature','Question']

In [11]:
# Converting each of title and body into lower case.
final_df['title'] = final_df['title'].apply(lambda title: str(title).lower())
final_df['body'] = final_df['body'].apply(lambda body: str(body).lower())
test_df['title'] = test_df['title'].apply(lambda title: str(title).lower())
test_df['body'] = test_df['body'].apply(lambda body: str(body).lower())

In [12]:
#calculating the length of title and body
final_df['title_len'] = final_df['title'].apply(lambda x: len(str(x).split()))
final_df['body_len'] = final_df['body'].apply(lambda x: len(str(x).split()))

In [13]:
final_df.describe()

Unnamed: 0,label,title_len,body_len
count,450000.0,450000.0,450000.0
mean,0.648267,6.951033,73.072449
std,0.644653,3.269366,85.840179
min,0.0,2.0,2.0
25%,0.0,5.0,21.0
50%,1.0,6.0,44.0
75%,1.0,9.0,90.0
max,2.0,50.0,982.0


As we can see, all articles have a title and body. Going with the intuition that the title is often more descriptive of the category, as well as to provide more text data to the model, we will add title to the body

In [14]:
def fx(x):
    return x['title'] + " " + x['body']   
final_df['text']=final_df.apply(lambda x : fx(x),axis=1)
test_df['text']=test_df.apply(lambda x : fx(x),axis=1)

In [15]:
final_df.head()

Unnamed: 0,title,body,label,title_len,body_len,text
0,y-zoom piano roll,a y-zoom on the piano roll would be useful.,1,3,9,y-zoom piano roll a y-zoom on the piano roll w...
1,buggy behavior in selection,! screenshot from 2016-02-23 21 27 40 https:/...,0,4,9,buggy behavior in selection ! screenshot from ...
2,auto update feature,"hi,\r \r great job so far, @saenzramiro ! : \r...",1,3,32,"auto update feature hi,\r \r great job so far,..."
3,filter out noisy endpoints in logs,i think we should stop logging requests to:\r ...,1,6,17,filter out noisy endpoints in logs i think we ...
4,enable pid on / pid off alarm actions for ardu...,expected behavior\r alarm actions pid on and p...,0,10,281,enable pid on / pid off alarm actions for ardu...


The data is preprocessed, in NLP it is also known as text normalization. Some of the most common methods of text normalization are:
- Tokenization
- Lemmatization
- Stemming

#### Tokenization of Data

In [16]:
def tokenization(text):
    lst=text.split()
    return lst

#### Replace new lines

In [17]:
def remove_new_lines(lst):
    new_lst=[]
    for i in lst:
        i=i.replace(r'\n', ' ').replace(r'\r', ' ').replace(r'\u', ' ')
        new_lst.append(i.strip())
    return new_lst

#### Removal of Punctuation Symbols

In [18]:
def remove_punctuations(lst):
    new_lst=[]
    for i in lst:
        for  j in s.punctuation:
            i=i.replace(j,' ')
        new_lst.append(i.strip())
    return new_lst

#### Removal of Numbers(digits)

In [19]:
def remove_numbers(lst):
    nodig_lst=[]
    new_lst=[]
    for i in  lst:
        for j in  s.digits:
            i=i.replace(j,' ')
        nodig_lst.append(i.strip())
    for i in  nodig_lst:
        if  i!='':
            new_lst.append(i.strip())
    return new_lst

#### Removal of Stopwords

In [20]:
def remove_stopwords(lst):
    stop=stopwords.words('english')
    new_lst=[]
    for i in lst:
        if i not in stop:
            new_lst.append(i.strip())
    return new_lst

#### Lemmatization of Data

In [21]:
lemmatizer=nltk.stem.WordNetLemmatizer()
def lemmatization(lst):
    new_lst=[]
    for i in lst:
        i=lemmatizer.lemmatize(i)
        new_lst.append(i.strip())
    return new_lst

#### Removing URL's

In [22]:
def remove_urls(text):
    return re.sub(r'http\S+', ' ', text)

#### Split words

In [23]:
def split_words(text):
    return ' '.join(text).split()

#### Remove single letter characters

In [24]:
def remove_single_chars(lst):
    new_lst=[]
    for i in lst:
        if len(i)>1:
            new_lst.append(i.strip())
    return new_lst

In [25]:
# Cleaning Text
def denoise_text(text):
    text = remove_urls(text)
    text = tokenization(text)
    text = remove_new_lines(text)
    text = remove_punctuations(text)
    text = remove_numbers(text)
    text = remove_stopwords(text)
    text = split_words(text)
    text = remove_single_chars(text)
    text = lemmatization(text)
    return text

final_df['text'] = final_df['text'].apply(lambda x: denoise_text(x))
test_df['text'] = test_df['text'].apply(lambda x: denoise_text(x))

#### Creating Corpus of Words in Text

In [26]:
# Word Corpus
def get_corpus(text):
    words = []
    for i in text:
        for j in i:
            words.append(j.strip())
    return words
corpus = get_corpus(final_df.text)
corpus[:5]

['zoom', 'piano', 'roll', 'zoom', 'piano']

In [27]:
corpus += get_corpus(test_df.text)

In [28]:
# Most common words
from collections import Counter
counter = Counter(corpus)
most_common = counter.most_common(10)
most_common = dict(most_common)
most_common

{'file': 189039,
 'error': 155620,
 'version': 135163,
 'java': 134569,
 'user': 128590,
 'add': 110787,
 'issue': 102101,
 'test': 97858,
 'line': 94374,
 'use': 91000}

In [29]:
from sklearn.feature_extraction.text import CountVectorizer
def get_top_text_ngrams(corpus, n, g):
    vec = CountVectorizer(ngram_range=(g, g)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

#### Train-Test Split

In [30]:
#label encoding the categories. After this each category would be mapped to an integer.
encoder = LabelEncoder()
final_df['categoryEncoded'] = encoder.fit_transform(final_df['label'])

In [31]:
X_train, X_test, y_train, y_test = train_test_split(final_df['text'], final_df['categoryEncoded'], random_state = 43, test_size = 0.2)

### Model Building

In [32]:
train_x=X_train.apply(lambda x: ''.join(i+' ' for i in x))
test_x=X_test.apply(lambda x: ''.join(i+' '  for i in x))
test_df_final = test_df['text'].apply(lambda x: ''.join(i+' '  for i in x))

## Feature Extraction
 
 Features are extracted from the dataset and TF-IDF(Term Frequency - Inverse Document Frequency) is used for this purpose.

In [33]:
tfidf=TfidfVectorizer(max_features=10000,min_df=6)
train_1=tfidf.fit_transform(train_x)
test_1=tfidf.transform(test_x)
test_2=tfidf.transform(test_df_final)
print("No. of features extracted:", len(tfidf.get_feature_names()))
print(tfidf.get_feature_names()[:20])

train_arr=train_1.toarray()
test_arr=test_1.toarray()

No. of features extracted: 10000
['aa', 'aaa', 'aaaa', 'aab', 'aac', 'aad', 'aar', 'aarch', 'ab', 'aba', 'abandoned', 'abb', 'abbreviation', 'abc', 'abcd', 'abf', 'abi', 'ability', 'able', 'ably']


MemoryError: Unable to allocate 26.8 GiB for an array with shape (360000, 10000) and data type float64

In [None]:
test_arr1=test_2.toarray()

**Function for evaluation of model**

This function finds the F1-score and Accuracy of the trained model

In [None]:
def eval_model(y,y_pred):
    print("Recall score of the model:", round(recall_score(y_test, pred, average='weighted'), 3))
    print("Precision score of the model:", round(precision_score(y_test, pred, average='weighted'), 3))
    print("F1 score of the model:", round(f1_score(y,y_pred,average='micro'), 3))
    print("Accuracy of the model:", round(accuracy_score(y,y_pred),3))
    print("Accuracy of the model in percentage:", round(accuracy_score(y,y_pred)*100,3),"%")

**Function for Displaying the Confusion Matrix**

This function displays the confusion matrix of the model

In [None]:
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    """
    given a sklearn confusion matrix (cm), make a nice plot

    Arguments
    ---------
    cm:           confusion matrix from sklearn.metrics.confusion_matrix

    target_names: given classification classes such as [0, 1, 2]
                  the class names, for example: ['high', 'medium', 'low']

    title:        the text to display at the top of the matrix

    cmap:         the gradient of the values displayed from matplotlib.pyplot.cm
                  see http://matplotlib.org/examples/color/colormaps_reference.html
                  plt.get_cmap('jet') or plt.cm.Blues

    normalize:    If False, plot the raw numbers
                  If True, plot the proportions

    Usage
    -----
    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by
                                                              # sklearn.metrics.confusion_matrix
                          normalize    = True,                # show proportions
                          target_names = y_labels_vals,       # list of names of the classes
                          title        = best_estimator_name) # title of graph

    Citiation
    ---------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    """
    accuracy = np.trace(cm) / float(np.sum(cm))
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]


    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        if normalize:
            plt.text(j, i, "{:0.4f}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        else:
            plt.text(j, i, "{:,}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")


    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    plt.show()

In [None]:
def confusion_mat(color):
    cm=confusion_matrix(y_test, pred)
    plot_confusion_matrix(cm,
                          categories,
                          title='Confusion matrix')
    

# Training of Model

### Model - Decision Tree Classifier

In [None]:
lgbm=LGBMClassifier()
lgbm.fit(train_arr,y_train)
pred=lgbm.predict(test_arr)

print("first 20 actual labels")
print(y_test.tolist()[:20])
print("first 20 predicted labels")
print(pred.tolist()[:20])

### Evaluation of Results

In [None]:
eval_model(y_test,pred)
b=round(accuracy_score(y_test,pred)*100,3)

In [None]:
confusion_mat('Blues')

In [None]:
pred=lgbm.predict(test_arr1)
#create a submission dataframe
submission_df = pd.DataFrame(pred, columns=['label'])
#write a .csv file for submission
submission_df.to_csv('lgbm_submission.csv', index=False)