<a href="https://colab.research.google.com/github/kargaranamir/issue-tagger/blob/main/LSTM_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
## Data Download

! wget https://machinehack-be.s3.amazonaws.com/predict_github_issues_embold_sponsored_hackathon/Embold_Participant%27s_Dataset.zip -O data.zip
! unzip ./data.zip 
! mv ./Embold_Participant\'s_Dataset ./data
! rm -rf ./data/sample\ submission.csv
! rm -rf ./data/embold_test.json

In [None]:
# install fasttext
! pip install fasttext

### Import Libraries

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf 

from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.layers import GlobalMaxPool1D, MaxPooling1D, GlobalMaxPooling1D, Conv1D
from sklearn.metrics import classification_report, confusion_matrix


from tensorflow.keras.callbacks import EarlyStopping
import fasttext

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import re
import numpy as np

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import string

from tqdm.notebook import tqdm
tqdm.pandas()


# Word Embedding
from gensim.models import KeyedVectors


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Load Data

In [4]:
# merge data
data_small_df = pd.read_json('./data/embold_train.json').reset_index(drop=True)
data_large_df = pd.read_json('./data/embold_train_extra.json').reset_index(drop=True)
data_df = data_small_df.append(data_large_df)
data_df['text'] = data_df['title']+' '+data_df['body']
data_df['text_length'] = data_df['text'].apply(lambda text_input: len(text_input.split()))

### Preprocess

In [5]:
stopwords_list = stopwords.words('english')

def clean_text(text, lowercase=True, stop_words=True, links=True, numbers=True):
    text = text.replace("\\r", "")
    if lowercase:
        text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    if links:
        text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    if numbers:
       text = re.sub('\w*\d\w*', '', text)
    if stop_words:
        text = " ".join([word for word in text.split() if word not in stopwords_list])
    return text

In [6]:
data_df['text_clean'] = data_df['text'].progress_apply(lambda text: clean_text(text))

  0%|          | 0/450000 [00:00<?, ?it/s]

In [7]:
label_encoder = LabelEncoder()

X = data_df['text_clean'].values
y = label_encoder.fit_transform(data_df['label'])

X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.125, random_state=1) # 0.125 x 0.8 = 0.1


## FastText Embedding

### Download Fasstext Model

In [None]:
! wget https://www.dropbox.com/s/6aaucelizfx7xl6/en_vectors_v3.bin

In [9]:
EMBEDDING_LEN = 128 

### Load FastText Model

In [10]:
model_skipgram = fasttext.load_model('en_vectors_v3.bin')



In [11]:
# Fit Keras Tokenizer on X_train

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=3000)
tokenizer.fit_on_texts(X_train)

vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size : {}'.format(vocab_size))

Vocabulary Size : 699780


In [12]:
encoded_comments = tokenizer.texts_to_sequences(X_train)

# example of encoded comments
print("Comment : {}".format(X_train[1]))
print("Corresponding Encoding : {}".format(encoded_comments[1]))

Comment : upgrade problem undefined index pkcmspage installation pkcmspage error upgrade mysql php plesk steps reproduce composer require magentoproductcommunityedition noupdate composer update php binmagento setupupgrade expected result upgrade done actual result ssh module magentocustomer module magentobundleimportexport module magentocacheinvalidate module magentoindexer module magentocms upgrading data exception notice undefined index pkcmspage varwwwvhostspepehttpdocs vendormagentoframeworkentitymanagerentitymetadataphp line open url browser error processing request exception printing disabled default security reasons varraport please upgrade database run binmagento setupupgrade magento root directory index status ok ideas edit web setup issue readiness check ok log system upgrade status update application running module magentotheme upgrading data module magentocustomer upgrading data module magentobundleimportexport module magentocacheinvalidate module magentoindexer module mage

In [13]:
# padding
SENT_MAX_LEN = max([len(sent) for sent in encoded_comments])
padded_sequence = pad_sequences(encoded_comments, maxlen=SENT_MAX_LEN, padding='post')
print('Padding Shape: {}'.format(padded_sequence.shape))

Padding Shape: (315000, 499)


In [14]:
# initial embedding matrix
embedding_matrix = np.zeros((vocab_size, EMBEDDING_LEN))

for word, i in tokenizer.word_index.items():
  embedding_vector = model_skipgram.get_word_vector(word)
  # words that cannot be found will be set to 0
  if embedding_vector is not None:
    embedding_matrix[i] = embedding_vector

print(f"Embedding Matrix Shape is: {embedding_matrix.shape}")

Embedding Matrix Shape is: (699780, 128)


In [15]:
# Same procedure with a Unique Tokenizer on Evaluation data

tokenizer.texts_to_matrix(X_val)
eval_encoded_comments = tokenizer.texts_to_sequences(X_val)
eval_padded_sequence = pad_sequences(eval_encoded_comments, maxlen=SENT_MAX_LEN, padding='post')

In [16]:
# Same procedure with a Unique Tokenizer on Test data

tokenizer.texts_to_matrix(X_test)
test_encoded_comments = tokenizer.texts_to_sequences(X_test)
test_padded_sequence = pad_sequences(test_encoded_comments, maxlen=SENT_MAX_LEN, padding='post')

In [17]:
### One Hot Encoding
y_train_hot = np.eye(3)[y_train] # One-Hot
y_val_hot =  np.eye(3)[y_val] # One-Hot
y_test_hot = np.eye(3)[y_test] # One-Hot

In [18]:
import keras.backend as K

def f1_score(y_true, y_pred):

    # Count positive samples.
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))

    # If there are no true samples, fix the F1 score at 0.
    if c3 == 0:
        return 0

    # How many selected items are relevant?
    precision = c1 / c2

    # How many relevant items are selected?
    recall = c1 / c3

    # Calculate f1_score
    f1_score = 2 * (precision * recall) / (precision + recall)
    return f1_score


## CNN Model Architecture

In [19]:
## CNN Constants
KERNEL_SIZE = 3
FILTERS = 256

In [20]:
model_2 = Sequential()
model_2.add(Embedding(vocab_size, embedding_matrix.shape[1], weights=[embedding_matrix], trainable=True))
model_2.add(Conv1D(filters=FILTERS, kernel_size=KERNEL_SIZE, activation='relu'))
model_2.add(GlobalMaxPooling1D())
model_2.add(Dense(FILTERS, activation='relu'))
model_2.add(Dense(3, activation='sigmoid'))
model_2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_2.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 128)         89571840  
                                                                 
 conv1d (Conv1D)             (None, None, 256)         98560     
                                                                 
 global_max_pooling1d (Globa  (None, 256)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 256)               65792     
                                                                 
 dense_1 (Dense)             (None, 3)                 771       
                                                                 
Total params: 89,736,963
Trainable params: 89,736,963
Non-trainable params: 0
____________________________________________

### Fit CNN Model
You can run the cell bellow as much as you want. keep track on validation accuracy and also change the `epochs`. 

In [21]:
model_2.fit(
    padded_sequence, 
    y_train_hot, 
    batch_size=32, 
    epochs=1, 
    validation_data=(eval_padded_sequence, y_val_hot)
    )



<keras.callbacks.History at 0x7f49c065ed50>

In [22]:
loss_2, acc_2 = model_2.evaluate(test_padded_sequence, y_test_hot, verbose=0)
print('Test Accuracy: %f' % (acc_2*100))

Test Accuracy: 77.656668


In [23]:
y_pred_2 = model_2.predict(test_padded_sequence).argmax(axis=1)
print(confusion_matrix(y_true=y_test, y_pred=y_pred_2))
print(classification_report(y_true=y_test, y_pred=y_pred_2))

[[32029  7067  1069]
 [ 5147 35366   816]
 [ 2498  3512  2496]]
              precision    recall  f1-score   support

           0       0.81      0.80      0.80     40165
           1       0.77      0.86      0.81     41329
           2       0.57      0.29      0.39      8506

    accuracy                           0.78     90000
   macro avg       0.72      0.65      0.67     90000
weighted avg       0.77      0.78      0.77     90000



## LSTM Model Architecture

In [24]:
# LSTM constants
LSTM_UNITS = 32

In [25]:
model_1 = Sequential()
model_1.add(Embedding(vocab_size, EMBEDDING_LEN, input_length=SENT_MAX_LEN, weights=[embedding_matrix], trainable=True))
model_1.add(Bidirectional(LSTM(EMBEDDING_LEN, return_sequences=True, input_shape=(None, 1))))
model_1.add(Dropout(0.2))
model_1.add(Bidirectional(LSTM(LSTM_UNITS)))
model_1.add(Dropout(0.2))
model_1.add(Dense(EMBEDDING_LEN, activation='relu'))
model_1.add(Dropout(0.1))
model_1.add(Dense(3, activation='sigmoid'))
model_1.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_1.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 499, 128)          89571840  
                                                                 
 bidirectional (Bidirectiona  (None, 499, 256)         263168    
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 499, 256)          0         
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               73984     
 nal)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 128)              

### Fit LSTM Model
You can run the cell bellow as much as you want. keep track on validation accuracy and also change the `epochs`. 

In [26]:
model_1.fit(
    padded_sequence, 
    y_train_hot, 
    batch_size=32, 
    epochs=1, 
    validation_data=(eval_padded_sequence, y_val_hot)
    )



<keras.callbacks.History at 0x7f494e331790>

In [27]:
loss_1, acc_1 = model_1.evaluate(test_padded_sequence, y_test_hot, verbose=0)
print(f'Test Accuracy: {acc_1}')

Test Accuracy: 0.785955548286438


In [28]:
y_pred_1 = model_1.predict(test_padded_sequence).argmax(axis=1)
print(confusion_matrix(y_true=y_test, y_pred=y_pred_1))
print(classification_report(y_true=y_test, y_pred=y_pred_1))

[[32796  6562   807]
 [ 4964 35099  1266]
 [ 2826  2839  2841]]
              precision    recall  f1-score   support

           0       0.81      0.82      0.81     40165
           1       0.79      0.85      0.82     41329
           2       0.58      0.33      0.42      8506

    accuracy                           0.79     90000
   macro avg       0.72      0.67      0.68     90000
weighted avg       0.78      0.79      0.78     90000

