# Auto-tagging system based on a GRU recurrent network

In [1]:
import re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt

In [2]:
import sys
sys.path.append('..')
from utils.s3_class import S3Functions

# S3 helper bound to the project bucket; used below to read the raw CSV files.
s3_funcs = S3Functions(bucket_name='jdgallegoq-autotagging')

In [3]:
# Load the raw questions and tags tables from S3.
# NOTE(review): latin-1 maps every byte to a character, so decoding cannot
# fail and `encoding_errors='ignore'` should never trigger — confirm that
# latin-1 really is the file's encoding.
questions_df = pd.read_csv(
    s3_funcs.read_object(key='Questions.csv'),
    encoding='latin-1',
    encoding_errors='ignore'
)
# presumably read_object returns a path or file-like object that read_csv accepts — verify
tags_df = pd.read_csv(s3_funcs.read_object('Tags.csv'))

In [4]:
# Preview the first rows of both raw tables, one after the other.
display(questions_df.head(), tags_df.head())

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,Body
0,6,5.0,2010-07-19T19:14:44Z,272,The Two Cultures: statistics vs. machine learn...,"<p>Last year, I read a blog post from <a href=..."
1,21,59.0,2010-07-19T19:24:36Z,4,Forecasting demographic census,<p>What are some of the ways to forecast demog...
2,22,66.0,2010-07-19T19:25:39Z,208,Bayesian and frequentist reasoning in plain En...,<p>How would you describe in plain English the...
3,31,13.0,2010-07-19T19:28:44Z,138,What is the meaning of p values and t values i...,<p>After taking a statistics course and then t...
4,36,8.0,2010-07-19T19:31:47Z,58,Examples for teaching: Correlation does not me...,"<p>There is an old saying: ""Correlation does n..."


Unnamed: 0,Id,Tag
0,1,bayesian
1,1,prior
2,1,elicitation
3,2,distributions
4,2,normality


# Text Cleaning for Question Body

In [5]:
def cleaner(text:str=None):
    """Strip HTML from a text and normalise it to lowercase alphabetic words.

    Parameters
    ----------
    text : str, optional
        Raw HTML (or plain) text. ``None`` and NaN (how pandas represents a
        missing value) are treated as empty text.

    Returns
    -------
    str
        The words of ``text`` in lowercase, containing only the letters
        ``a``-``z`` and separated by single spaces. Empty string when there
        is nothing to clean.
    """
    # Missing values carry no text; return early instead of crashing in the parser.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    # remove html tags; the parser is named explicitly so the result does not
    # depend on which optional parsers happen to be installed
    text = BeautifulSoup(text, "html.parser").get_text()
    # convert to lower case
    text = text.lower()
    # keep only alphabetic characters, everything else becomes a space
    text = re.sub(r'[^a-z]', ' ', text)
    # splitting and re-joining collapses runs of whitespace
    return " ".join(text.split())

In [6]:
# Clean every question body and keep the result in a new column.
questions_df['cleaned_text'] = questions_df['Body'].map(cleaner)

# Explore Tags

In [7]:
# Number of questions carrying each tag, most frequent first.
tags_df['Tag'].value_counts()

r                       13236
regression              10959
machine-learning         6089
time-series              5559
probability              4217
                        ...  
fmincon                     1
doc2vec                     1
sympy                       1
adversarial-boosting        1
corpus-linguistics          1
Name: Tag, Length: 1315, dtype: int64

In [8]:
# Group tags by question Id.
# NOTE: this cell rebinds `tags_df` (the 'Tag' column is replaced by 'tags'),
# so re-running it without reloading the data raises a KeyError.
# Hyphens are removed with a vectorised literal replace, so a tag such as
# "machine-learning" becomes the single token "machinelearning".
tags_df['Tag'] = tags_df['Tag'].str.replace('-', '', regex=False)
# Select the 'Tag' column before applying: applying over the whole grouped
# frame also operates on the grouping column, which recent pandas deprecates.
tags_df = (
    tags_df.groupby('Id')['Tag']
    .apply(lambda tags: tags.values)
    .reset_index(name='tags')
)
tags_df.head()

Unnamed: 0,Id,tags
0,1,"[bayesian, prior, elicitation]"
1,2,"[distributions, normality]"
2,3,"[software, opensource]"
3,4,"[distributions, statisticalsignificance]"
4,6,[machinelearning]


In [9]:
# Merge the per-question tag lists with the questions; the inner join keeps
# only Ids present in both tables.
df = tags_df.merge(questions_df, how='inner', on='Id')
df = df[['Id', 'Body', 'cleaned_text', 'tags']]
df.head()

Unnamed: 0,Id,Body,cleaned_text,tags
0,1,<p>How should I elicit prior distributions fro...,how should i elicit prior distributions from e...,"[bayesian, prior, elicitation]"
1,2,<p>In many different statistical methods there...,in many different statistical methods there is...,"[distributions, normality]"
2,3,<p>What are some valuable Statistical Analysis...,what are some valuable statistical analysis op...,"[software, opensource]"
3,4,<p>I have two groups of data. Each with a dif...,i have two groups of data each with a differen...,"[distributions, statisticalsignificance]"
4,6,"<p>Last year, I read a blog post from <a href=...",last year i read a blog post from brendan o co...,[machinelearning]


# Dataset preparation
Only use data for most frequent tags. Let's say top 10, top 20...

In [10]:
from collections import Counter

In [11]:
# Count every tag occurrence and keep the n most frequent ones
# as a {tag: count} dict.
n = 10
top_10 = dict(
    Counter(tag for tags in df['tags'] for tag in tags).most_common(n)
)

In [12]:
# Build X (cleaned texts) and y (tag lists) restricted to the most common tags.
x, y = [], []
for text, tags in zip(df['cleaned_text'], df['tags']):
    # drop every tag that is not among the most common ones
    kept = [tag for tag in tags if tag in top_10]

    # NOTE(review): `> 1` keeps only questions with at least TWO top tags;
    # confirm that single-tag questions are meant to be discarded.
    if len(kept) > 1:
        x.append(text)
        y.append(kept)

In [13]:
# Sanity check: there must be exactly one label list per text.
len(x), len(y)

(11106, 11106)

In [14]:
from sklearn.preprocessing import MultiLabelBinarizer # create multilabel
# Binarizer that turns each list of tags into a 0/1 indicator row.
mlb = MultiLabelBinarizer()

# NOTE: `y` is overwritten here, so re-running this cell on its own would
# refit the binarizer on the already-binarised matrix.
y = mlb.fit_transform(y)
y.shape

(11106, 10)

In [15]:
from sklearn.model_selection import train_test_split # split data

# Hold out 20% of the samples for validation; the fixed random_state makes
# the shuffled split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True
)

# Text Representation

In [16]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [17]:
# Prepare a tokenizer and fit it on the training texts only, so the
# validation texts do not influence the vocabulary.
x_tokenizer = Tokenizer()
x_tokenizer.fit_on_texts(X_train)

In [18]:
# How many distinct tokens does the training corpus contain?
len(x_tokenizer.word_index)

25069

In [19]:
# Count the tokens that are frequent enough to keep in the vocabulary;
# rarer tokens will later be treated as unknown.
threshold = 3 # minimum number of occurrences in the training corpus
c = sum(1 for count in x_tokenizer.word_counts.values() if count >= threshold)
print(c)

12570


In [20]:
# Re-create the tokenizer with a capped vocabulary; words outside the cap
# are mapped to the out-of-vocabulary token.
# NOTE(review): per the Keras docs `num_words` keeps only the most frequent
# `num_words - 1` indices and the OOV token occupies one of them — confirm
# whether the cap should be larger than `c` to retain all `c` frequent words.
x_tokenizer = Tokenizer(
    num_words=c,
    oov_token='unk' # define value for unknown tokens
)
x_tokenizer.fit_on_texts(X_train)

In [21]:
# Pad sequences: guarantee that every sequence has the same length,
# here 100 tokens per text.
max_len = 100

# texts -> integer sequences -> padded with zeros at the end ('post')
X_train_seq, X_val_seq = (
    pad_sequences(
        x_tokenizer.texts_to_sequences(texts),
        padding='post',
        maxlen=max_len,
    )
    for texts in (X_train, X_val)
)

# Model Architecture

In [22]:
from keras.models import Sequential
from keras.layers import (
    Dense,
    Embedding,
    GRU,

)
from keras.callbacks import ModelCheckpoint

In [23]:
# Vocabulary size passed to the Embedding layer as its input dimension:
# the tokenizer's word cap plus one.
x_voc_size = x_tokenizer.num_words + 1

In [24]:
# Architecture: embedding -> GRU -> dense head with one sigmoid unit per tag.
model = Sequential()
# mask_zero=True makes the recurrent layer skip the zero padding
model.add(Embedding(x_voc_size, 50, input_shape=(max_len,), mask_zero=True))
model.add(GRU(128))
model.add(Dense(128, activation='relu'))
# Size the output layer from the label matrix instead of hard-coding 10, so
# changing the number of top tags (`n`) does not break training.
model.add(Dense(y_train.shape[1], activation='sigmoid'))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 50)           628550    
                                                                 
 gru (GRU)                   (None, 128)               69120     
                                                                 
 dense (Dense)               (None, 128)               16512     
                                                                 
 dense_1 (Dense)             (None, 10)                1290      
                                                                 
Total params: 715472 (2.73 MB)
Trainable params: 715472 (2.73 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [25]:
# compile: every tag is an independent sigmoid output, hence binary cross-entropy
model.compile(
    optimizer='Adam',
    loss='binary_crossentropy'
)

# Checkpoint callback: save to disk whenever the validation loss reaches a
# new minimum, so the best epoch can be restored after training.
callbacks = ModelCheckpoint(
    "weights.best.hdf5",
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='min'
)

In [26]:
# Train for 10 epochs, evaluating on the held-out split after each one; the
# checkpoint callback keeps the best epoch on disk.
model_history = model.fit(
    X_train_seq,
    y_train,
    batch_size=128,
    epochs=10,
    verbose=1,
    validation_data=(X_val_seq, y_val),
    callbacks=[callbacks]
)

Epoch 1/10
Epoch 1: val_loss improved from inf to 0.47843, saving model to weights.best.hdf5
Epoch 2/10
 1/70 [..............................] - ETA: 7s - loss: 0.4711

  saving_api.save_model(


Epoch 2: val_loss improved from 0.47843 to 0.46887, saving model to weights.best.hdf5
Epoch 3/10
Epoch 3: val_loss improved from 0.46887 to 0.41516, saving model to weights.best.hdf5
Epoch 4/10
Epoch 4: val_loss improved from 0.41516 to 0.40564, saving model to weights.best.hdf5
Epoch 5/10
Epoch 5: val_loss improved from 0.40564 to 0.39237, saving model to weights.best.hdf5
Epoch 6/10
Epoch 6: val_loss improved from 0.39237 to 0.35978, saving model to weights.best.hdf5
Epoch 7/10
Epoch 7: val_loss improved from 0.35978 to 0.34561, saving model to weights.best.hdf5
Epoch 8/10
Epoch 8: val_loss improved from 0.34561 to 0.34543, saving model to weights.best.hdf5
Epoch 9/10
Epoch 9: val_loss did not improve from 0.34543
Epoch 10/10
Epoch 10: val_loss did not improve from 0.34543


# Model Eval

In [27]:
# Restore the checkpoint with the lowest validation loss before evaluating.
model.load_weights("weights.best.hdf5")

# Predict per-tag probabilities for every validation sample.
pred_prob = model.predict(X_val_seq)
pred_prob[0] # one probability per tag for the first sample



array([0.23657246, 0.17965588, 0.16786788, 0.39927116, 0.21967882,
       0.18755539, 0.39806244, 0.47501236, 0.13913867, 0.02432637],
      dtype=float32)

In [28]:
# Candidate decision thresholds to search over: 0.00, 0.01, ..., 0.49
# (the upper bound 0.5 itself is excluded).
thres = np.arange(0, 0.5, 0.01)

In [29]:
# convert probas to classes based on a threshold value
def classify(pred_prob, thres):
    """Turn predicted probabilities into 0/1 class indicators.

    Parameters
    ----------
    pred_prob : iterable of iterables of float
        One row of per-class probabilities for each sample.
    thres : float
        Decision threshold; a probability strictly greater than it is
        labelled 1, anything else 0.

    Returns
    -------
    list of list of int
        One list of 0/1 flags per input row, in the same order.
    """
    return [
        [1 if prob > thres else 0 for prob in row]
        for row in pred_prob
    ]

In [30]:
from sklearn import metrics

In [31]:
# Flatten the validation labels so that every (sample, tag) pair is scored
# as one binary decision.
y_true = np.array(y_val).ravel()
# F1 score obtained at each candidate threshold
score = [
    metrics.f1_score(y_true, np.array(classify(pred_prob, t)).ravel())
    for t in thres
]

# find optimum: the first threshold that reaches the highest F1
opt = thres[int(np.argmax(score))]
opt

0.33

In [32]:
# Binarise the probabilities at the optimum found above and report metrics
# over all (sample, tag) pairs flattened together.
# NOTE: `opt` was selected on this same validation data, so these numbers
# are optimistic compared with a truly unseen test set.
y_pred_seq = classify(pred_prob, opt)
y_pred = np.array(y_pred_seq).ravel()
print(metrics.classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.88      0.90     17512
           1       0.61      0.70      0.65      4708

    accuracy                           0.84     22220
   macro avg       0.76      0.79      0.78     22220
weighted avg       0.85      0.84      0.85     22220



In [33]:
# Map the 0/1 indicator rows back to tag names, for predictions and ground truth.
y_pred = mlb.inverse_transform(np.asarray(y_pred_seq))
y_true = mlb.inverse_transform(np.asarray(y_val))

# Side-by-side view of each validation text with its actual and predicted tags.
df_out = pd.DataFrame(dict(comment=X_val, actual=y_true, prediction=y_pred))
display(df_out.head(10))

Unnamed: 0,comment,actual,prediction
0,i would like to test if subjects are significa...,"(hypothesistesting, logistic)","(logistic, regression)"
1,could somebody explain this difference just as...,"(classification, machinelearning)","(regression, selfstudy)"
2,lets say you want to create a random forest mo...,"(classification, regression)","(classification, machinelearning)"
3,i am trying to estimate a logit regression mod...,"(logistic, r, regression)","(logistic, regression)"
4,i want to calculate the posterior distribution...,"(distributions, selfstudy)","(distributions, probability, selfstudy)"
5,i have a farm with years data about the area a...,"(distributions, timeseries)","(machinelearning, r, regression)"
6,in the book of hosmer lemeshow it is stated in...,"(logistic, regression)","(logistic, regression)"
7,i find this counter intuitive first i chose ra...,"(machinelearning, regression)","(classification, logistic, machinelearning, re..."
8,i have a list of sites and a list of survival ...,"(r, regression)","(probability, selfstudy)"
9,i want to assess the effect of temperature on ...,"(regression, timeseries)","(r, regression, timeseries)"


In [52]:
# now define a function that does all above
# predict tags based on a comment
def predict_tag(comment):
    """Predict the tags of a single raw comment.

    Runs the same pipeline used for training data: clean the text, convert
    it to an integer sequence, pad it, predict per-tag probabilities and
    threshold them at the tuned optimum ``opt``.

    Parameters
    ----------
    comment : str
        Raw comment / question body (may contain HTML).

    Returns
    -------
    list of tuple
        The result of ``mlb.inverse_transform`` for the single input, i.e.
        a one-element list holding the tuple of predicted tag names.
    """
    # preprocess the incoming comment; the tokenizer expects a list of texts
    text = [cleaner(comment)]
    # convert to integer sequences
    seq = x_tokenizer.texts_to_sequences(text)
    # pad to the length the model was trained with
    pad_seq = pad_sequences(seq, padding='post', maxlen=max_len)
    # make preds
    pred_prob = model.predict(pad_seq)
    # classify returns one 0/1 row per input; keep it 2-D because
    # inverse_transform expects an indicator matrix, not a flat vector
    classes = np.array(classify(pred_prob, opt))

    return mlb.inverse_transform(classes)