# Text multi-label classification using Keras
---

In [1]:
import numpy as np
import pandas as pd

import csv
import re

import seaborn as sns
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm

In [2]:
import warnings
# Suppress every warning for the rest of the session (presumably to keep cell
# output readable).
# NOTE(review): this is a blanket filter, so it also hides warnings raised by
# pandas / TensorFlow in later cells — consider narrowing it by category.
warnings.filterwarnings("ignore")

In [3]:
import sqlite3
from sqlite3 import Error

In [4]:
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.model_selection import train_test_split

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Embedding, Flatten, GlobalMaxPool1D, Dropout, Conv1D
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.optimizers import Adam

## Converting the SQLite database tables to CSV
---
Based on [this documentation](https://www.sqlitetutorial.net/sqlite-python/sqlite-python-select/).

In [5]:
# Path of the SQLite database file that is opened via create_connection();
# relative, so it is resolved against the notebook's working directory.
SQLITE_FILE = 'database.sqlite'

In [6]:
def create_connection(db_file):
    """Open a SQLite connection to ``db_file``.

    Args:
        db_file: path of the database file (or ``':memory:'``).

    Returns:
        The ``sqlite3.Connection`` on success. If ``sqlite3.connect`` raises
        ``sqlite3.Error`` the error is printed and ``None`` is returned
        instead, so callers have to check the result themselves.
    """
    try:
        return sqlite3.connect(db_file)
    except Error as exc:
        # Best-effort contract: report the problem, hand back None.
        print(exc)
        return None

In [7]:
def select_entities(conn, db_name):
    """Fetch every row of one table as NumPy arrays.

    Args:
        conn: open SQLite connection (only ``conn.cursor()`` is used).
        db_name: name of the table to read (despite the parameter name). It is
            interpolated directly into the SQL text, so it must come from a
            trusted source.

    Returns:
        ``(columns, rows)``: an array with the column names taken from the
        cursor description, and an array built from ``fetchall()``.

    Any cell whose ``str()`` form starts with ``b'`` or ``b"`` is overwritten
    with that string form minus its first two characters and its last one,
    i.e. the bytes-literal wrapper is cut off the textual representation.
    Escape sequences inside that representation are kept as written.
    """
    cursor = conn.cursor()
    cursor.execute(f"SELECT * FROM {db_name}")

    header = np.array([column_info[0] for column_info in cursor.description])
    records = np.array(cursor.fetchall())

    # Visit every cell in row-major order; an empty result yields no positions.
    for position in np.ndindex(*records.shape):
        as_text = str(records[position])
        if as_text.startswith(('b\'', 'b\"')):
            records[position] = as_text[2:-1]

    return header, records

def db_to_csv(db_name):
    """Export one table of ``SQLITE_FILE`` to ``ml_class_text/<name>.csv``.

    The file name is the lower-cased table name. The first CSV row holds the
    column names, followed by one row per table row as returned by
    ``select_entities``.

    Args:
        db_name: name of the table to export (despite the parameter name).

    Raises:
        RuntimeError: if the database connection could not be opened.
    """
    import os  # local import keeps this cell runnable on its own

    conn = create_connection(SQLITE_FILE)  # create a database connection
    if conn is None:
        # create_connection() prints the sqlite3 error and returns None.
        raise RuntimeError(f'Could not open SQLite database {SQLITE_FILE!r}')

    try:
        # Hand TEXT values back as raw bytes instead of decoded str.
        conn.text_factory = bytes
        cols, rows = select_entities(conn, db_name)
    finally:
        # `with connection:` only commits/rolls back; close explicitly so the
        # file handle is not leaked.
        conn.close()

    out_path = f'ml_class_text/{db_name.lower()}.csv'
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    # newline='' lets the csv module control line endings itself; without it
    # platforms that translate newlines end up with blank rows.
    with open(out_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(cols)

        for r in tqdm(rows):
            writer.writerow(r)

In [8]:
# List the tables stored in the database, then export the three that are used
# in this notebook to CSV.
conn = create_connection(SQLITE_FILE)
if conn is None:
    # create_connection() prints the sqlite3 error and returns None.
    raise RuntimeError(f'Could not open SQLite database {SQLITE_FILE!r}')

try:
    # With text_factory = bytes the table names come back (and print) as bytes.
    conn.text_factory = bytes

    cursor = conn.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    print(f'Databases: {[db_name[0] for db_name in cursor.fetchall()]}\n')
finally:
    # `with connection:` would only commit/roll back, not close the handle.
    conn.close()

for table_name in ('Questions', 'Answers', 'Tags'):
    db_to_csv(table_name)

Databases: [b'Questions', b'sqlite_stat1', b'Answers', b'Tags']



HBox(children=(FloatProgress(value=0.0, max=85085.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=84105.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=244228.0), HTML(value='')))




## Data loading
---

In [9]:
# Load the questions table from the CSV file that db_to_csv('Questions') writes.
# NOTE(review): the file is read as iso-8859-1 while db_to_csv() writes it with
# the platform default encoding — confirm the two agree on your machine.
df_questions = pd.read_csv('ml_class_text/questions.csv', encoding='iso-8859-1')
df_questions.head(2)

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,Body
0,6,5.0,1279567000.0,272,The Two Cultures: statistics vs. machine learn...,"<p>Last year, I read a blog post from <a href=..."
1,21,59.0,1279567000.0,4,Forecasting demographic census,<p>What are some of the ways to forecast demog...


### Tags

In [10]:
df_tags = pd.read_csv('ml_class_text/tags.csv', encoding='iso-8859-1')

# Every raw value loses its first two characters and its last one —
# presumably the b'...' wrapper left behind by the export step (verify against
# the CSV). Ids are then converted to 64-bit integers.
df_tags = df_tags.assign(
    Id=df_tags['Id'].map(lambda raw: raw[2:-1]).astype(np.int64),
    Tag=df_tags['Tag'].map(lambda raw: raw[2:-1]),
)
df_tags.head()

Unnamed: 0,Id,Tag
0,1,bayesian
1,1,prior
2,1,elicitation
3,2,distributions
4,2,normality


In [11]:
# Count how often each tag occurs. `sort` is a boolean switch for ordering the
# group keys (it does not take a column name), so pass it as a real boolean.
grouped_tags = df_tags.groupby('Tag', sort=True).size().reset_index(name='count')
# Summary of the tag names themselves: `count`/`unique` give the number of
# distinct tags.
grouped_tags['Tag'].describe()

count                  1315
unique                 1315
top       quasi-monte-carlo
freq                      1
Name: Tag, dtype: object

### Most common tags

In [12]:
# Number of tags (= output classes) to keep.
num_classes = 100

# Occurrences per tag, then the `num_classes` most frequent ones.
grouped_tags = df_tags.groupby('Tag').size().reset_index(name='count')
most_common_tags = grouped_tags.nlargest(num_classes, columns='count')

# Keep only the rows whose tag is among the most common ones. A single
# vectorised membership test replaces the per-row scan of the tag array and
# the None/dropna round-trip; the surviving rows and their index are the same.
df_tags = df_tags[df_tags['Tag'].isin(most_common_tags['Tag'])]
df_tags.head()

Unnamed: 0,Id,Tag
0,1,bayesian
3,2,distributions
7,4,distributions
8,4,statistical-significance
9,6,machine-learning


## Preprocessing
---

### Sanitization of texts

In [13]:
def strip_html_tags(body):
    """Return ``body`` with every ``<...>`` span removed.

    The match is non-greedy and ``.`` does not cross line breaks, so a span
    whose ``<`` and ``>`` sit on different lines is left untouched.

    Args:
        body: text to clean (must be a ``str``).

    Returns:
        The text with all matched spans replaced by the empty string.
    """
    return re.sub('<.*?>', '', body)

# Overwrite each question body with its tag-stripped version, then build the
# model input text as "<title> <body>".
df_questions['Body'] = df_questions['Body'].apply(strip_html_tags)
df_questions['Text'] = df_questions['Title'] + ' ' + df_questions['Body']

### Denormalizing the dataframe: adding a column with each question's tags

In [14]:
# One row per (question, tag) pair, restricted to questions that have at least
# one of the retained tags.
tagged_rows = df_questions.merge(df_tags, on='Id', how='inner')

# Collapse the pairs into one array of tags per question Id.
question_tags = tagged_rows.groupby('Id')['Tag'].apply(np.array)

# Attach the arrays as a 'Tags' column; questions without any retained tag are
# dropped by the (default inner) merge.
# NOTE(review): this reassigns df_questions, so running the cell twice merges
# a second 'Tags' column into the already-merged frame.
df_questions = df_questions.merge(question_tags.rename('Tags').reset_index(), on='Id')

In [15]:
# Raise pandas' per-column display width to 400 characters so that most of
# each question text is visible in the preview.
pd.set_option('display.max_colwidth', 400)
df_questions[['Id', 'Text', 'Tags']].head()

Unnamed: 0,Id,Text,Tags
0,6,"The Two Cultures: statistics vs. machine learning? Last year, I read a blog post from Brendan O\'Connor entitled ""Statistics vs. Machine Learning, fight!"" that discussed some of the differences between the two fields. Andrew Gelman responded favorably to this:\n\nSimon Blomberg: \n\n\n From R\'s fortunes\n package: To paraphrase provocatively,\n \'machine learning is statistics minus\n an...",[machine-learning]
1,21,"Forecasting demographic census What are some of the ways to forecast demographic census with some validation and calibration techniques?\n\nSome of the concerns:\n\n\nCensus blocks vary in sizes as rural\nareas are a lot larger than condensed\nurban areas. Is there a need to account for the area size difference?\nif let's say I have census data\ndating back to 4 - 5 census periods,\nhow far ca...",[forecasting]
2,22,Bayesian and frequentist reasoning in plain English How would you describe in plain English the characteristics that distinguish Bayesian from Frequentist reasoning?\n,[bayesian]
3,31,"What is the meaning of p values and t values in statistical tests? After taking a statistics course and then trying to help fellow students, I noticed one subject that inspires much head-desk banging is interpreting the results of statistical hypothesis tests. It seems that students easily learn how to perform the calculations required by a given test but get hung up on interpreting the resul...","[hypothesis-testing, t-test, p-value, interpretation]"
4,36,"Examples for teaching: Correlation does not mean causation There is an old saying: ""Correlation does not mean causation"". When I teach, I tend to use the following standard examples to illustrate this point:\n\n\nnumber of storks and birth rate in Denmark;\nnumber of priests in America and alcoholism;\nin the start of the 20th century it was noted that there was a strong correlation between \'...",[correlation]


### Text tokenization

In [16]:
# Learn the set of tags from the per-question tag arrays; `labels` keeps the
# resulting class list so that output column i can be mapped back to a tag.
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(df_questions['Tags'])
labels = multilabel_binarizer.classes_

# maxlen: length every token sequence is padded to in get_features().
# max_words: vocabulary cap handed to the tokenizer (num_words) and reused as
# the Embedding input dimension when the model is built.
maxlen = 180
max_words = 5000
tokenizer = Tokenizer(num_words=max_words, lower=True)
# NOTE(review): the tokenizer is fit on all questions, i.e. before the
# train/test split, so the vocabulary also reflects test texts — confirm this
# is acceptable.
tokenizer.fit_on_texts(df_questions['Text'])

def get_features(text_series):
    """Convert texts into padded token-id sequences for the model.

    Relies on the module-level ``tokenizer`` (already fitted) and ``maxlen``.

    Args:
        text_series: iterable of texts, e.g. a pandas Series of strings.

    Returns:
        The result of ``pad_sequences`` with ``maxlen=maxlen`` applied to the
        tokenizer's sequences, one row per input text.
    """
    return pad_sequences(tokenizer.texts_to_sequences(text_series), maxlen=maxlen)


def prediction_to_label(prediction):
    """Map one prediction vector to ``{tag: probability}``, best tag first.

    Position ``i`` of ``prediction`` is paired with ``labels[i]`` (the
    module-level class list).

    Args:
        prediction: array-like exposing ``tolist()``, one score per class.

    Returns:
        A dict ordered by descending score; ties keep their original order.
    """
    scores = prediction.tolist()
    # Stable sort of the positions by score, highest first.
    ranking = sorted(range(len(scores)), key=lambda position: scores[position], reverse=True)
    return {labels[position]: scores[position] for position in ranking}

In [17]:
# Features: one padded token-id sequence per question text.
# Targets: the binarizer's encoding of each question's tag array, one column
# per entry of `labels`.
x = get_features(df_questions['Text'])
y = multilabel_binarizer.transform(df_questions['Tags'])
print(x.shape)

# Hold out 20% of the samples for the final evaluation; the fixed random_state
# makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=9000)

(76365, 180)


### Dealing with imbalanced classes

In [18]:
# Weight of a tag = (number of rows in df_tags) / (occurrences of that tag),
# so rarer tags get larger weights.
most_common_tags['class_weight'] = len(df_tags) / most_common_tags['count']

# Index the weights by tag once, then map every output column (its position in
# `labels`) to the weight of its tag. A label missing from most_common_tags
# raises a KeyError that names the label.
weight_by_tag = most_common_tags.set_index('Tag')['class_weight']
class_weight = {index: weight_by_tag.loc[label] for index, label in enumerate(labels)}

most_common_tags.head()

Unnamed: 0,Tag,count,class_weight
986,r,13236,11.552811
1020,regression,10959,13.953189
669,machine-learning,6089,25.112991
1220,time-series,5559,27.507285
946,probability,4217,36.261086


In [20]:
# Passed as the first Conv1D argument, i.e. this is the number of convolution
# filters (the kernel size is the literal 3 below), despite the name.
filter_length = 300

# Embedding -> dropout -> 1D convolution -> global max pooling -> one sigmoid
# output per tag.
model = Sequential([
    Embedding(max_words, 20, input_length=maxlen),
    Dropout(0.1),
    Conv1D(filter_length, 3, padding='valid', activation='relu', strides=1),
    GlobalMaxPool1D(),
    Dense(num_classes),
    Activation('sigmoid'),
])

# NOTE(review): 'categorical_accuracy' compares only the arg-max positions of
# target and prediction, which is a questionable score for multi-label
# targets — consider a per-label metric instead.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 180, 20)           100000    
_________________________________________________________________
dropout (Dropout)            (None, 180, 20)           0         
_________________________________________________________________
conv1d (Conv1D)              (None, 178, 300)          18300     
_________________________________________________________________
global_max_pooling1d (Global (None, 300)               0         
_________________________________________________________________
dense (Dense)                (None, 100)               30100     
_________________________________________________________________
activation (Activation)      (None, 100)               0         
Total params: 148,400
Trainable params: 148,400
Non-trainable params: 0
__________________________________________________

In [21]:
# File that receives the best model seen so far during training.
checkpoint_path = 'ml_class_text/model_conv1d.h5'

# NOTE(review): ReduceLROnPlateau runs with its default settings while
# EarlyStopping uses patience=4 — check that the learning-rate reduction can
# actually trigger before training is stopped.
callbacks = [
    ReduceLROnPlateau(),
    EarlyStopping(patience=4),
    ModelCheckpoint(filepath=checkpoint_path, save_best_only=True),
]

# 10% of the training data is split off for validation; the per-class weights
# are passed through to fit().
fit_options = dict(
    class_weight=class_weight,
    epochs=20,
    batch_size=32,
    validation_split=0.1,
    callbacks=callbacks,
)
history = model.fit(x_train, y_train, **fit_options)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 54982 samples, validate on 6110 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20


In [22]:
# Evaluate the checkpointed model on the held-out test set.
cnn_model = tf.keras.models.load_model('ml_class_text/model_conv1d.h5')
metrics = cnn_model.evaluate(x_test, y_test)

# Take the metric names from the model that was evaluated, so the cell does
# not depend on the in-memory training `model` still being around.
for metric_name, metric_value in zip(cnn_model.metrics_names, metrics):
    print("{}: {}".format(metric_name, metric_value))

loss: 0.05505771553188373
categorical_accuracy: 0.37733253836631775
