# Importing Libraries and Preprocessing

In [1]:
import logging
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
import json
%matplotlib inline
import time
import warnings

In [2]:
with open('base_data_preprocessed.json') as obj:
    df = json.load(obj)

In [3]:
df = pd.DataFrame(df).reset_index()
df.drop(['index'], axis = 1,inplace = True)
print(df.head(10))

                                                text     category  category_id
0  !Wowow!\n\n!Wowow! is a collective in Peckham,...         Arts            0
1                         1376 in literature\n\n\n\n         Arts            0
2  C. F. Møller Architects\n\nArkitektfirmaet C. ...         Arts            0
3  Anne Beatts\n\nAnne Beatts (born February 25, ...         Arts            0
4  Norton &amp; Wallis\n\nNorton & Wallis was an ...         Arts            0
5  All the Young Men\n\nAll the Young Men is a 19...         Arts            0
6  Global cascades model\n\nGlobal cascades model...  Mathematics            9
7  Graded-symmetric algebra\n\nIn algebra, given ...  Mathematics            9
8  Hans Eberstark\n\nHans Eberstark (27 January 1...  Mathematics            9
9  Hexacoordinate\n\nHexacoordinate in chemistry ...  Mathematics            9


In [4]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):

    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text) 
    text = BAD_SYMBOLS_RE.sub('', text) 
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) 
    return text
    
df['text'] = df['text'].apply(clean_text)

In [5]:
df['text'].apply(lambda x: len(x.split(' '))).sum()

132911725

In [6]:
X = df.text
y = df.category
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)

# Multinomial Naive Bayes

In [7]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(X_train, y_train)

%time
from sklearn.metrics import classification_report
y_pred = nb.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

CPU times: user 39 µs, sys: 0 ns, total: 39 µs
Wall time: 4.05 µs
accuracy 0.4152521046852123
               precision    recall  f1-score   support

         Arts       0.32      0.38      0.35      3249
     Business       0.48      0.47      0.47      3266
     Concepts       0.22      0.32      0.26      3257
      Culture       0.21      0.17      0.19      3263
    Education       0.38      0.55      0.45      3271
Entertainment       0.35      0.57      0.43      3281
       Events       0.39      0.35      0.37      3255
    Geography       0.63      0.39      0.48      3281
       Health       0.54      0.51      0.52      3269
      History       0.36      0.44      0.39      3346
   Humanities       0.27      0.16      0.20      3135
     Language       0.58      0.34      0.42      3235
          Law       0.42      0.59      0.49      3280
         Life       0.40      0.18      0.25      3124
  Mathematics       0.70      0.76      0.72      3207
       Nature       0.46 

# Support Vector Machine

In [11]:
from sklearn.linear_model import SGDClassifier

sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(X_train, y_train)

%time

y_pred = sgd.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

CPU times: user 16 µs, sys: 1e+03 ns, total: 17 µs
Wall time: 3.81 µs
accuracy 0.4548178989751098
               precision    recall  f1-score   support

         Arts       0.34      0.26      0.30      3249
     Business       0.49      0.58      0.53      3266
     Concepts       0.25      0.12      0.16      3257
      Culture       0.21      0.10      0.13      3263
    Education       0.47      0.57      0.52      3271
Entertainment       0.43      0.44      0.44      3281
       Events       0.40      0.42      0.41      3255
    Geography       0.53      0.57      0.55      3281
       Health       0.48      0.67      0.56      3269
      History       0.46      0.48      0.47      3346
   Humanities       0.26      0.13      0.17      3135
     Language       0.48      0.51      0.49      3235
          Law       0.45      0.59      0.51      3280
         Life       0.40      0.30      0.35      3124
  Mathematics       0.52      0.89      0.66      3207
       Nature       0

In [12]:
import eli5
eli5.show_weights(sgd)

NotImplementedError: transform_feature_names not available for TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])
logreg.fit(X_train, y_train)

%time

y_pred = logreg.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))



# Word to Vector with Logistic Regression

In [38]:
from gensim.models import Word2Vec
wv = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)
wv.init_sims(replace=True)

In [39]:
def word_averaging(wv, words):
    all_words, mean = set(), []
    
    for word in words:
        if isinstance(word, np.ndarray):
            mean.append(word)
        elif word in wv.vocab:
            mean.append(wv.syn0norm[wv.vocab[word].index])
            all_words.add(wv.vocab[word].index)

    if not mean:
        logging.warning("cannot compute similarity with no input %s", words)
        # FIXME: remove these examples in pre-processing
        return np.zeros(wv.vector_size,)

    mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)
    return mean

def  word_averaging_list(wv, text_list):
    return np.vstack([word_averaging(wv, post) for post in text_list ])

In [40]:
warnings.simplefilter('ignore')
def w2v_tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text, language='english'):
        for word in nltk.word_tokenize(sent, language='english'):
            if len(word) < 2:
                continue
            tokens.append(word)
    return tokens
    
train, test = train_test_split(df, test_size=0.3, random_state = 42)

test_tokenized = test.apply(lambda r: w2v_tokenize_text(r['text']), axis=1).values
train_tokenized = train.apply(lambda r: w2v_tokenize_text(r['text']), axis=1).values

X_train_word_average = word_averaging_list(wv,train_tokenized)
X_test_word_average = word_averaging_list(wv,test_tokenized)



In [41]:
warnings.simplefilter('ignore')
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(X_train_word_average, train['category'])
%time
y_pred = logreg.predict(X_test_word_average)
print('accuracy %s' % accuracy_score(y_pred, test.category))
print(classification_report(test.category, y_pred))

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 11.2 µs
accuracy 0.3900120336943442
               precision    recall  f1-score   support

         Arts       0.31      0.30      0.30       300
     Business       0.48      0.50      0.49       323
     Concepts       0.27      0.21      0.24       316
      Culture       0.20      0.12      0.15       319
    Education       0.47      0.53      0.50       301
Entertainment       0.34      0.46      0.39       324
       Events       0.32      0.30      0.31       328
    Geography       0.36      0.37      0.37       297
       Health       0.49      0.54      0.52       300
      History       0.37      0.39      0.38       285
   Humanities       0.23      0.15      0.18       313
     Language       0.44      0.40      0.42       332
          Law       0.43      0.52      0.47       320
         Life       0.30      0.24      0.27       294
  Mathematics       0.61      0.77      0.68       297
       Nature       0.36   

# Document to Vector with Logistic Regression

In [44]:
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import doc2vec
from sklearn import utils
import gensim
from gensim.models.doc2vec import TaggedDocument
import re

def label_sentences(corpus, label_type):
    """
    Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it.
    We do this by using the TaggedDocument method. The format will be "TRAIN_i" or "TEST_i" where "i" is
    a dummy index of the post.
    """
    labeled = []
    for i, v in enumerate(corpus):
        label = label_type + '_' + str(i)
        labeled.append(doc2vec.TaggedDocument(v.split(), [label]))
    return labeled
X_train, X_test, y_train, y_test = train_test_split(df.category, df.category, random_state=0, test_size=0.3)
X_train = label_sentences(X_train, 'Train')
X_test = label_sentences(X_test, 'Test')
all_data = X_train + X_test

In [45]:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, min_count=1, alpha=0.065, min_alpha=0.065)
model_dbow.build_vocab([x for x in tqdm(all_data)])

for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(all_data)]), total_examples=len(all_data), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 27699/27699 [00:00<00:00, 2750361.65it/s]
100%|██████████| 27699/27699 [00:00<00:00, 3830592.06it/s]
100%|██████████| 27699/27699 [00:00<00:00, 3154354.39it/s]
100%|██████████| 27699/27699 [00:00<00:00, 3460769.33it/s]
100%|██████████| 27699/27699 [00:00<00:00, 3260313.93it/s]
100%|██████████| 27699/27699 [00:00<00:00, 3393347.15it/s]
100%|██████████| 27699/27699 [00:00<00:00, 3375501.96it/s]
100%|██████████| 27699/27699 [00:00<00:00, 3395330.58it/s]
100%|██████████| 27699/27699 [00:00<00:00, 3385436.56it/s]
100%|██████████| 27699/27699 [00:00<00:00, 3195566.80it/s]
100%|██████████| 27699/27699 [00:00<00:00, 3361049.20it/s]
100%|██████████| 27699/27699 [00:00<00:00, 3339696.63it/s]
100%|██████████| 27699/27699 [00:00<00:00, 3476718.53it/s]
100%|██████████| 27699/27699 [00:00<00:00, 3397415.68it/s]
100%|██████████| 27699/27699 [00:00<00:00, 3382085.72it/s]
100%|██████████| 27699/27699 [00:00<00:00, 3412885.24it/s]
100%|██████████| 27699/27699 [00:00<00:00, 3283813.18it/

In [46]:
def get_vectors(model, corpus_size, vectors_size, vectors_type):
    """
    Get vectors from trained doc2vec model
    :param doc2vec_model: Trained Doc2Vec model
    :param corpus_size: Size of the data
    :param vectors_size: Size of the embedding vectors
    :param vectors_type: Training or Testing vectors
    :return: list of vectors
    """
    vectors = np.zeros((corpus_size, vectors_size))
    for i in range(0, corpus_size):
        prefix = vectors_type + '_' + str(i)
        vectors[i] = model.docvecs[prefix]
    return vectors
    
train_vectors_dbow = get_vectors(model_dbow, len(X_train), 300, 'Train')
test_vectors_dbow = get_vectors(model_dbow, len(X_test), 300, 'Test')

In [48]:
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(train_vectors_dbow, y_train)
logreg = logreg.fit(train_vectors_dbow, y_train)
%time
y_pred = logreg.predict(test_vectors_dbow)
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 11.9 µs
accuracy 0.9914560770156438
               precision    recall  f1-score   support

         Arts       0.99      0.98      0.99       284
     Business       1.00      1.00      1.00       298
     Concepts       0.99      0.98      0.99       329
      Culture       0.99      0.99      0.99       306
    Education       0.99      0.99      0.99       310
Entertainment       1.00      0.99      0.99       298
       Events       0.98      0.99      0.98       323
    Geography       0.99      0.99      0.99       315
       Health       1.00      0.99      1.00       315
      History       1.00      1.00      1.00       320
   Humanities       0.99      0.99      0.99       311
     Language       0.99      0.99      0.99       283
          Law       1.00      1.00      1.00       325
         Life       0.99      1.00      0.99       285
  Mathematics       0.99      0.99      0.99       287
       Nature       1.0

# Bag of Words Model with Neural Network

In [None]:
import itertools
import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils

train_posts, train_tags, test_posts, test_tags = train_test_split(X, y, test_size=0.25, random_state=1)


max_words = 1000
tokenize = text.Tokenizer(num_words=max_words, char_level=False)
tokenize.fit_on_texts(train_posts) # only fit on train

x_train = tokenize.texts_to_matrix(train_posts)
x_test = tokenize.texts_to_matrix(test_posts)

encoder = LabelEncoder()
encoder.fit(train_tags)
y_train = encoder.transform(train_tags)
y_test = encoder.transform(test_tags)

num_classes = np.max(y_train) + 1
y_train = utils.to_categorical(y_train, num_classes)
y_test = utils.to_categorical(y_test, num_classes)

batch_size = 32
epochs = 2

# Build the model
model = Sequential()
model.add(Dense(512, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
              
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)