# Importing necessary libraries

In [None]:
from nltk.corpus import reuters
from sklearn.preprocessing import MultiLabelBinarizer
import spacy
import numpy as np

'''
  References:
  https://spacy.io/usage/vectors-similarity
  https://spacy.io/usage/linguistic-features#named-entities
  https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html
  https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html
  https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
'''

'\n  References:\n  https://spacy.io/usage/vectors-similarity\n  https://spacy.io/usage/linguistic-features#named-entities\n  https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html\n  https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html\n  https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html\n'

In [None]:
# One-time setup: download the Reuters corpus read through nltk.corpus.reuters
# and the en_core_web_lg spaCy model that is loaded below with spacy.load().
import nltk
nltk.download('reuters')
!python -m spacy download en_core_web_lg

[nltk_data] Downloading package reuters to /root/nltk_data...
Collecting en_core_web_lg==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9MB)
[K     |████████████████████████████████| 827.9MB 1.1MB/s 
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-cp36-none-any.whl size=829180945 sha256=c22e4ab9a2013e08f7c69d7a8c9b63306746818ce0d2e87bcbd748c2778de03c
  Stored in directory: /tmp/pip-ephem-wheel-cache-3a57hhp0/wheels/2a/c1/a6/fc7a877b1efca9bc6a089d6f506f16d3868408f9ff89f8dbfc
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


# Loading Data 

In [None]:
# Load the spaCy pipeline once; get_entities() and get_word_vectors() both call it.
nlp = spacy.load('en_core_web_lg')

In [None]:
# Shared label binarizer: train_test_split() fits it on the training categories
# and reuses the same fitted instance to encode the test categories.
mlb = MultiLabelBinarizer()

In [None]:
def collection_stats():
  """Print document, train/test and category counts for the Reuters corpus."""
  file_ids = reuters.fileids()
  print(f"{len(file_ids)} documents")

  # File ids beginning with "train" / "test" identify the predefined split.
  n_train = sum(1 for file_id in file_ids if file_id.startswith("train"))
  print(f"{n_train} total train documents")

  n_test = sum(1 for file_id in file_ids if file_id.startswith("test"))
  print(f"{n_test} total test documents")

  print(f"{len(reuters.categories())} categories")

In [None]:
# Print corpus size, the sizes of the predefined train/test split and the category count.
collection_stats()

10788 documents
7769 total train documents
3019 total test documents
90 categories


# Train Test Split of data

In [None]:
def train_test_split():
  """Split the Reuters corpus along its predefined train/test file ids.

  Returns:
    A 4-tuple ``(train_texts, test_texts, train_labels, test_labels)``:
    the texts are lists of raw document strings, and the labels are the
    encodings produced by the module-level ``mlb`` from each document's
    categories.

  The positional order (both text lists first, then both label matrices)
  is what callers unpack, so it must not be changed.
  """
  documents = reuters.fileids()
  train_docs = [document for document in documents if document.startswith("train")]
  test_docs = [document for document in documents if document.startswith("test")]

  train_texts = [reuters.raw(doc_id) for doc_id in train_docs]
  test_texts = [reuters.raw(doc_id) for doc_id in test_docs]

  # Fit the binarizer on the training categories only, then reuse the fitted
  # instance so the test labels share the same column layout.
  train_labels = mlb.fit_transform([reuters.categories(doc_id) for doc_id in train_docs])
  test_labels = mlb.transform([reuters.categories(doc_id) for doc_id in test_docs])
  return train_texts, test_texts, train_labels, test_labels

In [None]:
# train_test_split() returns (train texts, test texts, train labels, test labels),
# hence the x_train, x_test, y_train, y_test unpacking order.
x_train, x_test, y_train, y_test = train_test_split()

# Functions for getting entities and word vectors

In [None]:
def get_entities(text):
  """Return the text of each named entity that the spaCy pipeline finds in ``text``."""
  doc = nlp(text)
  return [entity.text for entity in doc.ents]

In [None]:
def get_word_vectors(sentence):
  """Return the element-wise sum of the spaCy word vectors of ``sentence``.

  Args:
    sentence: Text to run through the module-level ``nlp`` pipeline.

  Returns:
    A 1-D numpy array with one entry per word-vector dimension. When
    ``sentence`` yields no tokens (e.g. an empty string because a document
    had no entities) a zero vector of the same dimension is returned, so
    the result is deterministic across runs.
  """
  tokens = nlp(sentence)
  if len(tokens) == 0:
    # Summing an empty list would give a 0-d scalar; return an explicit
    # all-zero vector sized from the loaded model instead of random noise.
    return np.zeros(nlp.vocab.vectors_length, dtype=np.float32)
  return np.sum([token.vector for token in tokens], axis=0)

## Build vector representations for train data

In [None]:
# Sanity check on the first training document: extract its entities, join them
# into a single string and display the shape of the resulting vector.
entities=get_entities(x_train[0])
get_word_vectors(' '.join(entities)).shape

(300,)

In [None]:
# One feature vector per training document: the entity texts are joined with
# spaces, re-parsed by spaCy, and their token vectors are summed.
x_net_train=[get_word_vectors(' '.join(get_entities(doc))) for doc in x_train]

In [None]:
# Shape check: (number of training documents, vector dimension).
np.shape(x_net_train)

(7769, 300)

## Build vector representations for test data

In [None]:
# Sanity check on the first test document: extract its entities, join them
# into a single string and display the shape of the resulting vector.
entities=get_entities(x_test[0])
get_word_vectors(' '.join(entities)).shape

(300,)

In [None]:
# One feature vector per test document, built the same way as x_net_train.
x_net_test=[get_word_vectors(' '.join(get_entities(doc))) for doc in x_test]

In [None]:
# Shape check: (number of test documents, vector dimension).
np.shape(x_net_test)

(3019, 300)

# Naive Bayes classifier

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.multiclass import OneVsRestClassifier

## Fit and predict using Naive Bayes classifier

In [None]:
# One-vs-rest wrapper around GaussianNB, fitted on the entity vectors
# (x_net_train) against the binarized category labels (y_train).
nb=OneVsRestClassifier(GaussianNB())
nb.fit(x_net_train,y_train)

OneVsRestClassifier(estimator=GaussianNB(priors=None, var_smoothing=1e-09),
                    n_jobs=None)

In [None]:
# Naive Bayes predictions for the test feature vectors.
y_pred1=nb.predict(x_net_test)

In [None]:
# Inspect the predicted label row for the first test document.
y_pred1[0]

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 0])

## Print classification report

In [None]:
# Per-label precision/recall/F1 for Naive Bayes on the test set.
# NOTE(review): the row numbers are label column indices, presumably the
# positions in mlb.classes_ — confirm before mapping them to category names.
print(classification_report(y_true=y_test, y_pred=y_pred1))

              precision    recall  f1-score   support

           0       0.34      0.89      0.49       719
           1       0.01      0.65      0.02        23
           2       0.05      0.36      0.09        14
           3       0.04      0.43      0.08        30
           4       0.04      0.83      0.08        18
           5       0.00      0.00      0.00         1
           6       0.05      0.78      0.10        18
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         3
           9       0.05      0.50      0.09        28
          10       0.02      0.61      0.04        18
          11       0.00      0.00      0.00         1
          12       0.05      0.34      0.08        56
          13       0.04      0.35      0.07        20
          14       0.00      0.00      0.00         2
          15       0.19      0.64      0.29        28
          16       0.00      0.00      0.00         1
          17       0.15    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

## Fit and predict using Logistic Regression classifier

In [None]:
# n_jobs is set on the OneVsRestClassifier wrapper so the per-label binary fits
# run in parallel. On the inner LogisticRegression it only governs parallelism
# over classes, and each wrapped estimator here is fitted on a single binary
# target, so it brought no speed-up there. The fitted models are unaffected.
lr=OneVsRestClassifier(LogisticRegression(), n_jobs=5)
lr.fit(x_net_train,y_train)


OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto', n_jobs=5,
                                                 penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [None]:
# Logistic Regression predictions for the test feature vectors.
y_pred2=lr.predict(x_net_test)

## Print classification report

In [None]:
# Per-label precision/recall/F1 for Logistic Regression on the test set,
# computed against the same y_test as the Naive Bayes report for comparison.
print(classification_report(y_true=y_test, y_pred=y_pred2))

              precision    recall  f1-score   support

           0       0.88      0.79      0.83       719
           1       0.33      0.13      0.19        23
           2       0.43      0.43      0.43        14
           3       0.26      0.23      0.25        30
           4       0.09      0.11      0.10        18
           5       0.00      0.00      0.00         1
           6       0.50      0.39      0.44        18
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         3
           9       0.53      0.61      0.57        28
          10       0.27      0.33      0.30        18
          11       0.00      0.00      0.00         1
          12       0.44      0.54      0.48        56
          13       0.30      0.30      0.30        20
          14       0.00      0.00      0.00         2
          15       0.33      0.29      0.31        28
          16       0.00      0.00      0.00         1
          17       0.67    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
