# Importing all the necessary libraries

In [None]:
from nltk.corpus import reuters 
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd
import numpy as np

'''
  https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html
  https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
  https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
'''

'\n  https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html\n  https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html\n  https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html\n'

In [None]:
# Download the Reuters corpus data that `nltk.corpus.reuters` reads from.
import nltk
nltk.download('reuters')

[nltk_data] Downloading package reuters to /root/nltk_data...


True

In [None]:
# Download the stop-word lists; `stopwords.words('english')` is used later
# to build the stop-word set for text cleaning.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Download the 'punkt' tokenizer package.
# NOTE(review): presumably required by `word_tokenize`, which clean_text calls — confirm.
import nltk
nltk.download('punkt')
  

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Loading Data

In [None]:
# Shared label binarizer: train_test_split() fits it on the training
# documents' categories and reuses it to transform the test categories.
mlb = MultiLabelBinarizer()

In [None]:
def collection_stats():
  """Print summary counts for the NLTK Reuters corpus.

  Prints, one per line: the total number of documents, the number whose
  file id starts with "train", the number whose file id starts with "test",
  and the number of categories. Returns nothing.
  """
  file_ids = reuters.fileids()
  print(len(file_ids), "documents")

  # The corpus encodes its split in the file-id prefix.
  n_train = sum(1 for file_id in file_ids if file_id.startswith("train"))
  print(n_train, "total train documents")

  n_test = sum(1 for file_id in file_ids if file_id.startswith("test"))
  print(n_test, "total test documents")

  print(len(reuters.categories()), "categories")

In [None]:
# Print document, train/test and category counts for the corpus.
collection_stats()

10788 documents
7769 total train documents
3019 total test documents
90 categories


# Train Test Split of Data

In [None]:
def train_test_split():
  """Load Reuters texts and binarized labels using the corpus' own split.

  Documents are assigned to train or test by the prefix of their file id
  ("train..." / "test..."). Labels are the document categories encoded
  with the module-level `mlb` binarizer, which is fitted on the training
  categories only and then reused to transform the test categories.

  Note: this function shares its name with
  `sklearn.model_selection.train_test_split` but takes no arguments and
  does no random splitting.

  Returns:
    A 4-tuple in this order (the caller unpacks it positionally):
    (train_texts, test_texts, train_labels, test_labels), where the texts
    are lists of raw document strings and the labels are the outputs of
    `mlb.fit_transform` / `mlb.transform`.
  """
  documents = reuters.fileids()
  train_docs = [document for document in documents if document.startswith("train")]
  test_docs = [document for document in documents if document.startswith("test")]

  train_texts = [reuters.raw(doc_id) for doc_id in train_docs]
  test_texts = [reuters.raw(doc_id) for doc_id in test_docs]

  # Fit on train only, then apply the same label encoding to test.
  train_labels = mlb.fit_transform([reuters.categories(doc_id) for doc_id in train_docs])
  test_labels = mlb.transform([reuters.categories(doc_id) for doc_id in test_docs])

  return train_texts, test_texts, train_labels, test_labels

In [None]:
# train_test_split() returns, in order: train texts, test texts,
# train label matrix, test label matrix.
x_train, x_test, y_train, y_test = train_test_split()

# Data Preprocessing

In [None]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [None]:
# English stop words, held in a set; clean_text() drops any token found in it.
stop_words = set(stopwords.words('english'))

In [None]:
def clean_text(X_train):
  """Normalize raw documents into space-joined, stop-word-free tokens.

  For each document: remove URLs, replace every non-letter character with
  a space, lowercase, tokenize with `word_tokenize`, drop tokens present
  in the module-level `stop_words` set, and join the rest with spaces.

  Args:
    X_train: iterable of raw document strings. Despite the name, it is
      applied to both the train and the test texts.

  Returns:
    A list of cleaned strings, one per input document, in input order.
  """
  clean = []
  for term in X_train:
    # Remove URLs before punctuation is stripped; `https?` covers both
    # http:// and https:// links (the previous pattern missed plain http).
    term = re.sub(r'https?\S+', '', term)
    # Keep letters only; digits and punctuation become separators.
    term = re.sub('[^a-zA-Z]', ' ', term)
    term = str(term).lower()
    tokens = word_tokenize(term)
    tokens = [item for item in tokens if item not in stop_words]
    clean.append(' '.join(tokens))
  return clean

  

In [None]:
# Cleaned versions of the raw train/test texts (capital X = cleaned text).
X_train = clean_text(x_train)
X_test = clean_text(x_test)

# Building TF-IDF representation of text

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
# TF-IDF representation of the cleaned text using the vectorizer imported above.
# The vocabulary is fitted on the training text only and reused for the test text.
vectorizer = TfidfVectorizer()
x_tf_train = vectorizer.fit_transform(X_train)
x_tf_test = vectorizer.transform(X_test)

In [None]:
# Show the shapes of the train and test TF-IDF matrices.
print(x_tf_train.shape)
print(x_tf_test.shape)

(7769, 24554)
(3019, 24554)


# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from sklearn.svm import SVC

## Fit and Predict Model

In [None]:
# Logistic regression with default settings, wrapped in OneVsRestClassifier
# so it can be fitted against the binarized multi-label target matrix.
lr = LogisticRegression()
ovr = OneVsRestClassifier(lr)


In [None]:
# Fit on the training TF-IDF matrix and training label matrix.
ovr.fit(x_tf_train, y_train)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [None]:
# Logistic-regression predictions for the test TF-IDF matrix.
y_predict = ovr.predict(x_tf_test)

In [None]:
# Compare the true and predicted label rows for the first test document.
print(y_test[0])
print(y_predict[0])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]


## Classification report on training data

In [None]:
print("Logistic Regression Result word on Train")
# classification_report expects (y_true, y_pred). Passing predictions first
# swaps precision and recall and reports support for predicted, not true, labels.
print(classification_report(y_train, ovr.predict(x_tf_train)))

Logistic Regression Result word on Train
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      1619
           1       0.00      0.00      0.00         0
           2       0.05      0.67      0.10         3
           3       0.35      1.00      0.51        26
           4       0.06      1.00      0.11         3
           5       0.00      0.00      0.00         0
           6       0.45      1.00      0.62        25
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.68      1.00      0.81        75
          10       0.11      1.00      0.19         5
          11       0.00      0.00      0.00         0
          12       0.49      0.99      0.65        89
          13       0.03      1.00      0.05         1
          14       0.00      0.00      0.00         0
          15       0.41      1.00      0.58        28
          16       0.00      0.00      0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Classification Report on testing data

In [None]:
print("Logistic Regression Result word on Test")
# classification_report expects (y_true, y_pred). Passing predictions first
# swaps precision and recall and reports support for predicted, not true, labels.
print(classification_report(y_test, ovr.predict(x_tf_test)))

Logistic Regression Result word on Test
              precision    recall  f1-score   support

           0       0.92      0.99      0.95       670
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.13      1.00      0.24         4
           4       0.06      1.00      0.11         1
           5       0.00      0.00      0.00         0
           6       0.44      1.00      0.62         8
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.64      1.00      0.78        18
          10       0.06      1.00      0.11         1
          11       0.00      0.00      0.00         0
          12       0.45      0.93      0.60        27
          13       0.00      0.00      0.00         0
          14       0.00      0.00      0.00         0
          15       0.14      1.00      0.25         4
          16       0.00      0.00      0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.multiclass import OneVsRestClassifier

## Fit and Predict Model

In [None]:
# Multinomial naive Bayes with default settings, wrapped in
# OneVsRestClassifier for the binarized multi-label target matrix.
mulno = MultinomialNB()
nbClassifier = OneVsRestClassifier(mulno)

In [None]:
# Fit on the training TF-IDF matrix and training label matrix.
nbClassifier.fit(x_tf_train, y_train)

OneVsRestClassifier(estimator=MultinomialNB(alpha=1.0, class_prior=None,
                                            fit_prior=True),
                    n_jobs=None)

In [None]:
# Naive-Bayes predictions for the test TF-IDF matrix.
y_pred=nbClassifier.predict(x_tf_test)

In [None]:
# Compare the true and predicted label rows for the first test document.
print(y_test[0])
print(y_pred[0])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


## Classification Report on training data

In [None]:
print("Naive Bayes Classifier Result word on Train")
# classification_report expects (y_true, y_pred). Passing predictions first
# swaps precision and recall and reports support for predicted, not true, labels.
print(classification_report(y_train, nbClassifier.predict(x_tf_train)))

Naive Bayes Classifier Result word on Train
              precision    recall  f1-score   support

           0       0.75      1.00      0.86      1246
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         0
          12       0.03      1.00      0.05         5
          13       0.00      0.00      0.00         0
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       0.00      0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Classification report on testing data

In [None]:
print("Naive Bayes Classifier Result word on Test")
# classification_report expects (y_true, y_pred). Passing predictions first
# swaps precision and recall and reports support for predicted, not true, labels.
print(classification_report(y_test, nbClassifier.predict(x_tf_test)))

Naive Bayes Classifier Result word on Test
              precision    recall  f1-score   support

           0       0.50      1.00      0.67       364
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         0
          12       0.05      1.00      0.10         3
          13       0.00      0.00      0.00         0
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       0.00      0.00     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
