### IDS 566
### Advanced Text Analytics
### Final Project - The 20 Newsgroup Data Analysis

In [1]:
from __future__ import print_function

import logging
import pandas as pd
import numpy as np
from optparse import OptionParser
import sys
from time import time
import matplotlib.pyplot as plt

from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn import metrics

ModuleNotFoundError: No module named 'pandas'

####  Importing the data

In [2]:
# Fetch the 'train' and 'test' subsets of the 20 Newsgroups data.
# Both calls shuffle with the same fixed seed (42) so document order is repeatable.
data_train, data_test = (
    fetch_20newsgroups(subset=split, shuffle=True, random_state=42)
    for split in ('train', 'test')
)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


Evaluating the number of rows in train and test Data

In [3]:
# Report document count and category count, training split first, then test.
for bunch in (data_train, data_test):
    print("%d documents" % len(bunch.filenames))
    print("%d categories" % len(bunch.target_names))

11314 documents
20 categories
7532 documents
20 categories


In [4]:
# Class labels of the two splits.
y_train = data_train.target
y_test = data_test.target

# Relative frequency of each label in the training split.
ls = pd.DataFrame({'Names': y_train})
ls['Names'].value_counts(normalize=True)

10    0.053032
15    0.052943
8     0.052855
9     0.052766
11    0.052590
13    0.052501
7     0.052501
14    0.052413
5     0.052413
12    0.052236
2     0.052236
3     0.052148
6     0.051706
1     0.051617
4     0.051087
17    0.049850
16    0.048259
0     0.042425
18    0.041100
19    0.033322
Name: Names, dtype: float64

### Cleaning the data to remove digits, punctuation and other non-letter characters

In [5]:
import re

# Delete every run of characters that is neither an ASCII letter nor whitespace
# (digits, punctuation, symbols); the letters on either side end up adjacent.
non_letter_runs = re.compile(r'[^a-zA-Z\s]+')
train_text = [non_letter_runs.sub('', doc) for doc in data_train.data]
test_text = [non_letter_runs.sub('', doc) for doc in data_test.data]

In [6]:
# Replace the raw documents stored under the "data" key with the cleaned text.
data_train.update(data=train_text)
data_test.update(data=test_text)

### Stemming the words to get the root word

In [7]:
def stem_words(text):
    """Return ``text`` with every whitespace-delimited token Snowball-stemmed.

    A token is a maximal run of non-whitespace characters, so words separated
    by newlines or tabs are stemmed one by one. Splitting on a single space
    (the previous approach) left such words glued together in one token, and
    they were never stemmed individually. All whitespace in ``text`` is kept
    exactly as it was.

    Parameters
    ----------
    text : str
        Document to process.

    Returns
    -------
    str
        The document with each token replaced by its English Snowball stem.
    """
    import re  # local import keeps this helper self-contained

    stemmer = SnowballStemmer('english')
    return re.sub(r'\S+', lambda match: stemmer.stem(match.group()), text)

def lemma_words(text):
    """Return ``text`` with every whitespace-delimited token lemmatized.

    A token is a maximal run of non-whitespace characters, so words separated
    by newlines or tabs are lemmatized one by one rather than being passed to
    the lemmatizer glued together (which is what splitting on a single space
    did). All whitespace in ``text`` is kept exactly as it was.

    Parameters
    ----------
    text : str
        Document to process.

    Returns
    -------
    str
        The document with each token replaced by the result of
        ``WordNetLemmatizer().lemmatize(token)``.
    """
    import re  # local import keeps this helper self-contained

    lemmer = WordNetLemmatizer()
    return re.sub(r'\S+', lambda match: lemmer.lemmatize(match.group()), text)

Processing for training data

In [8]:
# Stem and lemmatize each cleaned training document (one pass, stem first).
train_pairs = [(stem_words(doc), lemma_words(doc)) for doc in data_train.data]
stemmed_text = [stemmed for stemmed, _ in train_pairs]
lemmed_text = [lemmed for _, lemmed in train_pairs]

LookupError: 
**********************************************************************
  Resource [93mwordnet[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('wordnet')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/wordnet[0m

  Searched in:
    - '/home/root-user/nltk_data'
    - '/home/root-user/anaconda3/nltk_data'
    - '/home/root-user/anaconda3/share/nltk_data'
    - '/home/root-user/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
# Store the processed training documents alongside the cleaned text.
data_train.update(stemmed_data=stemmed_text, lemmed_data=lemmed_text)

Processing for test data

In [None]:
# Stem and lemmatize each cleaned test document (one pass, stem first).
test_pairs = [(stem_words(doc), lemma_words(doc)) for doc in data_test.data]
stemmed_text = [stemmed for stemmed, _ in test_pairs]
lemmed_text = [lemmed for _, lemmed in test_pairs]

In [None]:
# Store the processed test documents alongside the cleaned text.
data_test.update(stemmed_data=stemmed_text, lemmed_data=lemmed_text)

##### Creating count Vectorizer

In [None]:
# Bag-of-words counts on the cleaned, *unstemmed* documents. This matrix feeds
# the Naive Bayes model reported as "No Stemming", so it must not be built
# from stemmed_data.
vectorizer = count_vect = CountVectorizer(stop_words='english')
X_train_counts = count_vect.fit_transform(data_train.data)
# Reuse the vocabulary learned on the training split for the test split.
X_test_counts = count_vect.transform(data_test.data)
print(X_train_counts.shape)
print(X_test_counts.shape)

##### Creating TFIDF Vectorizer

In [None]:
# TF-IDF features on the stemmed documents.
# max_df must be the float 1.0 (a proportion: keep terms found in up to 100% of
# documents). The integer 1 is read as an absolute count and would discard
# every term that occurs in more than one document.
vectorizer_tfidf = TfidfVectorizer(sublinear_tf=True, max_df=1.0, stop_words='english')
X_train_tfidf = vectorizer_tfidf.fit_transform(data_train.stemmed_data)
# Transform the *test* split (not the training split again) with the fitted vectorizer.
X_test_tfidf = vectorizer_tfidf.transform(data_test.stemmed_data)
print(X_train_tfidf.shape)
print(X_test_tfidf.shape)

In [None]:
# Total feature-vector length, i.e. the vocabulary size of the fitted TF-IDF vectorizer.
feature_names_tfidf = vectorizer_tfidf.get_feature_names()
vocabulary_size = len(feature_names_tfidf)
vocabulary_size

## Model 1 - Naive Bayes

Naive bayes with CountVectorizer and unstemmed data

In [None]:
# Multinomial Naive Bayes with default settings, trained on the count matrix.
# (MultinomialNB is already imported in the notebook's import cell.)
NB = MultinomialNB().fit(X_train_counts, data_train.target)

In [None]:
# Predict both splits with the count-based Naive Bayes model.
NB_predicted_train = NB.predict(X_train_counts)
NB_predicted = NB.predict(X_test_counts)

print("Naive Bayes - No Tuning - No Stemming")
print("Training Accuracy:", np.mean(NB_predicted_train == y_train))
# This line scores the held-out split, so it is labelled as test accuracy.
print("Test Accuracy:", np.mean(NB_predicted == y_test))

In [None]:
# Notebook-level stemmer used by StemmedCountVectorizer.
# (SnowballStemmer is already imported in the notebook's import cell.)
# NOTE(review): ignore_stopwords=True presumably needs NLTK corpus data to be
# installed (e.g. via nltk.download) — confirm on a fresh environment.
stemmer = SnowballStemmer("english", ignore_stopwords=True)

In [None]:
class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer stems every token it produces.

    Uses the notebook-level ``stemmer`` object, which must be defined before
    any document is analyzed.
    """

    def build_analyzer(self):
        """Return a callable that runs the parent analyzer and stems each token."""
        base_analyzer = super(StemmedCountVectorizer, self).build_analyzer()

        def stemmed_analyzer(doc):
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer

Naive Bayes using a pipeline with stemmed data

In [None]:
# Stemming happens inside the vectorizer, so the pipeline is fed the cleaned
# documents from the "data" key directly.
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

text_mnb_stemmed = Pipeline(steps=[
    ('vect', stemmed_count_vect),
    ('tfidf', TfidfTransformer()),
    ('mnb', MultinomialNB(fit_prior=False)),
])
text_mnb_stemmed.fit(data_train.data, y_train)

predicted_mnb_stemmed = text_mnb_stemmed.predict(data_test.data)
predicted_mnb_stemmed_train = text_mnb_stemmed.predict(data_train.data)

In [None]:
# Fraction of correctly predicted labels on each split.
mnb_stemmed_train_hits = predicted_mnb_stemmed_train == y_train
mnb_stemmed_test_hits = predicted_mnb_stemmed == y_test

print("Naive Bayes -No Tuning - Stemming")
print("Training Accuracy:", np.mean(mnb_stemmed_train_hits))
print("Test Accuracy:", np.mean(mnb_stemmed_test_hits))


We observe that the model with stemming and TF-IDF performs better. Therefore, all models going forward use stemmed data and the TF-IDF vectorizer.

#### Parameter Tuning - Naive Bayes - using GridSearch without Stemming

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: unigrams vs. unigrams+bigrams, IDF weighting on/off, and four
# smoothing strengths for the Naive Bayes step.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-1, 1e-2, 1e-3,1e-4),
}

text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
# NOTE(review): the grid search below fits the pipeline itself, so this
# standalone fit looks redundant — confirm nothing else relies on it.
text_clf.fit(data_train.data, y_train)

gs_clf = GridSearchCV(estimator=text_clf, param_grid=parameters, n_jobs=-1).fit(
    data_train.data, data_train.target
)

In [None]:
### Finding the best score and best parameters
# Print both: a bare expression is only echoed when it is the last line of a
# cell, so best_score_ was previously never shown.
print(gs_clf.best_score_)
print(gs_clf.best_params_)

In [None]:
# Predict labels for the test and training splits with the tuned model.
predicted_gs_clf, predicted_gs_clf_train = (
    gs_clf.predict(docs) for docs in (data_test.data, data_train.data)
)

In [None]:
print("Naive Bayes -With Tuning - Without Stemming")
# Labels now match the values: training predictions vs. y_train, test
# predictions vs. y_test (they were swapped before).
print("Training Accuracy:",np.mean(predicted_gs_clf_train == y_train))
print("Test Accuracy:",np.mean(predicted_gs_clf == y_test))

#### Parameter Tuning - Naive Bayes - using GridSearch with Stemming

In [None]:
# Same search space as the unstemmed run, applied to the pre-stemmed documents.
# (GridSearchCV is already imported in the earlier tuning cell.)
parameters = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), 'clf__alpha': (1e-1, 1e-2, 1e-3,1e-4), }

text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])
# NOTE(review): the grid search below fits the pipeline itself, so this
# standalone fit looks redundant — confirm nothing else relies on it.
text_clf = text_clf.fit(data_train.stemmed_data, y_train)

gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(data_train.stemmed_data, data_train.target)

In [None]:
# Best cross-validated score, then the parameter combination that achieved it.
print(gs_clf.best_score_, gs_clf.best_params_, sep="\n")

In [None]:
# Prediction on the training and test splits (stemmed documents).
predicted_gs_clf_train, predicted_gs_clf = (
    gs_clf.predict(docs)
    for docs in (data_train.stemmed_data, data_test.stemmed_data)
)

In [None]:
# Fraction of correctly predicted labels on each split.
print("Naive Bayes -With Tuning - Stemming")
for label, hits in (
    ("Training Accuracy:", predicted_gs_clf_train == y_train),
    ("Test Accuracy:", predicted_gs_clf == y_test),
):
    print(label, np.mean(hits))


## Model - 2 Support Vector Machines

SVM model without stemming

In [None]:
# Linear SVM (hinge loss, L2 penalty) trained with SGD on TF-IDF features.
# (SGDClassifier is already imported in the notebook's import cell.)
# max_iter=5 with tol=None runs exactly five epochs, which is what the removed
# `n_iter=5` argument did.
text_clf_svm = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                                                   max_iter=5, tol=None, random_state=42))])

# This is the "No Stemming" baseline, so it is trained and evaluated on the
# cleaned documents rather than on stemmed_data.
text_clf_svm = text_clf_svm.fit(data_train.data, data_train.target)
### Prediction on test and train data
predicted_svm_train = text_clf_svm.predict(data_train.data)
predicted_svm = text_clf_svm.predict(data_test.data)


In [None]:

# Fraction of correctly predicted labels on each split.
svm_train_hits = predicted_svm_train == data_train.target
svm_test_hits = predicted_svm == data_test.target

print("Support Vector Machine -No Tuning - No Stemming")
print("Training Accuracy:", np.mean(svm_train_hits))
print("Test Accuracy:", np.mean(svm_test_hits))

#### Parameter Tuning Using SVM - Without Stemming

In [None]:
# Base SVM pipeline for the unstemmed grid search.
# max_iter=5 with tol=None runs exactly five epochs, which is what the removed
# `n_iter=5` argument did.
text_clf_svm = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                              max_iter=5, tol=None, random_state=42)),
])

text_clf_svm = text_clf_svm.fit(data_train.data, y_train)

In [None]:
# Search space: n-gram range, IDF weighting on/off, and four regularisation
# strengths for the SVM step. (GridSearchCV is already imported in an earlier cell.)
parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), 'clf-svm__alpha': (1e-1, 1e-2, 1e-3,1e-4 ), }

gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(data_train.data, data_train.target)

### Finding the best score and best parameters
# Print both: a bare expression is only echoed when it is the last line of a
# cell, so best_score_ was previously never shown.
print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)

In [None]:
### Predition on test and train data
glf_predicted_svm_train = gs_clf_svm.predict(data_train.data)
glf_predicted_svm = gs_clf_svm.predict(data_test.data)


In [None]:
# Fraction of correctly predicted labels on each split.
print("Support Vector Machine -With Tuning - No Stemming")
for label, hits in (
    ("Training Accuracy:", glf_predicted_svm_train == data_train.target),
    ("Test Accuracy:", glf_predicted_svm == data_test.target),
):
    print(label, np.mean(hits))

#### Parameter Tuning Using SVM - Using Stemming

In [None]:
# Base SVM pipeline for the stemmed grid search.
# max_iter=5 with tol=None runs exactly five epochs, which is what the removed
# `n_iter=5` argument did.
text_clf_svm = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                              max_iter=5, tol=None, random_state=42)),
])

text_clf_svm = text_clf_svm.fit(data_train.stemmed_data, y_train)

In [None]:
# Search space: n-gram range, IDF weighting on/off, and four regularisation
# strengths for the SVM step. (GridSearchCV is already imported in an earlier cell.)
parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), 'clf-svm__alpha': (1e-1, 1e-2, 1e-3,1e-4 ), }

gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(data_train.stemmed_data, data_train.target)

### Finding the best score and best parameters
# Print both: a bare expression is only echoed when it is the last line of a
# cell, so best_score_ was previously never shown.
print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)

In [None]:
### Prediction on train and test data
glf_predicted_svm_train = gs_clf_svm.predict(data_train.stemmed_data)
glf_predicted_svm = gs_clf_svm.predict(data_test.stemmed_data)

In [None]:
print("Support Vector Machine -With Tuning - with Stemming")
print("Training Accuracy:",np.mean(glf_predicted_svm_train == data_train.target))
# This line scores the held-out split, so it is labelled as test accuracy.
print("Test Accuracy:",np.mean(glf_predicted_svm == data_test.target))

## Model 3  - Logistic Regression with Regularization - Ridge and Lasso

#### Parameter Tuning without Stemming

In [None]:
# Bag-of-words -> TF-IDF -> logistic regression trained with SGD.
count_vect = CountVectorizer(stop_words='english',preprocessor=None)
vectorizer_tfidf = TfidfTransformer()

# random_state pins the SGD run so the grid-search scores are repeatable,
# matching the seeded SGDClassifier used in the SVM pipelines.
# NOTE(review): recent scikit-learn releases spell this loss 'log_loss' —
# confirm against the installed version before upgrading.
model = Pipeline([
    ("CVectorizer", count_vect),
    ("TF_IDF", vectorizer_tfidf),
    ("SGDC", SGDClassifier(loss='log', max_iter=1000, tol=1e-3, random_state=42)),
])
parameters = {
    'CVectorizer__max_df':(0.1,),
    'CVectorizer__ngram_range': [(1, 1), (1, 2)],
    # 'l1' is the lasso-style penalty, 'l2' the ridge-style penalty.
    'SGDC__penalty':('l1','l2'),
    'SGDC__alpha':(0.1,0.01,0.0001,1e-5,1e-10)
}

In [None]:
# Grid search over `parameters` on the cleaned (unstemmed) training documents.
GSCV = GridSearchCV(estimator=model, param_grid=parameters, n_jobs=-1)
GSCV.fit(data_train.data, y=data_train.target)


In [None]:
# Print both: a bare expression is only echoed when it is the last line of a
# cell, so best_score_ was previously never shown.
print(GSCV.best_score_)
print(GSCV.best_params_)

In [None]:
# Predict labels for the training and test splits with the tuned model.
preds_train, preds = (
    GSCV.predict(docs) for docs in (data_train.data, data_test.data)
)

In [None]:
# Label the two numbers so this output can be told apart from the stemmed run
# and reads like the other models' result cells.
print("Logistic Regression - With Tuning - No Stemming")
print("Training Accuracy:", np.mean(preds_train == data_train.target))
print("Test Accuracy:", np.mean(preds == data_test.target))

#### Parameter Tuning with Stemming

In [None]:
# Grid search over `parameters` on the stemmed training documents.
GSCV = GridSearchCV(estimator=model, param_grid=parameters, n_jobs=-1)
GSCV.fit(data_train.stemmed_data, y=data_train.target)

In [None]:
# Print both: a bare expression is only echoed when it is the last line of a
# cell, so best_score_ was previously never shown.
print(GSCV.best_score_)
print(GSCV.best_params_)

In [None]:
# Predict labels for the training and test splits (stemmed documents).
preds_train, preds = (
    GSCV.predict(docs)
    for docs in (data_train.stemmed_data, data_test.stemmed_data)
)

In [None]:
# Label the two numbers so this output can be told apart from the unstemmed
# run and reads like the other models' result cells.
print("Logistic Regression - With Tuning - Stemming")
print("Training Accuracy:", np.mean(preds_train == data_train.target))
print("Test Accuracy:", np.mean(preds == data_test.target))