In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline
import string
import spacy
from collections import Counter

from nltk.corpus import gutenberg
import re
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

# Load the English spaCy pipeline used for tokenising and lemmatising.
# The 'en' shortcut link is not available on every spaCy install (it was
# removed in spaCy 3), so fall back to the packaged model name.
# NOTE(review): assumes 'en' pointed at en_core_web_sm - confirm.
try:
    nlp = spacy.load('en')
except OSError:
    nlp = spacy.load('en_core_web_sm')

In [2]:
def text_cleaner(text):
    """Drop everything up to and including the first blank line of `text`.

    The first occurrence of two consecutive newlines is treated as the end of
    the leading header block; the text after it is returned.

    Parameters
    ----------
    text : str
        Raw file contents.

    Returns
    -------
    str
        The text following the first blank line. If `text` contains no blank
        line there is nothing to strip and `text` is returned unchanged
        (previously this case returned None, or raised IndexError when the
        text ended in a single newline).
    """
    _header, separator, body = text.partition('\n\n')
    return body if separator else text

def bow_features(data, common_words):
    """Count how often each common word occurs in every document.

    Parameters
    ----------
    data : pandas.DataFrame
        Column 0 holds one iterable of tokens per document, column 1 holds the
        document labels.
    common_words : sequence
        Vocabulary to count; each entry gets its own row of counts.

    Returns
    -------
    list of list
        `bow[0]` is the token lists, `bow[1]` the labels, and `bow[k + 2]` the
        per-document counts (floats) of `common_words[k]`.
    """
    documents = list(data.iloc[:, 0])
    bow = [documents, list(data.iloc[:, 1])]

    # One zero-filled row per vocabulary word (np.zeros keeps the counts float).
    for _ in range(len(common_words)):
        bow.append(list(np.zeros(len(documents))))

    # Map each word to every position it occupies in common_words so a token is
    # resolved with one dict lookup instead of a scan over the whole vocabulary.
    positions_by_word = {}
    for position, common_word in enumerate(common_words):
        positions_by_word.setdefault(common_word, []).append(position)

    for doc_index, tokens in enumerate(documents):
        for token in tokens:
            for position in positions_by_word.get(token, ()):
                bow[position + 2][doc_index] += 1

    return bow

In [3]:
# Read every message under root_dir; each sub-directory name is used as the
# category label of the files it contains.
root_dir = '/Data Science/20_newsgroups'

texts = []
category = []

for filename in os.listdir(root_dir):
    if filename == '.DS_Store':
        continue
    group_dir = os.path.join(root_dir, filename)
    for filename2 in os.listdir(group_dir):
        try:
            # The context manager closes the handle even if reading fails.
            with open(os.path.join(group_dir, filename2)) as x:
                raw = x.read()
            body = text_cleaner(raw)
        except (OSError, UnicodeDecodeError, IndexError):
            # Best effort: skip files that cannot be read, decoded or cleaned.
            # NOTE(review): files that fail to decode with the platform default
            # encoding are dropped silently - confirm that is intended.
            continue
        if body is None:
            # No body found; skipping keeps texts and category aligned and
            # avoids a None entry breaking the preprocessing step.
            continue
        texts.append(body)
        category.append(filename)

# Lemmas that are filtered out of every document.
badwords = ['write', 'article', 'know', 'like', 'think', 'thank']

# Turn each raw body into a list of filtered, lower-cased lemmas.
new_texts = []
for raw_text in texts:
    # Collapse all whitespace runs to single spaces before lower-casing.
    doc = nlp(' '.join(raw_text.split()).lower())
    lemmas = [token.lemma_ for token in doc if not (token.is_stop or token.is_punct)]

    # Keep alphabetic lemmas of 3-14 characters that are not in badwords.
    kept = [
        lemma
        for lemma in lemmas
        if 2 < len(lemma) < 15 and lemma.isalpha() and lemma not in badwords
    ]
    new_texts.append(kept)

# One row per document: token list plus its category label.
data = pd.DataFrame()
data['Text'] = new_texts
data['Category'] = category

In [None]:
# Bag-of-words features from the most frequent words across all documents.
num_common_words = 100
common_text = [word for text in data.Text for word in text]
# most_common may return fewer than num_common_words entries when the corpus
# vocabulary is small, so everything below iterates over common_words itself.
common_words = [word for word, _count in Counter(common_text).most_common(num_common_words)]

word_counts = bow_features(data, common_words)
# bow_features returns texts and labels first; the count rows start at index 2.
features = pd.DataFrame(
    {word: word_counts[position + 2] for position, word in enumerate(common_words)},
    index=data.index,
)
# Attach all count columns at once; selecting only Text/Category first keeps
# the cell idempotent when it is re-run.
data = pd.concat([data[['Text', 'Category']], features], axis=1)

In [None]:
# Columns 0 and 1 are Text and Category; everything after is a word count.
X = data.iloc[:, 2:]
Y = data['Category']

# Fixed seeds so the reported scores can be reproduced on a re-run.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
print(num_common_words, 'Words used in Bag of Words')

rfc = RandomForestClassifier(n_estimators=100, random_state=0)
rfc.fit(X_train, y_train)
print('Random Forest Score:', rfc.score(X_test, y_test))

for n_neighbors in (3, 13, 21):
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    # Accuracy from the predictions already made, instead of predicting again
    # inside knn.score.
    print('KNN {} neighbors:'.format(n_neighbors), np.mean(y_pred == y_test.values))

In [None]:
# Start again from the token lists so earlier feature columns do not leak in.
data = pd.DataFrame()
data['Text'] = new_texts
data['Category'] = category

num_common_words = 1000
common_text = [word for text in data.Text for word in text]
# most_common may return fewer than num_common_words entries when the corpus
# vocabulary is small, so everything below iterates over common_words itself.
common_words = [word for word, _count in Counter(common_text).most_common(num_common_words)]

word_counts = bow_features(data, common_words)
# bow_features returns texts and labels first; the count rows start at index 2.
features = pd.DataFrame(
    {word: word_counts[position + 2] for position, word in enumerate(common_words)},
    index=data.index,
)
# Min-max scale every column to [0, 1]. The divisor is the column range
# (max - min); a constant column has range 0 and is mapped to 0 instead of
# dividing by zero.
column_min = features.min()
column_range = (features.max() - column_min).replace(0, 1)
features = (features - column_min) / column_range
# Attach all feature columns in one concat instead of inserting one by one.
data = pd.concat([data[['Text', 'Category']], features], axis=1)

# Columns 0 and 1 are Text and Category; everything after is a feature.
X = data.iloc[:, 2:]
Y = data['Category']

# Fixed seeds so the reported scores can be reproduced on a re-run.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
print(num_common_words, 'Words used in Bag of Words')

rfc = RandomForestClassifier(n_estimators=100, random_state=0)
rfc.fit(X_train, y_train)
print('Random Forest Score:', rfc.score(X_test, y_test))

for n_neighbors in (3, 13, 21):
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    # Accuracy from the predictions already made, instead of predicting again
    # inside knn.score.
    print('KNN {} neighbors:'.format(n_neighbors), np.mean(y_pred == y_test.values))

In [None]:
# Start again from the token lists so earlier feature columns do not leak in.
data = pd.DataFrame()
data['Text'] = new_texts
data['Category'] = category

num_common_words = 10000
common_text = [word for text in data.Text for word in text]
# most_common may return fewer than num_common_words entries when the corpus
# vocabulary is small, so everything below iterates over common_words itself.
common_words = [word for word, _count in Counter(common_text).most_common(num_common_words)]

word_counts = bow_features(data, common_words)
# bow_features returns texts and labels first; the count rows start at index 2.
features = pd.DataFrame(
    {word: word_counts[position + 2] for position, word in enumerate(common_words)},
    index=data.index,
)
# Min-max scale every column to [0, 1]. The divisor is the column range
# (max - min); a constant column has range 0 and is mapped to 0 instead of
# dividing by zero.
column_min = features.min()
column_range = (features.max() - column_min).replace(0, 1)
features = (features - column_min) / column_range
# Attach all feature columns in one concat instead of inserting one by one.
data = pd.concat([data[['Text', 'Category']], features], axis=1)

# Columns 0 and 1 are Text and Category; everything after is a feature.
X = data.iloc[:, 2:]
Y = data['Category']

# Fixed seeds so the reported scores can be reproduced on a re-run.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
print(num_common_words, 'Words used in Bag of Words')

rfc = RandomForestClassifier(n_estimators=100, random_state=0)
rfc.fit(X_train, y_train)
print('Random Forest Score:', rfc.score(X_test, y_test))

for n_neighbors in (3, 13, 21):
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    # Accuracy from the predictions already made, instead of predicting again
    # inside knn.score.
    print('KNN {} neighbors:'.format(n_neighbors), np.mean(y_pred == y_test.values))

In [None]:
# KNN overfits with a larger set of common words. 2000 words is best, and fewer neighbors seem to be better as well
# We would need to normalize and maybe use tfidf instead of BOW to get better results

In [None]:
# Unsupervised Summarization:

In [4]:
# Rebuild one string per document from its token list; every token is
# followed by a single space (so each non-empty string ends with a space).
new_texts_combined = [
    ''.join(token + ' ' for token in tokens)
    for tokens in new_texts
]

In [5]:
# Learn the TF-IDF vocabulary and weights from the combined documents, then
# encode those same documents as a document-term matrix.
vectorizer = TfidfVectorizer(stop_words='english')
vectorizer.fit(new_texts_combined)
texts_tfidf = vectorizer.transform(new_texts_combined)

In [6]:
# Vocabulary terms in column order of the TF-IDF matrix. Newer scikit-learn
# releases only provide get_feature_names_out(); older ones only provide
# get_feature_names(), so use whichever exists.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = list(vectorizer.get_feature_names_out())
else:
    terms = vectorizer.get_feature_names()
# Number of topics extracted by each decomposition below.
ntopics = 20

def word_topic(tfidf, solution, wordlist):
    """Project a document-topic solution back onto the vocabulary.

    Parameters
    ----------
    tfidf : matrix of shape (n_documents, n_terms)
        Document-term weights; may be a scipy sparse matrix or a dense array.
    solution : array of shape (n_documents, n_topics)
        Per-document topic loadings.
    wordlist : sequence
        Term labels used as the index of the result, one per column of `tfidf`.

    Returns
    -------
    pandas.DataFrame
        One row per term and one column per topic, holding `tfidf.T @ solution`.
    """
    # Use the explicit matrix-product operator: `*` is only a matrix product
    # for scipy sparse matrices and means element-wise multiply on dense arrays.
    words_by_topic = tfidf.T @ solution
    components = pd.DataFrame(words_by_topic, index=wordlist)
    return components

def top_words(components, n_top_words):
    """Return the highest-loading words of every topic.

    `components` has one row per word and one column per topic. The result is
    a list with one Series per topic column, each holding the first
    `n_top_words` entries of that column sorted in descending order.
    """
    return [
        components.iloc[:, topic_index].sort_values(ascending=False).iloc[:n_top_words]
        for topic_index in range(components.shape[1])
    ]

# How many words to keep per topic.
n_top_words = 10


In [7]:
# LSA

from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

# Truncated SVD followed by row normalisation of the reduced vectors.
# random_state is pinned, like the LDA and NMF models, so the extracted
# topics are reproducible across runs.
svd = TruncatedSVD(ntopics, random_state=0)
lsa = make_pipeline(svd, Normalizer(copy=False))
texts_lsa = lsa.fit_transform(texts_tfidf)

components_lsa = word_topic(texts_tfidf, texts_lsa, terms)

# topwords[0] holds the LSA result; the list is recreated here so re-running
# this cell starts the comparison from a clean state.
topwords = [top_words(components_lsa, n_top_words)]


In [8]:
# LDA
from sklearn.decomposition import LatentDirichletAllocation as LDA

# NOTE(review): the model is fitted on TF-IDF weights; LDA is usually fitted
# on raw term counts - confirm this is intended.
lda = LDA(n_components=ntopics,
          doc_topic_prior=None, # None lets scikit-learn use its default prior (documented as 1/n_components)
          topic_word_prior=1/ntopics,
          learning_decay=0.1, # Learning-rate control; per the scikit-learn docs only used by the online learning method
          learning_offset=10.0, # Down-weights early iterations; per the scikit-learn docs only used by the online learning method
          max_iter=100, # when to stop even if the model is not converging (to prevent running forever)
          evaluate_every=-1, # Do not evaluate perplexity, as it slows training time.
          mean_change_tol=0.001, # Stop updating the document topic distribution in the E-step when mean change is < tol
          max_doc_update_iter=100, # When to stop updating the document topic distribution in the E-step even if tol is not reached
          n_jobs=-1, # Use all available CPUs to speed up processing time.
          verbose=0, # amount of output to give while iterating
          random_state=0
         )

texts_lda = lda.fit_transform(texts_tfidf)

components_lda = word_topic(texts_tfidf, texts_lda, terms)

# Store the LDA result in slot 1. Slice assignment appends when only the LSA
# entry exists and overwrites slot 1 otherwise, so re-running this cell does
# not grow the list or shift the other methods' entries.
topwords[1:2] = [top_words(components_lda, n_top_words)]


In [9]:
# NNMF

from sklearn.decomposition import NMF

# The explicit alpha=0.0 argument was dropped: it matched the default (no
# regularisation) and the keyword no longer exists in newer scikit-learn.
nmf = NMF(init='nndsvdar', # how starting value are calculated
          l1_ratio=0.0, # Sets whether regularization is L2 (0), L1 (1), or a combination (values between 0 and 1)
          max_iter=200, # when to stop even if the model is not converging (to prevent running forever)
          n_components=ntopics,
          random_state=0,
          solver='cd', # Use Coordinate Descent to solve
          tol=0.0001, # tolerance of the stopping condition
          verbose=0 # amount of output to give while iterating
         )
texts_nmf = nmf.fit_transform(texts_tfidf)

components_nmf = word_topic(texts_tfidf, texts_nmf, terms)

# Store the NMF result in slot 2. Slice assignment appends when the list has
# exactly the LSA and LDA entries and overwrites slot 2 otherwise, so
# re-running this cell does not grow the list.
topwords[2:3] = [top_words(components_nmf, n_top_words)]

In [12]:
# Print the top words of each topic side by side: LSA, LDA, NNMF.
for topic in range(ntopics):
    print('Topic {}:'.format(topic))
    print('   ---   ---   ---   ')
    print('LSA\t\t LDA\t\t NNMF')
    lsa_words = topwords[0][topic].index
    lda_words = topwords[1][topic].index
    nmf_words = topwords[2][topic].index
    # Walk the stored word lists themselves rather than a fixed count of 10,
    # so the table follows whatever n_top_words produced.
    for lsa_word, lda_word, nmf_word in zip(lsa_words, lda_words, nmf_words):
        # Short words get an extra tab so the columns stay roughly aligned.
        tab1 = '\t\t' if len(lsa_word) < 7 else '\t'
        tab2 = '\t\t' if len(lda_word) < 6 else '\t'

        print(lsa_word, tab1, lda_word, tab2, nmf_word)
    print('\n')

Topic 0:
   ---   ---   ---   
LSA		 LDA		 NNMF
people 		 good 		 fbi
good 		 use 		 koresh
time 		 look 		 people
use 		 people 	 batf
say 		 work 		 say
work 		 time 		 start
want 		 mail 		 child
look 		 file 		 government
right 		 new 		 compound
new 		 window 	 gas


Topic 1:
   ---   ---   ---   
LSA		 LDA		 NNMF
window 		 new 		 space
file 		 good 		 time
drive 		 work 		 work
card 		 use 		 people
program 	 mail 		 use
run 		 look 		 thing
use 		 need 		 year
disk 		 time 		 good
software 	 people 	 problem
driver 		 window 	 say


Topic 2:
   ---   ---   ---   
LSA		 LDA		 NNMF
game 		 use 		 team
team 		 good 		 player
year 		 work 		 win
player 		 look 		 game
play 		 people 	 year
win 		 time 		 play
fan 		 want 		 good
hockey 		 problem 	 fan
baseball 	 mail 		 season
season 		 say 		 hockey


Topic 3:
   ---   ---   ---   
LSA		 LDA		 NNMF
key 		 people 	 key
government 	 good 		 chip
chip 		 new 		 encryption
encryption 	 look 		 clipper
clipper 	 use 		 government
gun 	

Ground truth topics:
talk.politics.mideast
comp.graphics
talk.politics.guns
talk.politics.misc
talk.religion.misc
comp.sys.mac.hardware
comp.windows.x
misc.forsale
rec.autos
rec.motorcycles
rec.sport.baseball
rec.sport.hockey
sci.crypt
sci.electronics
sci.med
sci.space
soc.religion.christian
comp.os.ms-windows.misc
comp.sys.ibm.pc.hardware
alt.atheism

LSA
Topic 0: These are common words that don't seem to have a theme. spread: 20
1: definitely computer related, there are 5 computer topics so could be any. 5
2: sports related. There are 2 sports categories and it seems to overlap between both. 2
3: Politics guns? also has encryption so could be politics misc or maybe some bleeding into a computer topic or science electronics. Maybe just sci.crypt? 4.
4: electronics, hardware?, comp graphics? 5.
5: god mixed in with electronics. 8.
6: misc for sale! Very clear 1 topic. 1.
7: Weird mix of topics here 10.
8: Another mix. 10.
9: politics, mideast for sure. 1.
10: Mix of cars and mideast politics 2.
11: mideast politics. 1.
12: cars and software. 3.
 -- overall a lot of topics seem to be a mix of a few ground truth topics --
 
 LDA
 Awful algorithm, maybe I should try tweaking the parameters

NNMF
0: Politics misc?
1: sci space
2: sport hockey
3: sci crypt
4: sci electronics
5: soc religion christian
6: rec autos
7: comp graphics
8: alt atheism or talk religion misc
9: talk politics misc
10: comp os ms-windows
11: talk politics mideast
12: talk politics guns
13: comp windows x??
14: might be a mix of different articles
15: mac hardware
16: sci med
17: rec motorcycles
18: homosexuality topics, might be in a couple spots
19: rec sport baseball

NNMF is blowing the other algorithms out of the water!!!