# Data Cleaning and ML

In [2]:
# Number of documents (taken from the start of the corpus) to process
NUM_DOCS = 1000
# Degree of parallelism requested from the NLP pipeline
NUM_THREADS = 4
# Floor division keeps the batch size an integer (true division would give
# 250.0); max(1, ...) avoids a zero batch size when NUM_DOCS < NUM_THREADS.
BATCH_SIZE = max(1, NUM_DOCS // NUM_THREADS)

In [3]:
import gensim.downloader as api

# Fetch the "20-newsgroups" dataset through gensim's downloader.
# The data is stored locally, and it is not downloaded again if it is
# already present on this machine.
# NOTE(review): `dataset` is not referenced by the later cells shown in this
# notebook (they read the downloaded .gz file directly) - confirm whether the
# call is only kept for its download side effect.
dataset = api.load("20-newsgroups")

## Pre-processing

### Load Data

In [4]:
import json
import gzip
from pathlib import Path

# The gensim downloader is expected to have placed the archive under
# ~/gensim-data; build the path to the gzipped file from the home directory.
home = str(Path.home())
dataPath = home + "/gensim-data/20-newsgroups/20-newsgroups.gz"

# Each line of the archive is parsed as one JSON document. The `with` block
# closes the file even if a line fails to parse.
with gzip.open(dataPath) as file:
    corpusJson = [json.loads(line) for line in file]

### Extract data

In [5]:
# Pull the raw text ("data") and the topic label ("topic") out of every
# JSON record; both lists keep the record order of corpusJson.
corpus = [record["data"] for record in corpusJson]
corpusTopics = [record["topic"] for record in corpusJson]

### Clean Data (lemmatization, stop word removal)

In [11]:
%%time
import spacy

def addStopWords(stopWords, nlp):
    """Flag every word in `stopWords` as a stop word in `nlp`'s vocabulary.

    Each word is looked up in `nlp.vocab` and the returned entry gets
    `is_stop = True`. Nothing is returned.
    """
    for extraStopWord in stopWords:
        nlp.vocab[extraStopWord].is_stop = True
        
def filterWords(word):
    """Decide whether a token is kept: it must be alphabetic and not a stop word."""
    alphabetic = word.is_alpha
    if not alphabetic:
        # Non-alphabetic tokens are rejected without consulting is_stop
        return alphabetic
    return not word.is_stop

def convertWords(word):
    """Return the token's lemma (`word.lemma_`) in lower case."""
    lemma = word.lemma_
    return lemma.lower()

def cleanDoc(doc):
    """Return the list of converted words for the tokens of `doc` that pass filterWords.

    Tokens are visited in order; each one accepted by `filterWords` is
    mapped through `convertWords`.
    """
    return [convertWords(token) for token in doc if filterWords(token)]

# Load the "en" model with the tagger, parser, NER and text-categorizer
# components disabled.
nlp = spacy.load("en", disable=['tagger', 'parser', 'ner', 'textcat'])
# Currently adding no other stop words
addStopWords([], nlp)

# Run the first NUM_DOCS raw documents through the pipeline and keep the
# resulting Doc objects.
# NOTE(review): `n_threads` is reported as deprecated in newer spaCy
# releases - confirm against the installed version.
nlpCorpus = list(
    nlp.pipe(corpus[:NUM_DOCS], n_threads=NUM_THREADS, batch_size=BATCH_SIZE)
)

# One list of cleaned, lower-cased lemmas per document
cleanCorpus = [cleanDoc(parsedDoc) for parsedDoc in nlpCorpus]

CPU times: user 8.56 s, sys: 54.4 ms, total: 8.62 s
Wall time: 8.63 s


In [19]:
# Peek at the first cleaned document, then at its first token
firstCleanDoc = cleanCorpus[0]
print(firstCleanDoc)
print(firstCleanDoc[0])

['from', 'andrew', 'byler', 'subject', 're', 'serbian', 'genocide', 'work', 'god', 'organization', 'freshman', 'civil', 'engineering', 'carnegie', 'mellon', 'pittsburgh', 'pa', 'lines', 'vera', 'shanti', 'noyes', 'write', 'indicate', 'believe', 'predestination', 'correct', 'believe', 'predestination', 'believe', 'choose', 'accept', 'god', 'gift', 'salvation', 'fundamental', 'difference', 'resolve', 'of', 'course', 'i', 'believe', 'predestination', '-pron-', 'biblical', 'doctrine', 'roman', 'show', 'passage', 'furthermore', 'church', 'teach', 'predestination', 'begin', 'but', 'i', 'believe', 'predestination', 'mean', 'i', 'believe', 'free', 'men', 'freely', 'choose', 'course', 'life', 'affect', 'grace', 'god', 'however', 'unlike', 'calvinists', 'jansenists', 'i', 'hold', 'grace', 'resistable', 'end', 'idiocy', 'deny', 'universal', 'save', 'god', 'timothy', 'for', 'god', 'grace', 'save', 'but', 'elect', 'foreknow', 'predestine', 'receive', 'grace', 'final', 'perserverance', 'guarantee', 

### Add bigrams

In [6]:
%%time
from gensim.models.phrases import Phrases

# Phrase detection currently runs with Phrases' default settings.
# Candidate overrides that still need experimenting with:
#   minCount = 1
#   threshold = 1
#   bigram = Phrases(cleanCorpus, min_count=minCount, threshold=threshold)
bigram = Phrases(cleanCorpus)
# Apply the fitted phrase model to every cleaned document
cleanBigramCorpus = list(map(lambda doc: bigram[doc], cleanCorpus))

CPU times: user 735 ms, sys: 8.37 ms, total: 743 ms
Wall time: 742 ms


### Create BOW and tfidf

In [7]:
%%time
from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel

# Build the token <-> id mapping from the bigram-augmented corpus, then
# encode every document as a bag-of-words vector via doc2bow.
dictionary = Dictionary(cleanBigramCorpus)
bow = [dictionary.doc2bow(doc) for doc in cleanBigramCorpus]

# Fit TF-IDF on the bag-of-words corpus and re-weight each document
tfidfModel = TfidfModel(bow)  # fit model
tfidf = [tfidfModel[doc] for doc in bow]

# NOTE: despite its name, `doc2Vec` holds the same bag-of-words vectors as
# `bow` (not Doc2Vec embeddings). Copy them from `bow` instead of running
# doc2bow over the whole corpus a second time; the copy keeps `doc2Vec`
# an independent list of lists, as before.
doc2Vec = [list(doc) for doc in bow]

CPU times: user 692 ms, sys: 9.78 ms, total: 702 ms
Wall time: 701 ms


### Train LDA model

In [8]:
%%time
from gensim.models import LdaModel

# Train a 10-topic LDA model on the bag-of-words corpus. LDA training is
# stochastic, so the seed is pinned to make the topics reproducible on
# Restart & Run All.
ldamodel = LdaModel(bow, num_topics=10, id2word=dictionary, random_state=42)

CPU times: user 2.2 s, sys: 88.4 ms, total: 2.29 s
Wall time: 1.7 s


### Perform Clustering for Validation

In [9]:
%%time
from sklearn.cluster import KMeans
import numpy as np

# Turn the sparse (token_id, count) documents into a dense matrix with one
# row per document and one column per token id.
#
# The width must be (largest token id + 1); using the largest id itself as
# the width would silently drop the last column. Scanning every pair, with
# default=-1, also copes with documents that ended up empty after cleaning:
# they simply become all-zero rows.
maxLength = max((tokenId for doc in bow for tokenId, _ in doc), default=-1) + 1

clusterData = np.zeros((len(bow), maxLength))
for row, doc in enumerate(bow):
    for tokenId, count in doc:
        clusterData[row, tokenId] = count

# Cluster the dense document vectors into 20 groups. The seed is pinned so
# that the cluster assignments (and any score computed from them) are
# reproducible on Restart & Run All.
# The per-document assignments are available as kmeans.labels_.
kmeans = KMeans(n_clusters=20, random_state=42).fit(clusterData)

CPU times: user 22.6 s, sys: 1.14 s, total: 23.8 s
Wall time: 13 s


### Verify Labels

In [10]:
%%time
from sklearn.metrics import adjusted_rand_score

# Adjusted Rand score between the k-means cluster assignments and the topic
# labels of the same first NUM_DOCS documents
clusterLabels = kmeans.labels_
topicLabels = corpusTopics[:NUM_DOCS]
score = adjusted_rand_score(clusterLabels, topicLabels)
print(score)

-0.000470476394326084
CPU times: user 2.5 ms, sys: 1.24 ms, total: 3.73 ms
Wall time: 2.61 ms


In [11]:
# Sanity check: score a labelling against the same grouping expressed with
# string labels instead of integers.
label1 = [1, 1, 1, 2, 3, 4, 4, 4, 5]
label2 = [str(value) for value in label1]

adjusted_rand_score(label1, label2)

1.0