# Data Cleaning and ML

## Download Data

In [98]:
from sklearn.datasets import fetch_20newsgroups

# Fetch the train and test splits with identical options; `remove` asks the
# loader to strip headers, footers and quoted replies from every post.
newsTrain, newsTest = (
    fetch_20newsgroups(
        subset=split,
        remove=('headers', 'footers', 'quotes'),
        download_if_missing=True,
    )
    for split in ('train', 'test')
)

## Preprocess Data

### Create an Example Bag-of-Words (BOW) Vectorizer

In [99]:
from nltk import download
# Fetch the NLTK resources this notebook relies on: the 'stopwords' corpus
# (read below) and the 'punkt' resource used by NLTK's tokenizers.
download('stopwords')
download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer


# English stop words, stored as a set for membership tests in filterWords.
stop = set(stopwords.words('english'))

def filterWords(word):
    """Decide whether a token should be kept.

    Args:
        word: a single token (string).

    Returns:
        True when the token is not in the module-level ``stop`` set and
        consists solely of alphabetic characters; False otherwise.
    """
    # Stop words are rejected first; everything else is kept only if it is
    # purely alphabetic (so numbers and punctuation tokens are dropped).
    if word in stop:
        return False
    return word.isalpha()

def customTokenizer(doc):
    """Split a document into word tokens and keep those accepted by filterWords.

    Args:
        doc: the document text to tokenize.

    Returns:
        A list of tokens, in document order, for which ``filterWords``
        returned True.
    """
    kept = []
    # Sentence-split first, then word-tokenize each sentence.
    for sentence in sent_tokenize(doc):
        for token in word_tokenize(sentence):
            if filterWords(token):
                kept.append(token)
    return kept

# Unigram bag-of-words vectorizer that tokenizes documents with customTokenizer.
nltkCountVectorizer = CountVectorizer(
    tokenizer=customTokenizer,
    ngram_range=(1, 1),
    max_df=.5,  # ignore terms present in more than half of the documents
    min_df=5,   # ignore terms present in fewer than 5 documents
)

In [None]:
%%time
from sklearn.feature_extraction.text import TfidfTransformer
from gensim.matutils import Sparse2Corpus
from gensim.models import LdaModel

def preprocessData(vectorizer, data):
    """Vectorize raw documents and derive TF-IDF weights and an id-to-word map.

    Args:
        vectorizer: a vectorizer object providing ``fit_transform`` and, once
            fitted, a ``vocabulary_`` mapping of word -> column index
            (e.g. a scikit-learn ``CountVectorizer``).
        data: the raw documents to fit the vectorizer on.

    Returns:
        A tuple ``(bow, tfidf, id2word)`` where ``bow`` is the matrix returned
        by ``vectorizer.fit_transform(data)``, ``tfidf`` is that matrix
        re-weighted by a fresh ``TfidfTransformer``, and ``id2word`` maps each
        column index back to its word.
    """
    bow = vectorizer.fit_transform(data)
    tfidf = TfidfTransformer().fit_transform(bow)
    # Invert the vocabulary of the vectorizer that was actually fitted above,
    # not a module-level one, so the mapping always matches the columns of
    # `bow` regardless of which vectorizer the caller passes in.
    id2word = {index: word for word, index in vectorizer.vocabulary_.items()}
    return bow, tfidf, id2word

In [113]:
def trainLDA(docRep, id2word, num_topics=20):
    """Train a gensim LDA model on a sparse document-term matrix.

    Args:
        docRep: sparse matrix with one row per document and one column per
            term (the layout produced by scikit-learn vectorizers).
        id2word: mapping from column index to word, passed through to
            ``LdaModel``.
        num_topics: number of topics to fit; defaults to 20.

    Returns:
        The trained ``LdaModel``.
    """
    # Sparse2Corpus treats columns as documents unless told otherwise, while
    # scikit-learn matrices hold documents in rows, so state the orientation
    # explicitly to keep term ids aligned with `id2word`.
    corpus = Sparse2Corpus(docRep, documents_columns=False)
    # Return the model so callers can actually use what was trained.
    return LdaModel(corpus, num_topics=num_topics, id2word=id2word)

In [None]:
# The object returned by fetch_20newsgroups keeps the raw posts in its `data`
# attribute; slice that list (the container itself cannot be sliced) to get
# a small sample of documents.
bow, tfidf, id2word = preprocessData(nltkCountVectorizer, newsTrain.data[:10])
# One model per document representation: raw counts and TF-IDF weights.
ldaModelBOW = trainLDA(bow, id2word)
ldaModelTFIDF = trainLDA(tfidf, id2word)

In [None]:
from sklearn.cluster import KMeans
import numpy as np

# `bow` is the sparse document-term matrix produced by the vectorizer (one row
# per document). KMeans accepts sparse input, so cluster it directly instead
# of unpacking it as (term id, count) pairs, which a sparse matrix does not
# provide.
# NOTE: KMeans requires at least as many documents as clusters.
clusterData = bow

kmeans = KMeans(n_clusters=20).fit(clusterData)
# kmeans.labels_

In [None]:
from sklearn.metrics import adjusted_rand_score

# Similarity metric between groups: adjusted Rand score between the KMeans
# cluster labels and the first NUM_DOCS entries of corpusTopics.
# NOTE(review): corpusTopics and NUM_DOCS are not defined in any cell visible
# in this notebook — confirm where they come from; on a fresh kernel this
# cell would presumably fail with a NameError.
score = adjusted_rand_score(kmeans.labels_, corpusTopics[:NUM_DOCS])
print(score)

In [122]:
# Inspect which attributes the loaded training dataset object exposes.
dir(newsTrain)

['DESCR', 'data', 'filenames', 'target', 'target_names']

In [128]:
# Two-document toy corpus for trying a CountVectorizer with its default tokenizer.
testData = ['hey all whatsup', 'dawg not too much']
test = CountVectorizer(
    tokenizer=None,
    ngram_range=(1, 1),
    max_df=.5,
    min_df=1,
)
res = test.fit_transform(testData)

<class 'scipy.sparse.csr.csr_matrix'>
<class 'sklearn.feature_extraction.text.CountVectorizer'>
['__abs__', '__add__', '__array_priority__', '__bool__', '__class__', '__delattr__', '__dict__', '__dir__', '__div__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__iadd__', '__idiv__', '__imul__', '__init__', '__init_subclass__', '__isub__', '__iter__', '__itruediv__', '__le__', '__len__', '__lt__', '__matmul__', '__module__', '__mul__', '__ne__', '__neg__', '__new__', '__nonzero__', '__pow__', '__radd__', '__rdiv__', '__reduce__', '__reduce_ex__', '__repr__', '__rmatmul__', '__rmul__', '__rsub__', '__rtruediv__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__sub__', '__subclasshook__', '__truediv__', '__weakref__', '_add_dense', '_add_sparse', '_arg_min_or_max', '_arg_min_or_max_axis', '_binopt', '_boolean_index_to_array', '_check_boolean', '_check_ellipsis', '_cs_matrix__get_has_canonical_format', '_cs_m

In [129]:
# Print the attribute names exposed by the loaded training dataset object.
print(dir(newsTrain))

['DESCR', 'data', 'filenames', 'target', 'target_names']
