# Processing Data

In [1]:
import psycopg2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords, words
from nltk.stem import WordNetLemmatizer

## Outline for data processing as outlined in paper:
 1. Remove Stopwords
 2. Frequency Filtering
 3. Dictionary-based Filtering
 4. Topic Checking
 5. Embedding Expansion
 6. Embedding Filtering
 7. User Usage Filtering

In [2]:
comments = pd.read_csv('../data/comments_general.csv')
comments.dropna(inplace=True)

In [3]:
# NLTK has 
eng_dict = set(words.words())
lemma = WordNetLemmatizer()

In [4]:
comments['text'][0]

'No but HUD really will only help single mothers, pregnant and expecting women, the disabled, and veterans, if you’re not one of those you’re about to have a LONG wait..'

In [129]:
[lemma.lemmatize(word) for word in comments['text'][0].lower().split()]

['no',
 'but',
 'hud',
 'really',
 'will',
 'only',
 'help',
 'single',
 'mothers,',
 'pregnant',
 'and',
 'expecting',
 'women,',
 'the',
 'disabled,',
 'and',
 'veterans,',
 'if',
 'you’re',
 'not',
 'one',
 'of',
 'those',
 'you’re',
 'about',
 'to',
 'have',
 'a',
 'long',
 'wait..']

In [6]:
stops = set(stopwords.words('english'))
# no_stops = comments.apply(lambda x: [for word in x])

In [7]:
cntvec = CountVectorizer(stop_words=stops, strip_accents='unicode')
word_document_matrix = cntvec.fit_transform(comments['text'][:50])
features = cntvec.get_feature_names()

In [99]:
mask = [word not in eng_dict for word in features]
feat_mask = np.array(features)[mask]
word_document_matrix[:, mask]

<50x329 sparse matrix of type '<class 'numpy.int64'>'
	with 329 stored elements in Compressed Sparse Row format>

In [100]:
feat_mask

array(['1st', '3x9', 'accounts', 'arrived', 'balls', 'bombay', 'bored',
       'btw', 'caps', 'cats', 'chillin', 'clasps', 'complaints', 'crows',
       'damaged', 'decor', 'defo', 'died', 'dropped', 'drowned', 'eco',
       'ends', 'etc', 'expecting', 'favourite', 'gazillion', 'gets',
       'gme', 'helps', 'hemming', 'horses', 'hunted', 'hunter', 'inches',
       'instagram', 'intestines', 'iphone', 'keeps', 'kitties', 'knees',
       'liked', 'lives', 'lol', 'looks', 'loved', 'makes', 'meows',
       'michael', 'mothers', 'needed', 'omggggg', 'online', 'op',
       'origional', 'packaged', 'pairs', 'patches', 'pendants', 'phrased',
       'pieces', 'pleased', 'poo', 'posts', 'pouri', 'prices', 'products',
       'realises', 'reminds', 'scott', 'scrubs', 'scrunchie', 'seems',
       'shreds', 'snacks', 'soooo', 'straps', 'subscribed', 'tears',
       'things', 'threads', 'toys', 'treats', 'trekked', 'tubes', 'uhaul',
       'veterans', 'vets', 'wanted', 'women', 'worms'], dtype='<U12

In [56]:
reddit_lda = LatentDirichletAllocation(n_components=10)
reddit_lda.fit(word_document_matrix)


LatentDirichletAllocation()

In [77]:
lda_out = reddit_lda.transform(word_document_matrix)

In [90]:
lda_out.argmax(axis=1)

array([4, 0, 0, 7, 0, 9, 8, 0, 0, 0, 8, 0, 1, 7, 2, 9, 0, 7, 9, 5, 7, 1,
       9, 7, 6, 4, 5, 5, 0, 9, 4, 5, 6, 1, 7, 5, 9, 4, 3, 2, 5, 6, 8, 0,
       4, 4, 3, 9, 9, 3])

In [82]:
reddit_lda.components_

array([[0.1, 0.1, 0.1, ..., 0.1, 0.1, 0.1],
       [0.1, 0.1, 0.1, ..., 0.1, 0.1, 0.1],
       [0.1, 0.1, 0.1, ..., 0.1, 0.1, 0.1],
       ...,
       [0.1, 0.1, 0.1, ..., 0.1, 0.1, 0.1],
       [0.1, 0.1, 0.1, ..., 0.1, 0.1, 0.1],
       [0.1, 0.1, 0.1, ..., 2.1, 0.1, 1.1]])

In [64]:
np.array(features)[np.argmax(reddit_lda.components_, axis=1)]

array(['company', 'cat', 'cat', 'cat', 'cat', 'bird', 'dollar', 'looks',
       'context', 'like'], dtype='<U12')

In [35]:
for i in range(10):
    print(pd.DataFrame(reddit_lda.components_, columns=features).T.sort_values(i, ascending=False)[i][:10])

looks       2.100019
one         1.100002
bought      1.100002
great       1.100002
arrived     1.100001
broken      1.100000
wanted      1.100000
came        1.100000
pendants    1.100000
circular    1.100000
Name: 0, dtype: float64
company       3.100000
swimsuit      2.100001
know          2.100000
meows         2.100000
still         1.100001
considered    1.100000
protect       1.100000
products      1.100000
clasps        1.100000
complaints    1.100000
Name: 1, dtype: float64
cat          3.099867
help         2.100000
good         1.100004
women        1.100000
wait         1.100000
disabled     1.100000
single       1.100000
mothers      1.100000
veterans     1.100000
expecting    1.100000
Name: 2, dtype: float64
cat        2.100036
thought    1.100008
taking     1.100006
better     1.100004
love       1.100001
long       1.100001
straps     1.100000
cut        1.100000
tied       1.100000
time       1.100000
Name: 3, dtype: float64
material    1.100013
way         1.100010
ne

In [130]:
import gensim

In [131]:
d2v = gensim.models.Doc2Vec()

In [10]:
corpus = list(comments['text'])
corpus = [comment.lower().split() for comment in corpus]

In [11]:
corpus[0]

['no',
 'but',
 'hud',
 'really',
 'will',
 'only',
 'help',
 'single',
 'mothers,',
 'pregnant',
 'and',
 'expecting',
 'women,',
 'the',
 'disabled,',
 'and',
 'veterans,',
 'if',
 'you’re',
 'not',
 'one',
 'of',
 'those',
 'you’re',
 'about',
 'to',
 'have',
 'a',
 'long',
 'wait..']