In [53]:
import os
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

import numpy as np
import spacy
import re
import string
import logging
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize
from nltk import pos_tag
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Emit INFO-and-above log records formatted as "timestamp : level : message".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [54]:
# Location of the pickled comments DataFrame, relative to the notebook's working directory.
data_file = "../processed/comments.pkl"
# read_pickle unpickles the file; only load pickles that come from a trusted source.
df = pd.read_pickle(data_file)
# Preview the first rows (the cell output shows a single `comment` column of raw text).
df.head()

Unnamed: 0,comment
0,That was another administration.
1,Why is the ex CEO of Monsanto appointed head o...
2,"because he greased the palms of our ""elected"" ..."
3,the photo shown above is from a feed lot in Ca...
4,THAT IS NOT GRASS FED MEET! NO GRASS THERE! YO...


In [55]:
def clean_text(text):
    """Normalise a raw comment string.

    The text is lower-cased, then three substitutions are applied in order,
    each deleting its matches:

    1. anything enclosed in square brackets (brackets included),
    2. every ASCII punctuation character from ``string.punctuation``,
    3. every word that contains at least one digit.

    Whitespace is left untouched, so removed words may leave double spaces.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    removal_patterns = (
        r'\[.*?\]',
        r'[%s]' % re.escape(string.punctuation),
        r'\w*\d\w*',
    )
    cleaned = text.lower()
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned
#df_clean = pd.DataFrame(df.comment.apply(lambda x: clean_text(x)))
#df_clean.head()

In [56]:
def nouns_only(text):
    """Keep only the tokens of `text` that NLTK tags as 'NN' or 'NNS'.

    Args:
        text: the string to filter.

    Returns:
        The surviving tokens, in their original order, joined by single spaces.
    """
    noun_tags = ['NN', 'NNS']
    tagged_tokens = pos_tag(word_tokenize(text))
    return " ".join(pair[0] for pair in tagged_tokens if pair[1] in noun_tags)

#df_nouns = pd.DataFrame(df_clean.comment.apply(lambda x: nouns_only(x)))
#df_nouns.head()

In [57]:
# Load spaCy's English pipeline once; lemmatizer() and remove_stopwords() both use it.
# The 'en' shortcut link only exists on spaCy 2.x. spaCy 3 removed shortcut links and
# raises OSError for them, so fall back to the full package name in that case.
# NOTE(review): assumes 'en' was linked to en_core_web_sm (the default for
# `python -m spacy download en`) — confirm if a different model was linked.
try:
    nlp = spacy.load('en')
except OSError:
    nlp = spacy.load('en_core_web_sm')
def lemmatizer(text):
    """Replace every token of `text` with its spaCy lemma.

    Args:
        text: the string to lemmatize; it is parsed with the module-level `nlp` pipeline.

    Returns:
        The lemmas of all tokens, in order, joined by single spaces.
    """
    return " ".join(token.lemma_ for token in nlp(text))

#df_clean = pd.DataFrame(df_clean.comment.apply(lambda x: lemmatizer(x)))
#df_clean['comment'] = df_clean['comment'].str.replace('-pron-', '')
#df_clean.head()


In [58]:
# Cache for the NLTK stop-word set so the corpus file is read once, not once per comment.
_stop_word_cache = {}


def _english_stop_words():
    """Return NLTK's English stop words as a set, loading the corpus only on first use."""
    if 'english' not in _stop_word_cache:
        _stop_word_cache['english'] = set(stopwords.words('english'))
    return _stop_word_cache['english']


def remove_stopwords(text):
    """Drop English stop words from `text`.

    The text is split into tokens with the module-level spaCy `nlp` object and
    every token whose text appears in NLTK's English stop-word list is removed.

    Args:
        text: the string to filter.

    Returns:
        The remaining token texts, in order, joined by single spaces.
    """
    stop_words = _english_stop_words()
    # Only the token texts are needed, so run just the tokenizer (make_doc)
    # instead of the whole pipeline; the tagger/parser/NER output was never used here.
    doc = nlp.make_doc(text)
    # NOTE(review): the membership test is case-sensitive. NLTK's list appears to be
    # all lower-case, so capitalised stop words ("The") survive unless the text was
    # lower-cased first — confirm whether that is intended.
    return " ".join(word.text for word in doc if word.text not in stop_words)

#df_clean_stopwords = pd.DataFrame(df_clean.comment.apply(lambda x: remove_stopwords(x)))
#df_clean_stopwords.head()

In [61]:
# Directory (relative to the working directory) holding the pickled intermediate results.
processed_path = "../processed"


def save_file(df_comment, name):
    """Pickle `df_comment` to ``<processed_path>/<name>``.

    The target directory is created first if it does not exist yet.

    Args:
        df_comment: the object to persist (a DataFrame in this notebook).
        name: file name inside `processed_path`.

    Returns:
        None.
    """
    os.makedirs(processed_path, exist_ok=True)
    target = os.path.join(processed_path, name)
    with open(target, 'wb') as handle:
        pickle.dump(df_comment, handle)


def open_file(name):
    """Unpickle and return the object stored at ``<processed_path>/<name>``.

    pickle can execute arbitrary code while loading, so only open files that
    were written by a trusted source (e.g. `save_file`).

    Args:
        name: file name inside `processed_path`.

    Returns:
        The unpickled object.
    """
    source = os.path.join(processed_path, name)
    with open(source, 'rb') as handle:
        return pickle.load(handle)

#df_comment = save_file(df_clean,'lemmatize_clean_comments.pkl')

In [60]:
def preprocess(text, clean=True, noun=True, lemmatize=True, stopword=True):
    """Run the enabled preparation steps over `text` and return the result.

    Steps always run in this fixed order, each one feeding the next:
    clean_text -> nouns_only -> remove_stopwords -> lemmatizer.
    Note that stop words are removed *before* lemmatization, regardless of the
    order of the keyword arguments.

    Args:
        text: the string to process.
        clean: apply `clean_text` when true.
        noun: apply `nouns_only` when true.
        lemmatize: apply `lemmatizer` when true.
        stopword: apply `remove_stopwords` when true.

    Returns:
        The processed string; `text` unchanged when every flag is false.
    """
    text = clean_text(text) if clean else text
    text = nouns_only(text) if noun else text
    text = remove_stopwords(text) if stopword else text
    text = lemmatizer(text) if lemmatize else text
    return text

#cc = 'I have a dream, I got everything I want it. When she was just a girls, she expects to the world'
#cc1 = preprocess(cc,clean=True, noun=True, lemmatize=True, stopword=True)
#print(cc1)

#df_noun_lematize = pd.DataFrame(df.comment.apply(lambda x: preprocess(x,clean=False, noun=True, lemmatize=True, stopword=False)))
#df_comment = save_file(df_noun_lematize,'noun_lematize_comments.pkl')
#df_noun_lematize_stopword = pd.DataFrame(df.comment.apply(lambda x: preprocess(x,clean=False, noun=True, lemmatize=True, stopword=True)))
#df_comment1 = save_file(df_noun_lematize_stopword,'noun_lematize_stopword_comments.pkl')
#df_clean_lematize_stopword = pd.DataFrame(df.comment.apply(lambda x: preprocess(x,clean=True, noun=False, lemmatize=True, stopword=True)))
#df_comment2 = save_file(df_clean_lematize_stopword,'clean_lematize_stopword_comments.pkl')

In [75]:
# Load a previously pickled, preprocessed comment set from `processed_path`.
# The file name matches the one written by the (commented-out) clean + lemmatize +
# stop-word preprocessing run, so that run must have been executed at least once.
df_comment = open_file('clean_lematize_stopword_comments.pkl')

In [76]:
# maximum vocabulary size handed to the vectorizer (max_features)
n_features = 4000

# number of topics/aspects
n_topics = 10

# number of top keywords listed per topic
n_top_words = 10

# ignore terms that have a document frequency strictly higher than 99%,
# ignore terms that have a document frequency strictly lower than 2
tfidf_vectorizer = TfidfVectorizer(max_df=0.99, min_df=2,
                                   max_features=n_features,
                                   stop_words='english')
# astype(str) coerces every entry of the column to a string before vectorizing
# (presumably to guard against non-string values such as NaN — confirm).
tfidf = tfidf_vectorizer.fit_transform(df_comment['comment'].values.astype(str))
# Show the sparse TF-IDF row of the document at index 1 as a sanity check.
print(tfidf[1])

  (0, 1369)	0.31446509274101964
  (0, 1660)	0.3639240316816441
  (0, 183)	0.5057417371243024
  (0, 2320)	0.25814526621515405
  (0, 550)	0.42892423364130633
  (0, 1266)	0.5121340871962907


In [77]:
# Factorise the TF-IDF matrix into n_topics non-negative components.
# alpha=0 means no regularization, l1_ratio=.5, the penalty is a combination of L1 and L2
# NOTE(review): newer scikit-learn releases replace `alpha` with `alpha_W` / `alpha_H` —
# confirm against the installed version before upgrading.
nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5)
# fit_transform() fits the model and returns the document-topic matrix in one pass.
# The model used to be fitted with .fit() first and then re-fitted here; with a fixed
# random_state both fits are identical, so the extra one only doubled the run time.
nmf_output = nmf.fit_transform(tfidf)



In [78]:
def show_topics(vectorizer=None, lda_model=None, n_top_words=10):
    """List the highest-weighted vocabulary terms of every topic.

    Args:
        vectorizer: fitted vectorizer exposing `get_feature_names_out()` (or the
            older `get_feature_names()`). Defaults to the notebook's
            `tfidf_vectorizer`, looked up when the function is called.
        lda_model: fitted decomposition model exposing `components_`, one row of
            term weights per topic. Defaults to the notebook's `nmf`, looked up
            when the function is called.
        n_top_words: how many terms to return per topic.

    Returns:
        A list with one NumPy array per topic, each holding that topic's
        `n_top_words` terms ordered from highest to lowest weight.
    """
    # Resolve the defaults at call time rather than at definition time, so that
    # re-running the vectorizer / NMF cells is picked up without re-running this cell.
    if vectorizer is None:
        vectorizer = tfidf_vectorizer
    if lda_model is None:
        lda_model = nmf

    # scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
    # get_feature_names(); prefer the new name and fall back for older releases.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    keywords = np.asarray(get_names())

    topic_keywords = []
    for topic_weights in lda_model.components_:
        # Negating the weights makes argsort return indices in descending weight order.
        top_keyword_locs = np.argsort(-np.asarray(topic_weights))[:n_top_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

# Collect the top n_top_words terms of every NMF topic.
topic_keywords = show_topics(vectorizer=tfidf_vectorizer, lda_model=nmf, n_top_words=n_top_words)

# Topic - Keywords Dataframe
# One row per topic, one column per keyword rank (column 0 holds the highest-weighted term).
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]

# NOTE(review): the disabled theme list below has 20 entries while n_topics is 10, and
# its labels do not describe the keywords shown in this notebook's output; assigning it
# as a column as-is would fail with a length mismatch. Re-label before re-enabling.
# Topics_theme = ['Word start from ph', 'People/Friend/Relationship', 'Life/Experience/Love/Purpose', 'Money/Internet/Business',
#                 'Weekend/Parent/Child', 'Leisure time', 'Language/technique/software', 'Relationship/Girl/Boy',
#                 'Business relate to India, China or Pakistan', 'Friend/Love/Relationship', 'Difference and similarity/Language/Engineering',
#                 'Culture, travel and visa requirements in several countries', 'Tips on working as software engineering', 'Book/Movie/Class/History/Physics/Chemistry/Science',
#                 'Software engineer job opportunitis in Canada', 'Love/Life/Relationship', 'World/War/Language/History', 'Day/Hour/Week/Month/Sex/Place', 'School/Student/College/University',
#                 'Question/Answer/Quora/Interview']
# df_topic_keywords['topic_theme'] = Topics_theme
# df_topic_keywords.set_index('topic_theme', inplace=True)
# df_topic_keywords.T

# Display the topic/keyword table as the cell output.
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9
Topic 0,people,know,make,like,use,think,say,pron,need,time
Topic 1,organic,pesticide,use,farming,produce,mean,farm,conventional,certify,farmer
Topic 2,thank,share,post,info,information,great,article,love,god,sign
Topic 3,buy,local,product,stop,farmer,egg,store,market,anymore,brand
Topic 4,food,grow,store,healthy,process,health,shop,price,label,real
Topic 5,eat,meat,animal,stop,chicken,healthy,egg,diet,vegan,feed
Topic 6,yes,vote,right,oh,course,love,hell,answer,question,boycott
Topic 7,good,taste,thing,news,luck,point,idea,way,job,bad
Topic 8,milk,cow,raw,drink,dairy,almond,baby,love,calf,human
Topic 9,gmo,label,corn,monsanto,non,product,crop,seed,free,gmos
