# Testing the Pipeline from Class

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import PorterStemmer
import nltk

from pipeline import NLPPipe, tweet_clean1

from helper_functions import txt_to_df

import pickle
%load_ext autoreload
%autoreload 2

In [None]:
# Silence DeprecationWarning noise from the libraries used below.
import warnings

warnings.simplefilter("ignore", DeprecationWarning)

In [None]:
# Load the previously pickled tweets table (presumably a DataFrame; it is
# indexed by the 'long_text' column below) from the working directory.
# NOTE(review): unpickling can execute arbitrary code - only load pickle
# files that you created yourself or otherwise trust.
all_tweets = pd.read_pickle("all_tweets.pkl")

In [None]:
# Pull the tweet text out of the table as a plain Python list, which is the
# form the pipeline is fed with in the cells below.
corpus_list = all_tweets.loc[:, 'long_text'].tolist()

In [None]:
# Baseline pipeline: default CountVectorizer, Treebank tokenization and
# Porter stemming. No cleaning function is passed on this first attempt.
nlp = NLPPipe(
    vectorizer=CountVectorizer(),
    tokenizer=TreebankWordTokenizer().tokenize,
    stemmer=PorterStemmer(),
)

In [None]:
# Fit the baseline pipeline on the corpus, then transform the same corpus.
# The transform result is not stored here; the trailing `;` only suppresses
# the cell output.
nlp.fit(corpus_list)
nlp.transform(corpus_list);

In [None]:
# Preview the first rows of the document-term matrix.
# Only the first five rows are densified: calling .toarray() on the whole
# sparse matrix just to show .head() allocates documents x vocabulary cells
# and can exhaust memory on a large corpus. .tocsr() guarantees that row
# slicing is supported whatever sparse format transform() returns.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; switch
# to get_feature_names_out() when running on a newer version.
pd.DataFrame(nlp.transform(corpus_list).tocsr()[:5].toarray(),
             columns=nlp.vectorizer.get_feature_names())
# It looks like we have a basic Document Term matrix, but all of the terms shown seem pretty wrong.
# A good first step would be to take out strings with numbers, but let's see if there is anything else that seems off.

In [None]:
# Look up 'the' in the fitted vocabulary; a successful lookup (no KeyError)
# shows that this stop word was kept as a feature.
nlp.vectorizer.vocabulary_['the']
# I didn't pass english stop words into the CountVectorizer, so that could be a good step to help out as well.

In [None]:
# Second attempt: drop scikit-learn's English stop words, ignore terms found
# in more than 80% of the documents or in fewer than 10 documents, and pass
# tweet_clean1 as the cleaning function.
nlp = NLPPipe(
    vectorizer=CountVectorizer(stop_words='english', max_df=0.80, min_df=10),
    tokenizer=TreebankWordTokenizer().tokenize,
    stemmer=PorterStemmer(),
    cleaning_function=tweet_clean1,
)

In [None]:
# Re-fit the reconfigured pipeline on the corpus, then transform the corpus.
# The transform result is not stored here; the trailing `;` only suppresses
# the cell output.
nlp.fit(corpus_list)
nlp.transform(corpus_list);

In [None]:
# Dense document-term matrix with one labelled column per vocabulary term.
dtm = pd.DataFrame(
    data=nlp.transform(corpus_list).toarray(),
    columns=nlp.vectorizer.get_feature_names(),
)

In [None]:
# Total count of every term across the corpus, most frequent first.
dtm.sum(axis=0).sort_values(ascending=False)
# Removing 'vegan', 'http' and 'plantbas' seems fine: every tweet in this
# corpus is about these topics, so the terms carry no distinguishing signal.

In [None]:
# Start from NLTK's English stop-word list so that corpus-specific terms can
# be added to it.
# NOTE(review): this needs the NLTK 'stopwords' corpus to be available
# locally (nltk.download('stopwords')); no download step is visible in this
# notebook.
stopwords = nltk.corpus.stopwords.words('english')

In [None]:
# Add the corpus-specific terms to the stop-word list.
# The membership check keeps this cell idempotent: re-running it does not
# append the same terms to the list a second time.
stopwords.extend(
    term for term in ('vegan', 'http', 'plantbas') if term not in stopwords
)

In [None]:
# Rebuild the count-based pipeline, this time with the custom `stopwords`
# list instead of scikit-learn's built-in English list. The frequency
# cut-offs and the cleaning function are unchanged.
nlp = NLPPipe(
    vectorizer=CountVectorizer(stop_words=stopwords, max_df=0.80, min_df=10),
    tokenizer=TreebankWordTokenizer().tokenize,
    stemmer=PorterStemmer(),
    cleaning_function=tweet_clean1,
)

In [None]:
# Fit the count-based pipeline and keep the transformed corpus; dtm_tf is
# the input to the count-based LDA model further down.
nlp.fit(corpus_list)
dtm_tf = nlp.transform(corpus_list)

In [None]:
# Same configuration as the count-based pipeline, but with TF-IDF weighting
# instead of raw counts.
nlp2 = NLPPipe(
    vectorizer=TfidfVectorizer(stop_words=stopwords, max_df=0.80, min_df=10),
    tokenizer=TreebankWordTokenizer().tokenize,
    stemmer=PorterStemmer(),
    cleaning_function=tweet_clean1,
)

In [None]:
# Fit the TF-IDF pipeline and keep the transformed corpus; dtm_tfidf feeds
# the TF-IDF-based LDA and NMF models further down.
nlp2.fit(corpus_list)
dtm_tfidf = nlp2.transform(corpus_list)

Let's do some basic topic modeling

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.sklearn

In [None]:
# Fit a 5-topic LDA model on the count-vectorized matrix. random_state is
# fixed so the run is repeatable. The model is visualized in the next cell.
lda_tf = LatentDirichletAllocation(n_components=5, random_state=0)
lda_tf.fit(dtm_tf)

In [None]:
# Turn on inline notebook rendering for pyLDAvis, then build the interactive
# visualization of the count-based LDA model.
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, nlp.vectorizer)

In [None]:
# Fit a 10-topic LDA model on the TF-IDF-weighted matrix; it is visualized
# in the next cell.
# NOTE(review): LDA is normally fit on raw term counts - confirm that TF-IDF
# input is intended. This model also uses 10 topics where the count-based
# model uses 5, so the two visualizations are not directly comparable.
lda_tfidf = LatentDirichletAllocation(n_components=10, random_state=0)
lda_tfidf.fit(dtm_tfidf)

In [44]:
# Build the same interactive visualization for the LDA model that was fit on
# the TF-IDF matrix.
# NOTE(review): the saved run of this cell ended in a KeyboardInterrupt (see
# the output below), so no visualization was produced for this model.
pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, nlp2.vectorizer)

KeyboardInterrupt: 

After further research, it seems that NMF tends to work better on short documents and smaller datasets, so it may recover more coherent topics for tweets and for this specific corpus.

In [None]:
# Factorize the TF-IDF matrix into 10 topics with NMF.
# random_state is pinned (same seed as the LDA models above) so that the
# extracted topics are reproducible across kernel restarts.
nmf_model = NMF(n_components=10, random_state=0)
# doc_topic holds the per-document topic weights returned by fit_transform.
doc_topic = nmf_model.fit_transform(dtm_tfidf)

In [None]:
# Topic-by-term weight table: one row per NMF component, one column per
# vocabulary term, weights rounded to 3 decimals.
# Row labels ("top1", "top2", ...) are generated from the number of fitted
# components, so the table keeps working if the topic count is changed.
topic_word = pd.DataFrame(
    nmf_model.components_.round(3),
    index=["top{}".format(n)
           for n in range(1, nmf_model.components_.shape[0] + 1)],
    columns=nlp2.vectorizer.get_feature_names(),
)
topic_word

In [None]:
display_topics(nmf_model, nlp2.vectorizer.get_feature_names(), 15)

In [None]:

def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))