# Topic Modeling

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF
from nltk.tokenize import TreebankWordTokenizer, TweetTokenizer
from nltk.stem import PorterStemmer
from nltk import SnowballStemmer
import nltk

from pipeline import NLPPipe, tweet_clean1

from helper_functions import txt_to_df

import pickle
%load_ext autoreload
%autoreload 2

In [None]:
# Load the pickled tweets DataFrame and pull its 'long_text' column out as a
# plain Python list, which is the form handed to the pipeline in later cells.
# NOTE(review): unpickling can execute arbitrary code -- only load
# "all_tweets.pkl" if it comes from a trusted source.
all_tweets = pd.read_pickle("all_tweets.pkl")
corpus_list = all_tweets['long_text'].tolist()

In [None]:
# Start from NLTK's English stop-word list and add corpus-specific terms that
# should not show up in the topics.
# NOTE(review): 'plantbas' is presumably the stemmed form of "plantbased" --
# confirm against the stemmer output.
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(['vegan', 'http', 'plantbas'])

In [None]:
# Tf-idf vectorizer: drops the custom stop words, ignores terms that appear in
# more than 80% of documents or in fewer than 10 documents.
tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords, max_df=0.80, min_df=10)

# Bundle the vectorizer with tweet tokenisation, English Snowball stemming and
# the project's tweet-cleaning function.
nlp2 = NLPPipe(
    vectorizer=tfidf_vectorizer,
    tokenizer=TweetTokenizer().tokenize,
    stemmer=SnowballStemmer("english"),
    cleaning_function=tweet_clean1,
)

In [None]:
# Fit the pipeline on the full tweet corpus, then transform that same corpus.
# NOTE(review): dtm_tfidf is presumably the tf-idf document-term matrix
# produced by the TfidfVectorizer configured on nlp2 -- confirm in pipeline.py.
nlp2.fit(corpus_list)
dtm_tfidf = nlp2.transform(corpus_list)

In [None]:
# Factorise the document-term matrix with NMF into 9 components (topics);
# doc_topic holds the per-document weights returned by fit_transform.
nmf_model = NMF(n_components=9)
doc_topic = nmf_model.fit_transform(dtm_tfidf)

In [None]:

def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays (each must support ``argsort()``).
        feature_names: Sequence mapping a column index to its term.
        no_top_words: Number of top-weighted terms to print per topic.
        topic_names: Optional sequence of labels, indexed by topic number.
            A missing sequence or a falsy entry falls back to printing the
            topic's numeric index.
    """
    for topic_idx, weights in enumerate(model.components_):
        label = topic_names[topic_idx] if topic_names else None
        if label:
            print("\nTopic: '", label, "'")
        else:
            print("\nTopic ", topic_idx)
        # argsort is ascending, so walk it backwards to get the largest
        # weights first and stop after no_top_words entries.
        ranked = weights.argsort()[:-no_top_words - 1:-1]
        print(", ".join(feature_names[term_idx] for term_idx in ranked))

In [None]:
# Show the 15 highest-weighted terms for each NMF topic.
# scikit-learn 1.0 introduced get_feature_names_out() and removed the old
# get_feature_names() in 1.2; prefer the new accessor and fall back to the old
# one so the cell runs on either side of that change.
_vectorizer = nlp2.vectorizer
_feature_names = (
    _vectorizer.get_feature_names_out()
    if hasattr(_vectorizer, "get_feature_names_out")
    else _vectorizer.get_feature_names()
)
display_topics(nmf_model, _feature_names, 15)

In [None]:
# Document-topic weights as a DataFrame: rows are indexed by the tweet text,
# columns are labelled with the topic number as a string ("0", "1", ...).
# The labels are derived from the matrix width rather than hard-coded, so the
# cell keeps working when the number of NMF components is changed.
H = pd.DataFrame(doc_topic.round(5),
                 index=corpus_list,
                 columns=[str(topic_ix) for topic_ix in range(doc_topic.shape[1])])
H

In [None]:
# For every row of H, take the label of the column holding the largest weight,
# i.e. the dominant topic of that document.
H_topic = H.idxmax(axis="columns")

In [None]:
# Display the dominant topic label per document.
H_topic