In [28]:
import numpy as np
import pandas as pd

import os
import re
import spacy
import string
import itertools
from pprint import pprint
from tqdm.notebook import tqdm

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.wrappers import LdaMallet

# NLTK
import nltk
from nltk import bigrams
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from collections import Counter
import collections

import matplotlib.pyplot as plt

import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [29]:
# Restore the pickled candidate models and their coherence scores, then select
# the model that the rest of this notebook works with.
import pickle

# Relative file names used below resolve against this directory.
os.chdir(r'D:\DL_tweets_Classification\4_Topic_modelling\Data\Against')

# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
with open('modellist_final_full.txt', 'rb') as f:
    model_list = pickle.load(f)

with open('cohere_final_full.txt', 'rb') as f:
    coherence_values = pickle.load(f)

# Entry 4 of the loaded list is the model treated as optimal here.
optimal_model = model_list[4]
model_topics = optimal_model.show_topics(formatted=False)

# Show the 20 highest-weighted words of each topic.
topic_summaries = optimal_model.print_topics(num_words=20)
pprint(topic_summaries)

[(0,
  '0.052*"scienc" + 0.039*"scientist" + 0.022*"fact" + 0.021*"real" + '
  '0.015*"datum" + 0.014*"theori" + 0.012*"wrong" + 0.012*"evid" + '
  '0.011*"scientif" + 0.011*"true" + 0.010*"mani" + 0.010*"point" + '
  '0.010*"claim" + 0.010*"proof" + 0.009*"alarmist" + 0.009*"truth" + '
  '0.009*"report" + 0.009*"predict" + 0.009*"manmad" + 0.008*"research"'),
 (1,
  '0.071*"year" + 0.039*"time" + 0.039*"earth" + 0.036*"man" + 0.031*"ice" + '
  '0.028*"human" + 0.019*"natur" + 0.018*"planet" + 0.017*"age" + 0.012*"god" '
  '+ 0.011*"life" + 0.011*"due" + 0.010*"sun" + 0.010*"part" + 0.010*"littl" + '
  '0.009*"caus" + 0.009*"end" + 0.008*"long" + 0.008*"histori" + 0.008*"cycl"'),
 (2,
  '0.041*"weather" + 0.033*"cold" + 0.026*"day" + 0.019*"snow" + 0.017*"today" '
  '+ 0.017*"hot" + 0.015*"winter" + 0.013*"degre" + 0.013*"warm" + '
  '0.013*"hurrican" + 0.012*"real" + 0.012*"water" + 0.011*"record" + '
  '0.011*"high" + 0.010*"week" + 0.009*"air" + 0.009*"heat" + 0.009*"fuck" + '
  '0.

In [3]:
# Read the tweet table from the data directory and discard every row that has
# at least one missing value.
os.chdir(r'D:\DL_tweets_Classification\4_Topic_modelling\Data\Against')
tweetdf = pd.read_csv("tweetdf_against_full.csv").dropna(axis=0, how='any')

# Similar text preprocessing

In [4]:
def tweet_preprocessor(tweet):
    """Clean one raw tweet and return it lower-cased without punctuation.

    Steps, in order:
      1. line breaks become spaces;
      2. ``https://t.co/...`` links are removed;
      3. every token containing a digit is replaced by a single space;
      4. ``@handle`` mentions that are followed by a space are removed;
      5. ``#hashtag`` tokens are removed;
      6. a fixed list of fragments ("'s", a few negative contractions, "'re",
         "'m", "lol") is removed, case-sensitively and before lower-casing;
      7. the text is lower-cased and all ``string.punctuation`` characters
         are stripped.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text. Whitespace is not collapsed, so runs of spaces
        left behind by removals are preserved.

    Notes
    -----
    Handles and hashtags are removed in one regex pass each. The earlier
    implementation re-used every matched hashtag as a pattern, so a hashtag
    that was a prefix of another one (``#a #ab``) left the remainder (``b``)
    in the text. Output can therefore differ from the old code for such
    tweets.
    """
    tweet = tweet.replace('\n', ' ')                   # remove line breaks
    tweet = re.sub(r"\bhttps://t.co/\w+", '', tweet)   # remove t.co URLs
    tweet = re.sub(r'\w*\d\w*', ' ', tweet)            # blank out tokens containing digits

    # A handle is only matched together with its trailing space, so a mention
    # at the very end of the tweet (or directly followed by punctuation) keeps
    # its name and loses only the "@" in the punctuation step below.
    tweet = re.sub(r"@\w* ", '', tweet)

    # Remove hashtags in a single pass (see Notes in the docstring).
    tweet = re.sub(r"#\w*", '', tweet)

    # Literal, case-sensitive fragments; the order is kept from the original code.
    fragments = ("'s", "won't", "don't", "doesn't", "didn't", "isn't",
                 "'re", "'m", "lol")
    for fragment in fragments:
        tweet = tweet.replace(fragment, '')

    # Lower-case and strip punctuation.
    return re.sub('[%s]' % re.escape(string.punctuation), '', tweet.lower())

In [5]:
# Clean every raw tweet and store the result right after the first column.
# The column is iterated directly instead of through Series.iteritems(), which
# pandas 2.x no longer provides; `total` lets tqdm display real progress
# instead of an open-ended counter.
preprocessed_tweet = [
    tweet_preprocessor(raw_tweet)
    for raw_tweet in tqdm(tweetdf['Tweet'], total=len(tweetdf))
]
tweetdf.insert(1, column='preprocessed_text', value=preprocessed_tweet)

0it [00:00, ?it/s]

In [6]:
# Renumber the rows; the previous index is kept as a regular column.
tweetdf = tweetdf.reset_index()


def tokenize(tweet):
    """Yield one list of tokens for each text in ``tweet``.

    ``tweet`` is an iterable of texts. Each item is converted with ``str``
    and tokenized by gensim's ``simple_preprocess``; ``deacc=True`` asks
    gensim to strip accent marks from the tokens.
    """
    for text in tweet:
        tokens = gensim.utils.simple_preprocess(str(text), deacc=True)
        yield tokens


tweetdf['tidy_tweet_tokens'] = [tokens for tokens in tokenize(tweetdf['preprocessed_text'])]

In [7]:
# Prepare Stop Words
#nltk.download('stopwords')
stop_words = stopwords.words('english')
# NOTE(review): the two-word entries ('climate change', 'global warming') are
# compared against single tokens below, so they presumably never match.
stop_words.extend(['from', 'https', 'twitter', 'twitt', 'climate change', 'global warming',
                   'climate', 'change', 'global', 'warming', 'instagram', 'lol','good','bad',
                   'people', 'thing', 'news', 'tweet'])


def remove_stopwords(tweets):
    """Return, for every tweet, its tokens with the stop words filtered out.

    Parameters
    ----------
    tweets : iterable
        One entry per tweet. Each entry is converted with ``str`` and
        re-tokenized by ``simple_preprocess`` before filtering.

    Returns
    -------
    list of list of str
        One token list per input tweet, in input order.
    """
    # Build the lookup once per call: set membership is constant time, whereas
    # testing against the list scans it for every single token.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(tweet)) if word not in stop_set]
            for tweet in tweets]


tweetdf['tokens_no_stop'] = remove_stopwords(tweetdf['tidy_tweet_tokens'])

In [8]:
# Turn the stop-word-free token list of a row back into plain text
def rejoin_words(row):
    """Return the row's ``tokens_no_stop`` entries joined by single spaces."""
    return " ".join(row['tokens_no_stop'])
# Row-wise apply (axis=1): rejoin_words receives each row and reads its
# 'tokens_no_stop' entry; the joined strings become the new column.
tweetdf['no_stop_joined'] = tweetdf.apply(rejoin_words, axis=1)

In [9]:
# Plain-text tweets (stop words already removed) as a Python list.
data = tweetdf['no_stop_joined'].tolist()


def sent_to_words(sentences):
    """Yield one token list per sentence.

    Each sentence is converted with ``str`` and tokenized by gensim's
    ``simple_preprocess``; ``deacc=True`` asks gensim to strip accent marks.
    """
    for text in sentences:
        tokens = gensim.utils.simple_preprocess(str(text), deacc=True)
        yield tokens


data_words = [tokens for tokens in sent_to_words(data)]

# Phrase model over the tokenized tweets (min_count=5, threshold=100).
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)

# Phraser wrapper around the trained phrase model, used to transform documents.
bigram_mod = gensim.models.phrases.Phraser(bigram)

In [10]:
def make_bigrams(texts):
    """Pass every token list in ``texts`` through ``bigram_mod`` and collect the results."""
    merged_docs = []
    for doc in texts:
        merged_docs.append(bigram_mod[doc])
    return merged_docs


# Apply the phrase model to all tokenized tweets.
data_words_bigrams = make_bigrams(data_words)

In [11]:
def lemmatization(tweets, allowed_postags):
    """Lemmatize tokenized tweets, keeping only tokens with allowed POS tags.

    Parameters
    ----------
    tweets : iterable of list of str
        One token list per tweet; the tokens are joined with spaces before
        being handed to the spaCy pipeline ``nlp``.
    allowed_postags : container of str
        Coarse part-of-speech tags (``token.pos_`` values) to keep.

    Returns
    -------
    list of list of str
        For every tweet, the lemmas of the tokens whose tag is allowed,
        in input order.
    """
    # nlp.pipe processes the texts as a stream in batches, which avoids the
    # per-call overhead of invoking nlp(...) once for every tweet.
    texts = (" ".join(sent) for sent in tweets)
    return [[token.lemma_ for token in doc if token.pos_ in allowed_postags]
            for doc in nlp.pipe(texts)]

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [12]:
# Lemmatize the bigram-merged tweets, keeping only nouns and adjectives
# (allowed_postags lists both 'NOUN' and "ADJ").
# reset_index() moves the current index into a new column and renumbers the
# rows; the pd.Series built below carries a default 0..n-1 index, and the
# column assignment lines the two up by index label.
tweetdf = tweetdf.reset_index()
tweetdf['lemmatized'] = pd.Series(lemmatization(data_words_bigrams, allowed_postags=['NOUN',"ADJ"]))
# Save the frame (including the new 'lemmatized' column) to the working directory.
tweetdf.to_pickle('lemmatized_full.pkl')

In [15]:
# Keep only tweets whose lemmatized token list has at least three entries,
# then renumber the remaining rows from zero.
token_counts = tweetdf['lemmatized'].apply(len)
tweetdf = tweetdf[token_counts >= 3].reset_index(drop=True)

In [16]:
tweetdf

Unnamed: 0,level_0,index,Tweet,preprocessed_text,datetime,retweet_count,final_location,type_local,final_lat,final_lon,user_id_str,tidy_tweet_tokens,tokens_no_stop,no_stop_joined,lemmatized
0,0,0,After Libs Blame West Coast Fires on Global Wa...,after libs blame west coast fires on global wa...,2018-08-13,38,highlands ranch co usa,local_add,39.553877,-104.969426,21684265,"[after, libs, blame, west, coast, fires, on, g...","[libs, blame, west, coast, fires, forester, sp...",libs blame west coast fires forester speaks,"[libs, blame, fires_forester]"
1,1,1,@BrainEvacuated @usagalatheart @krassenstein W...,we just a computerized bankingenergy grid glit...,2018-08-13,0,pittsburgh pa,local_add,40.441694,-79.990086,1004116039167332354,"[we, just, computerized, bankingenergy, grid, ...","[computerized, bankingenergy, grid, glitch, aw...",computerized bankingenergy grid glitch away ut...,"[computerized, bankingenergy, grid, utter, bar..."
2,2,2,@Saintsfan5348 @FoxNews Comparing Judea-Christ...,comparing judeachristian values to sharia law...,2018-08-13,0,new jersey usa,state_add,40.167060,-74.499870,911591139924471808,"[comparing, judeachristian, values, to, sharia...","[comparing, judeachristian, values, sharia, la...",comparing judeachristian values sharia law lud...,"[judeachristian, value, ludicrous, globalist, ..."
3,3,3,#GlobalWarming has created near record cold co...,has created near record cold conditions in s...,2018-08-13,38,goodyear az,local_add,33.435367,-112.357601,282220986,"[has, created, near, record, cold, conditions,...","[created, near, record, cold, conditions, stra...",created near record cold conditions strange no...,"[record, cold, condition, strange, excited, co..."
4,4,4,The funny part is that they are basing their a...,the funny part is that they are basing their a...,2018-08-13,2,nyc ny,local_add,40.712728,-74.006015,299878544,"[the, funny, part, is, that, they, are, basing...","[funny, part, basing, alarm, computer, program...",funny part basing alarm computer program compu...,"[funny, part, alarm, computer, program, comput..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
782346,1003158,1004122,Guarantee the mainstream media outlets will ne...,guarantee the mainstream media outlets will ne...,2017-10-23,3793,pennsylvania usa,state_add,40.696298,-77.995133,4570518621,"[guarantee, the, mainstream, media, outlets, w...","[guarantee, mainstream, media, outlets, never,...",guarantee mainstream media outlets never menti...,"[guarantee_mainstream, medium, outlet, old, st..."
782347,1003159,1004123,"See, the way it's going climate change will on...",see the way it going climate change will only ...,2017-10-23,268,arizona usa,state_add,34.500300,-111.500980,890058174463594496,"[see, the, way, it, going, climate, change, wi...","[see, way, going, increase, desire, renewables...",see way going increase desire renewables russi...,"[way, russia, national_grid, arab]"
782348,1003160,1004124,Don't call this 100+ degree in late October we...,dont call this degree in late october weathe...,2017-10-23,0,los angeles ca,local_add,33.973951,-118.248405,348877502,"[dont, call, this, degree, in, late, october, ...","[dont, call, degree, late, october, weather, c...",dont call degree late october weather call opp...,"[degree, late, october, weather, call, opportu..."
782349,1003161,1004125,@SenJeffMerkley All #GOP cares about is not gi...,all cares about is not giving another damn pe...,2017-10-23,0,san diego,local_add,32.717420,-117.162773,298680916,"[all, cares, about, is, not, giving, another, ...","[cares, giving, another, damn, penny, central,...",cares giving another damn penny central banker...,"[damn, penny, central, banker, name]"


In [17]:
# Turn the lemmatized token list of a row back into plain text
def rejoin_words(row):
    """Return the row's ``lemmatized`` entries joined by single spaces."""
    return " ".join(row['lemmatized'])

# Space-joined lemmas for every tweet.
tweetdf['lemmatized_joined'] = tweetdf.apply(rejoin_words, axis=1)
tweetdf = tweetdf.reset_index(drop=True)

# Porter-stem each lemma, keeping one list of stems per tweet.
stemmer = PorterStemmer()
tweetdf['stemmed'] = [
    [stemmer.stem(lemma) for lemma in lemmas]
    for lemmas in tweetdf['lemmatized']
]

In [19]:
# Stemmed token lists, one per tweet.
tweets_stemmed = tweetdf['stemmed']
# Vocabulary mapping built from the stemmed tweets.
id2word_stemmed = corpora.Dictionary(tweets_stemmed)
# Bag-of-words representation of every tweet.
corpus_stemmed = list(map(id2word_stemmed.doc2bow, tweets_stemmed))

In [30]:
# Settings for the MALLET training call that is commented out below.
# NOTE(review): num_topics is referenced only by that disabled call, while the
# topic table shown later in this notebook contains topic id 4; confirm the
# value before re-enabling training.
num_topics = 4
os.environ['MALLET_HOME'] = r'C:\mallet'
mallet_path = r'C:\mallet\bin\mallet'

#ldamallet_stemmed = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus_stemmed, num_topics=num_topics, id2word=id2word_stemmed)

def format_topics_sentences(ldamodel, corpus, texts):
    """Build a table with the dominant topic of every document in ``corpus``.

    Parameters
    ----------
    ldamodel
        Topic model. ``ldamodel[corpus]`` must yield, per document, a sequence
        of ``(topic_id, proportion)`` pairs, and ``ldamodel.show_topic(id)``
        must return ``(word, weight)`` pairs for a topic.
    corpus
        Documents in the form expected by ``ldamodel[...]``.
    texts
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per document with the columns ``Dominant_Topic`` (integer
        topic id), ``Perc_Contribution`` (proportion rounded to 4 decimals)
        and ``Topic_Keywords`` (comma-separated words of that topic).
        Documents for which the model yields no pairs are skipped. An empty
        corpus gives an empty frame that still has the three columns.
    """
    columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    keywords_by_topic = {}  # topic id -> keyword string, computed once per topic
    records = []

    for row in ldamodel[corpus]:
        if len(row) == 0:
            continue
        # max() returns the first pair holding the highest proportion, the same
        # pair a stable descending sort would place first.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])

        if topic_num not in keywords_by_topic:
            wp = ldamodel.show_topic(topic_num)
            keywords_by_topic[topic_num] = ", ".join([word for word, prop in wp])

        records.append((int(topic_num), round(prop_topic, 4), keywords_by_topic[topic_num]))

    # Build the frame once; appending to a DataFrame row by row copies the whole
    # frame on every step, and DataFrame.append no longer exists in pandas 2.x.
    return pd.DataFrame(records, columns=columns)

In [31]:
# Determine the dominant topic of every document in corpus_stemmed using the
# loaded model. `texts` is passed because the signature requires it; the
# function body does not read it.
df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, 
                                                  corpus=corpus_stemmed, 
                                                  texts=tweetdf['stemmed'])

In [33]:
# Put the topic assignment next to the tweet metadata (joined on the row index),
# keep the columns needed downstream and write them to CSV.
export_columns = [
    "Dominant_Topic", "Perc_Contribution", "Topic_Keywords",
    "Tweet", "datetime", "retweet_count",
    "final_location", "type_local", "final_lat", "final_lon", "user_id_str",
]
topics_with_tweets = pd.concat([df_topic_sents_keywords, tweetdf], axis=1)
df_dominant_topic = topics_with_tweets[export_columns]
df_dominant_topic.to_csv('dominant_topic_4.csv', index=False)

In [25]:
len(tweetdf['stemmed'])

782351

In [32]:
df_topic_sents_keywords

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords
0,4.0,0.2138,"liber, polit, democrat, fire, agenda, medium, ..."
1,1.0,0.2022,"year, time, earth, man, ice, human, natur, pla..."
2,0.0,0.1879,"scienc, scientist, fact, real, datum, theori, ..."
3,2.0,0.2644,"weather, cold, day, snow, today, hot, winter, ..."
4,1.0,0.2090,"year, time, earth, man, ice, human, natur, pla..."
...,...,...,...
782346,4.0,0.1914,"liber, polit, democrat, fire, agenda, medium, ..."
782347,3.0,0.1761,"hoax, trump, gore, fake, stupid, presid, guy, ..."
782348,2.0,0.2341,"weather, cold, day, snow, today, hot, winter, ..."
782349,2.0,0.1852,"weather, cold, day, snow, today, hot, winter, ..."
