In [15]:
from tqdm._tqdm_notebook import tqdm_notebook
import spacy
from spacy.lang.en import English
import nltk
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd
import re
import random
from gensim import corpora
import pyLDAvis.gensim
tqdm_notebook.pandas()

In [1]:
spacy.load('en_core_web_sm')
parser = English()

def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [3]:
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

In [4]:
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\husse\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [6]:
df = pd.read_csv("data.csv")
df = df[['ID','text']]

In [10]:
df = pd.read_csv("data.csv")
df = df[['ID','text']]
df.text = df.text.apply(lambda x:' '.join(re.sub("(^|\s)((https?:\/\/)?[\w-]+(\.[\w-]+)+\.?(:\d+)?(\/\S*)?)"," ",x).split()))
df.text = df.text.apply(lambda x:' '.join(re.sub("(\w+/+\w*)|(\\+\w*)"," ",x).split()))
df.text = df.text.apply(lambda x:' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",x).split()))
df.text = df.text.apply(lambda x: re.sub(u'http\S+', u'', x))   
df.text = df.text.apply(lambda x: x.replace(u'RT', u''))
df.drop_duplicates(subset="text",keep='first',inplace=True)

In [24]:
df['tokens'] = df['text'].progress_apply(lambda x:prepare_text_for_lda(x))

HBox(children=(IntProgress(value=0, max=13101), HTML(value='')))




In [27]:
text_data = df['tokens'].tolist()

In [29]:
dictionary = corpora.Dictionary(text_data)

In [30]:
corpus = [dictionary.doc2bow(text) for text in text_data]

In [31]:
import pickle
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [32]:
import gensim
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model5.gensim')

In [33]:
ldamodel.print_topics(num_words=20)

[(0,
  '0.076*"lebanon" + 0.052*"protest" + 0.022*"government" + 0.021*"whatsapp" + 0.013*"economic" + 0.011*"tax" + 0.011*"beirut" + 0.009*"corruption" + 0.008*"crisis" + 0.007*"street" + 0.007*"third" + 0.007*"thousand" + 0.007*"call" + 0.007*"country" + 0.006*"lebanese" + 0.005*"across" + 0.005*"continue" + 0.004*"banks" + 0.004*"reform" + 0.004*"protester"'),
 (1,
  '0.080*"lebanon" + 0.070*"lebanonprotests" + 0.027*"lebanese" + 0.026*"people" + 0.025*"protest" + 0.018*"beirut" + 0.012*"protester" + 0.012*"revolution" + 0.009*"country" + 0.009*"government" + 0.007*"hezbollah" + 0.007*"force" + 0.007*"lebaneserevolution" + 0.007*"today" + 0.006*"hariri" + 0.006*"party" + 0.006*"political" + 0.005*"street" + 0.005*"lebanonprotest" + 0.005*"change"'),
 (2,
  '0.051*"lebanonprotests" + 0.041*"lebanon" + 0.022*"liban" + 0.014*"paris" + 0.008*"igshid" + 0.006*"barcelona" + 0.005*"beirut" + 0.005*"chile" + 0.005*"libanais" + 0.004*"france" + 0.004*"libanon" + 0.004*"manifestation" + 0.004

In [38]:
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.076*"lebanon" + 0.052*"protest" + 0.022*"government" + 0.021*"whatsapp" + 0.013*"economic"')
(1, '0.080*"lebanon" + 0.070*"lebanonprotests" + 0.027*"lebanese" + 0.026*"people" + 0.025*"protest"')
(2, '0.051*"lebanonprotests" + 0.041*"lebanon" + 0.022*"liban" + 0.014*"paris" + 0.008*"igshid"')
(3, '0.111*"lebanon" + 0.015*"syria" + 0.007*"egypt" + 0.007*"bassil" + 0.005*"beirut"')
(4, '0.059*"lebanonprotest" + 0.042*"lebanon" + 0.013*"tripoli" + 0.012*"lebanonprotests" + 0.006*"igshid"')


In [127]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 10, id2word=dictionary, passes=15)
ldamodel.save('model10.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.053*"lebanon" + 0.029*"lebanonprotests" + 0.009*"music" + 0.008*"beirut"')
(1, '0.087*"lebanon" + 0.019*"lebanonprotests" + 0.014*"protest" + 0.011*"change"')
(2, '0.059*"lebanon" + 0.036*"liban" + 0.019*"lebanonprotest" + 0.012*"lebanonprotests"')
(3, '0.057*"lebanon" + 0.026*"lebanonprotests" + 0.016*"corruption" + 0.015*"protest"')
(4, '0.054*"lebanon" + 0.010*"revolution" + 0.007*"forest" + 0.007*"yesterday"')
(5, '0.010*"lebanon" + 0.010*"lebanonprotests" + 0.008*"manifestants" + 0.006*"remain"')
(6, '0.123*"lebanon" + 0.032*"lebanonprotest" + 0.028*"protest" + 0.022*"government"')
(7, '0.069*"lebanon" + 0.028*"tripoli" + 0.024*"lebanonprotests" + 0.016*"revolution"')
(8, '0.089*"lebanonprotests" + 0.031*"lebanon" + 0.011*"force" + 0.009*"protester"')
(9, '0.059*"lebanon" + 0.037*"protester" + 0.015*"lebanonprotests" + 0.015*"protest"')


In [35]:
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')

In [36]:
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [23]:
lda10 = gensim.models.ldamodel.LdaModel.load('model10.gensim')
lda_display10 = pyLDAvis.gensim.prepare(lda10, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display10)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [129]:
s = topics[0][1]

In [130]:
s[s.find("\"")+1:s.find("\"")]

''

In [131]:
re.findall('"([^"]*)"', s)

['lebanon', 'lebanonprotests', 'music', 'beirut']

In [39]:
words = set()
for topic in topics:
    words.update(re.findall('"([^"]*)"', topic[1]))

In [40]:
words

{'bassil',
 'beirut',
 'economic',
 'egypt',
 'government',
 'igshid',
 'lebanese',
 'lebanon',
 'lebanonprotest',
 'lebanonprotests',
 'liban',
 'paris',
 'people',
 'protest',
 'syria',
 'tripoli',
 'whatsapp'}

In [41]:
import numpy as np
np.save("words",words)

In [42]:
df["inf"] = df["text"].apply(lambda x:len(np.intersect1d(prepare_text_for_lda(x),list(words))))

In [43]:
df = df.sort_values(by="inf",ascending=False)

In [44]:
df

Unnamed: 0,ID,text,tokens,inf
832,1184917370881466368,People want the fall of the regime Protests er...,"[people, regime, protest, erupt, beirut, parts...",8
1558,1184974867466248192,Thousands of people demonstrated in Beirut on ...,"[thousand, people, demonstrate, beirut, thursd...",8
10475,1185578028727984131,Downtown Beirut is filled with people that gat...,"[downtown, beirut, fill, people, gather, prote...",7
15619,1185892535123828737,Thousands of people gathered in the Lebanese c...,"[thousand, people, gather, lebanese, capital, ...",7
12161,1185643337715507213,Protesters carry Lebanese flags as they shout ...,"[protester, carry, lebanese, flag, shout, gove...",7
879,1184920767181385728,This after the Lebanese Education Minister s b...,"[lebanese, education, minister, bodyguard, pro...",7
8107,1185457118511554562,Lebanon burns Strong protests against the gove...,"[lebanon, burns, strong, protest, government, ...",7
9283,1185519785917976578,BREAKING Oct 18 Beirut Lebanon Peaceful demons...,"[breaking, beirut, lebanon, peaceful, demonstr...",7
14176,1185825405405728768,Protesting and revolting against the corruptio...,"[protest, revolt, corruption, lebanese, govern...",7
6124,1185262989215182853,The second day of the Beirut people s protest ...,"[second, beirut, people, protest, increase, wh...",7


In [48]:
"\\n".join(df[df.inf>=7].text)

'People want the fall of the regime Protests erupt in Beirut and other parts of Lebanon following the government s decision to tax WhatsApp calls The economic crisis The fires Now this Lebanon LebanonProtests\\nThousands of people demonstrated in Beirut on Thursday against the Lebanese government s management of an economic crisis in one of the biggest protests in years leading the Cabinet to pull a proposed new levy on WhatsApp calls usands protest across lebanon over dire economy proposed whatsapp fee withdrawn Lebanon\\nDowntown Beirut is filled with people that gathered to protest against corruption and the oligarchy in the Lebanese government A protest for Lebanon not for any political party LebanonProtests  1185561099904659461\\nThousands of people gathered in the Lebanese capital Beirut hours after a junior party quit the country s fragile coalition government Dire economic conditions have blighted the heavily indebted country for years tests enter fourth day after party quits 5