In [None]:
#!pip install gensim
#!pip install spacy
#!pip install nltk
#!pip install pyLDAvis

In [1]:
import pandas as pd
import unicodedata
import re
import contractions
import string

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
#spacy
import spacy
from nltk.corpus import stopwords
#vis
import pyLDAvis
import pyLDAvis.gensim_models

In [17]:
def load_data(path):
    return pd.read_csv(path)
tweets_df=load_data("qatar_loc_data.csv")

In [18]:
tweets_df = tweets_df[["text","classification","country"]]
tweets_df = tweets_df.loc[tweets_df["country"]=="Germany"]

In [19]:
tweets_df = tweets_df.loc[tweets_df["classification"]=="negative"]

In [None]:
tweets_df

In [20]:
def to_lowercase(text):
    return text.lower()
#converting every row of the column into lower case 
tweets_df.text=tweets_df.text.apply(to_lowercase)

In [21]:
def standardize_accented_chars(text):
 return unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("utf-8", "ignore")
#testing the function on a single sample for explaination
print(standardize_accented_chars('Sómě words such as résumé, café, prótest, divorcé, coördinate, exposé, latté.'))
#standardizing accented characters for every row
tweets_df.text=tweets_df.text.apply(standardize_accented_chars)

Some words such as resume, cafe, protest, divorce, coordinate, expose, latte.


In [22]:
def get_number_of_urls(documents):
    print("{:.2f}% of documents contain urls".format(sum
(documents.apply(lambda x:x.find('http'))>0)/len
(documents)*100))
# Passing the 'text' column of the dataframe as the argument
print(get_number_of_urls(tweets_df.text)) 

51.25% of documents contain urls
None


In [23]:
def remove_url(text):
 return re.sub(r"https?:\S*", "", text)
#testing the function on a single sample for explaination
print(remove_url('using https://www.google.com/ as an example'))
#removing urls from every row
tweets_df.text=tweets_df.text.apply(remove_url)

using  as an example


In [24]:
def expand_contractions(text):
    expanded_words = [] 
    for word in text.split():
       expanded_words.append(contractions.fix(word)) 
    return " ".join(expanded_words)
#testing the function on a single sample for explaination
print(expand_contractions("Don't is same as do not"))
#expanding contractions for every row
tweets_df.text=tweets_df.text.apply(expand_contractions)

Do not is same as do not


In [25]:
def remove_mentions_and_tags(text):
    return re.sub(r"@\S*", "", text)
    #return re.sub(r"#\S*", "", text)
#testing the function on a single sample for explaination
print(remove_mentions_and_tags('Some random @abc and #def'))
#removing mentions and tags from every row
tweets_df.text=tweets_df.text.apply(remove_mentions_and_tags)

Some random  and #def


In [26]:
def keep_only_alphabet(text):
    return re.sub(r"[^a-z]", " ", text)
#testing the function on a single sample for explaination
print(keep_only_alphabet('Just a bit more $$processing #required.Just a bit!!!'))
#for all the rows
tweets_df.text=tweets_df.text.apply(keep_only_alphabet)

 ust a bit more   processing  required  ust a bit   


In [27]:
def remove_stopwords(text, nlp, custom_stop_words=None, remove_small_tokens=True, min_len=2):
    # If custom stop words are provided, add them to the default stop words list
    if custom_stop_words:
        nlp.Defaults.stop_words |= set(custom_stop_words)

    filtered_tokens = [] 
    doc = nlp(text)
    for token in doc:
        if not token.is_stop:
            # If small tokens have to be removed, select only those which are longer than min_len 
            if remove_small_tokens and len(token.text) <= min_len:
                continue
            filtered_tokens.append(token.text)

    # If after the stop word removal, words are still left in the sentence, then return the sentence as a string, else return None 
    return ' '.join(filtered_tokens) if filtered_tokens else ""

# Download spaCy model 'en_core_web_sm'
!python -m spacy download en_core_web_sm

# Creating a spaCy object
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

Collecting en-core-web-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 435.7 kB/s eta 0:00:30
     --------------------------------------- 0.0/12.8 MB 435.7 kB/s eta 0:00:30
     --------------------------------------- 0.0/12.8 MB 217.9 kB/s eta 0:00:59
     --------------------------------------- 0.0/12.8 MB 217.9 kB/s eta 0:00:59
     --------------------------------------- 0.1/12.8 MB 286.7 kB/s eta 0:00:45
     --------------------------------------- 0.1/12.8 MB 286.7 kB/s eta 0:00:45
     --------------------------------------- 0.1/12.8 MB

In [28]:
#removing stop-words and short words from every row
tweets_df.text=tweets_df.text.apply(lambda x:remove_stopwords(x,nlp,{"rt ","fifaworldcup","world","cup","worldcup","qatar"}))

In [29]:
tweets_df.loc[tweets_df["text"]==None]

Unnamed: 0,text,classification,country


In [30]:
def lemmatize(text, nlp):
   doc = nlp(text)
   lemmatized_text = []
   for token in doc:
       lemmatized_text.append(token.lemma_)
   return " ".join(lemmatized_text)
#testing the function on a single sample for explaination
print(lemmatize("Reading NLP blog is fun." ,nlp ))
#Performing lemmatization on every row
tweets_df.text=tweets_df.text.apply(lambda x:lemmatize(x,nlp))

read NLP blog be fun .


In [31]:
def generate_tokens(tweet):
    words=[]
    for word in tweet.split(" "):
    # using the if condition because we introduced extra spaces during text cleaning
        if word!="":
           words.append(word)
        return words
#storing the generated tokens in a new column named 'words'
tweets_df['tokens']=tweets_df.text.apply(generate_tokens)

In [32]:
def create_dictionary(words):
    return corpora.Dictionary(words)
#passing the dataframe column having tokens as the argument
id2word=create_dictionary(tweets_df.tokens)
print(id2word)

Dictionary<1756 unique tokens: ['great', 'half', 'nation', 'mad', 'israelis']...>


In [39]:
def create_document_matrix(tokens,id2word):
    corpus = []
    for text in tokens:
       corpus.append(id2word.doc2bow(text))
    return corpus
#passing the dataframe column having tokens and dictionary
corpus=create_document_matrix(tweets_df.tokens,id2word)

In [40]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
 id2word=id2word,
 num_topics=10,
 random_state=100,
 )

In [45]:
def get_lda_topics(model, num_topics, top_n_words):
    word_dict = {}
    for i in range(num_topics):
        word_dict['Topic # ' + '{:02d}'.format(i+1)] = [i[0] for i in model.show_topic(i, topn=top_n_words)]

    return pd.DataFrame(word_dict)

get_lda_topics(lda_model,6,10)

Unnamed: 0,Topic # 01,Topic # 02,Topic # 03,Topic # 04,Topic # 05,Topic # 06
0,fifa,basically,force,good,second,probably
1,know,game,germany,football,end,president
2,terrible,think,player,thank,fuck,country
3,shit,time,death,yeah,security,migrant
4,boycott,arab,like,video,see,big
5,course,go,fail,threaten,disgraceful,human
6,year,give,say,get,hey,bribe
7,qatar,tell,absolutely,focus,watch,day
8,major,interesting,buy,man,thing,way
9,glad,yes,international,love,imagine,reporter


In [46]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
vis

