In [1]:
%run helper_functions.py
%run tweepy_wrapper.py
%run s3.py
%run mongo.py
%run df_functions.py

import pandas as pd
import string
from nltk.corpus import stopwords
# NLTK's English stopwords, extended with Twitter shorthand ("rt", "via") and dash/arrow glyphs.
nltk_stopwords = stopwords.words("english")+["rt", "via","-»","--»","--","---","-->","<--","->","<-","«--","«","«-","»","«»"]

### Step 1: Obtain my tweets!

I will obtain my entire tweet history! Note: For 2nd degree potential followers, I only extract 200 of their most recent tweets!

In [2]:
# Fetch my own timeline with a helper loaded by the %run cells above.
# NOTE(review): 2000 is presumably the maximum number of tweets to pull — confirm against the helper.
gabr_tweets = extract_users_tweets("gabr_ibrahim", 2000)

### Step 2: Create a dictionary from my tweets

This dictionary will have the same structure as our already collected 2nd degree followers

In [3]:
# Build the per-user record: the handle maps to four lists that are filled tweet by tweet.
handle = 'gabr_ibrahim'
gabr_dict = {handle: {"content" : [], "hashtags" : [], "retweet_count": [], "favorite_count": []}}
record = gabr_dict[handle]

for status in gabr_tweets:
    # gather everything for this tweet first, then store it
    body, tags = extract_text(status), extract_hashtags(status)
    n_retweets, n_favorites = status.retweet_count, status.favorite_count

    record['content'].append(body)
    # hashtags are flattened into one list for the user rather than kept per tweet
    record['hashtags'].extend(tags)
    record["retweet_count"].append(n_retweets)
    record["favorite_count"].append(n_favorites)

### Step 3: Create a dataframe from my tweets

We will now turn this dictionary into a dataframe - I do this as it allows me to utilise pandas in cleaning the content of my tweets!

After the cleaning on the 'content' column, I will convert the dataframe back into a dictionary.

In [4]:
# orient='index' makes the top-level key (the user handle) the row label;
# the inner keys become columns, each cell holding a whole list.
gabr_tweets_df = pd.DataFrame.from_dict(gabr_dict, orient='index')

In [5]:
gabr_tweets_df.head()

Unnamed: 0,favorite_count,content,retweet_count,hashtags
gabr_ibrahim,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",[RT @UChicagoCAPP: Great turnout today! Hope y...,"[5, 1, 1065, 1, 0, 11, 27, 1407, 728, 1107, 0,...","[opendata, NLP, spaCy, Metis, TopicModelling, ..."


In [6]:
# Clean the 'content' column with the filtration helper (presumably defined in one of the scripts %run above).
clean_gabr_tweets = filtration(gabr_tweets_df, "content")

In [7]:
# NOTE: rebinds clean_gabr_tweets from filtration's result to the helper's output,
# a list holding one {handle: {...}} dictionary.
clean_gabr_tweets = dataframe_to_dict(clean_gabr_tweets)

In [8]:
clean_gabr_tweets #this is a list of 1 dictionary

[{'gabr_ibrahim': {'content': ['great turnout today hope able join us slides available link video webinar coming soon',
    'ms capp student talks challenges value time',
    'good news steve bannon gone bad news replaced sentient swastika right arm permanently',
    'byyyeeeee',
    'late night could possibly better',
    'millennials killed confederate monument',
    'find friends speakers around add yourself directory',
    'in gop led homeland security committee declare charlottesville attack act domestic terrorism',
    "republicans extremely opposed erasing history unless it's named obama insures million americans",
    'mean ok',
    'this',
    'idea nazis people oppose nazis somehow equatable batshit fucking crazy shit i ever',
    'god grant serenity accept get grant, courage write anyway, wisdom know',
    'seriously though work castle',
    'sure, cancer aggressive chemotherapy also aggressive aggression sides',
    "problem another's solution; solution problem unknown",
  

### Step 4: LDA Analysis

Let's now move onto the LDA pre-processing stage and analysis!

In [9]:
import spacy
import nltk
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore
import pyLDAvis
import pyLDAvis.gensim
from collections import Counter
from gensim.corpora.dictionary import Dictionary
nlp = spacy.load('en')

In [10]:
# NOTE: rebinds gabr_tweets from the raw tweet objects fetched earlier to the list of cleaned text strings.
gabr_tweets = clean_gabr_tweets[0]['gabr_ibrahim']['content']

In [11]:
gabr_tweets[:5]

['great turnout today hope able join us slides available link video webinar coming soon',
 'ms capp student talks challenges value time',
 'good news steve bannon gone bad news replaced sentient swastika right arm permanently',
 'byyyeeeee',
 'late night could possibly better']

Let's now proceed to tokenize these tweets in addition to lemmatizing them! This will help improve the performance of our LDA model!

I will utilise spacy for this process as it is a production grade NLP library that is exceptionally fast!

In [12]:
# Lemmatize every tweet with spaCy, producing one space-joined string per tweet
# so that each tweet stays separate.
lemmatized = []
for raw_tweet in gabr_tweets:
    kept_lemmas = [
        str(tok.lemma_)
        for tok in nlp(raw_tweet)
        # skip whitespace, punctuation, stopwords, pure digits and tokens of length 1 or 2
        if not (tok.is_space or tok.is_punct or tok.is_stop or tok.is_digit or len(tok) in (1, 2))
    ]
    lemmatized.append(" ".join(kept_lemmas))

# strip surrounding whitespace, then drop tweets that were left with no tokens at all
tokenized_tweets = [text for text in (entry.strip() for entry in lemmatized) if text != ""]

In [13]:
tokenized_tweets[:5] # you can see how this is different to the raw tweets!

['great turnout today hope able join slide available link video webinar come soon',
 'capp student talk challenge value time',
 'good news steve bannon go bad news replace sentient swastika right arm permanently',
 'byyyeeeee',
 'late night possibly better']

Let's now add these tokenized tweets to our dictionary!

In [14]:
# Store the lemmatized strings next to the original fields. This list can be shorter than
# 'content' because tweets that ended up empty were dropped.
clean_gabr_tweets[0]['gabr_ibrahim']['tokenized_tweets'] = tokenized_tweets

I will now turn the dictionary back into a dataframe and run it through the filtration function, before re-casting the dataframe into a dictionary.

This time, we are running the filtration process on the tokenized tweets column and not the content column.

NLP models are very sensitive - ensuring consistent cleaning is important!

In [15]:
# Rebuild a one-row DataFrame (row label = user handle) from the inner dictionary,
# so the filtration helper can be run on the new column.
clean_gabr_tweets_df = pd.DataFrame.from_dict(clean_gabr_tweets[0], orient='index')

In [16]:
clean_gabr_tweets_df.head()

Unnamed: 0,favorite_count,content,tokenized_tweets,retweet_count,hashtags
gabr_ibrahim,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",[great turnout today hope able join us slides ...,[great turnout today hope able join slide avai...,"[5, 1, 1065, 1, 0, 11, 27, 1407, 728, 1107, 0,...","[opendata, NLP, spaCy, Metis, TopicModelling, ..."


In [17]:
# Same cleaning helper as for 'content', this time applied to the 'tokenized_tweets' column.
clean_gabr_tweets_df = filtration(clean_gabr_tweets_df, "tokenized_tweets")

In [18]:
# Convert back to the list-of-one-dictionary structure; this overwrites the earlier clean_gabr_tweets.
clean_gabr_tweets = dataframe_to_dict(clean_gabr_tweets_df)

In [19]:
clean_gabr_tweets[0]['gabr_ibrahim']['tokenized_tweets'][:5]

['great turnout today hope able join slide available link video webinar come soon',
 'capp student talk challenge value time',
 'good news steve bannon go bad news replace sentient swastika right arm permanently',
 'byyyeeeee',
 'late night possibly better']

### Gensim LDA Process

Fantastic - at this point, we have everything we need to proceed with LDA from the Gensim Library.

LDA via the Gensim library requires that our data be in a very specific format.

Broadly, LDA requires a Dictionary object that is later used to create a matrix called a corpus.

The Gensim LDA Dictionary will require that we pass in a list of lists. Every sublist will be a tweet that has been split.

Let's look at my first tweet as an example.

Before:

['great turnout today hope able join slide available link video webinar come soon', tweet 2, tweet 3, ...]

Correct Gensim Format:

[['great', 'turnout', 'today', 'hope', 'able', 'join', 'slide', 'available','link', 'video', 'webinar', 'come', 'soon'], [tweet 2 in split form], [...],...]




In [24]:
# One lemmatized string per tweet, as stored under 'tokenized_tweets'.
list_of_tweets_gabr = clean_gabr_tweets[0]['gabr_ibrahim']['tokenized_tweets']

In [25]:
# Gensim wants every document as a list of tokens, so split each tweet on whitespace.
gensim_format_tweets = [document.split() for document in list_of_tweets_gabr]

In [26]:
gensim_format_tweets[:5]

[['great',
  'turnout',
  'today',
  'hope',
  'able',
  'join',
  'slide',
  'available',
  'link',
  'video',
  'webinar',
  'come',
  'soon'],
 ['capp', 'student', 'talk', 'challenge', 'value', 'time'],
 ['good',
  'news',
  'steve',
  'bannon',
  'go',
  'bad',
  'news',
  'replace',
  'sentient',
  'swastika',
  'right',
  'arm',
  'permanently'],
 ['byyyeeeee'],
 ['late', 'night', 'possibly', 'better']]

In [30]:
# Build the gensim Dictionary from the list of token lists (one list per tweet).
gensim_dictionary = Dictionary(gensim_format_tweets)

I will now filter out extreme words - that is, words that appear far too often and words that are rare.

In [31]:
# Per gensim's documentation: no_below=10 drops tokens found in fewer than 10 documents (tweets),
# no_above=0.4 drops tokens found in more than 40% of them.
gensim_dictionary.filter_extremes(no_below=10, no_above=0.4)
gensim_dictionary.compactify() # remove gaps after words that were removed

We now need to vectorize all the tweets so that they can be fed to the LDA algorithm! To do this, we will create a bag of words model from our tweets.

After putting all our tweets through this bag of words model, we will end up with a 'corpus' that represents all the tweets for a particular user. In this case, that user is myself.

We will save this corpus to disk as we go along! We will use the MmCorpus object from Gensim to achieve this.

In [32]:
!pwd

/home/igabr/new-project-4


In [35]:
# Directory that receives the serialized corpus and the LDA model.
# The cells below build file names by plain string concatenation, so the trailing "/"
# is required: without it the files land in the parent directory with "new-project-4"
# glued onto the front of their names.
file_path_corpus = "/home/igabr/new-project-4/"

In [34]:
def bag_of_words_generator(lst, dictionary):
    """Lazily yield the bag-of-words form of every document in ``lst``.

    Parameters
    ----------
    lst : iterable
        Documents to convert; each item is passed unchanged to ``dictionary.doc2bow``
        (in this notebook: one list of tokens per tweet).
    dictionary : Dictionary
        Gensim dictionary used for the conversion.

    Yields
    ------
    The value of ``dictionary.doc2bow(document)`` for each document, in input order.

    Raises
    ------
    AssertionError
        If ``dictionary`` is not a gensim ``Dictionary`` (subclasses are accepted).
        This is a generator, so the check runs when iteration starts, not at call time.
    """
    # isinstance rather than an exact type comparison, so Dictionary subclasses are accepted too
    assert isinstance(dictionary, Dictionary), "Please enter a Gensim Dictionary"
    for document in lst:
        yield dictionary.doc2bow(document)

In [36]:
# Write the bag-of-words vectors produced by the generator to an .mm file on disk.
# The path is formed by plain concatenation, so file_path_corpus must already end with a path separator.
MmCorpus.serialize(file_path_corpus+"{}.mm".format("gabr_ibrahim"), bag_of_words_generator(gensim_format_tweets, gensim_dictionary))

In [37]:
# Re-open the corpus from the file that was just serialized (same concatenated path).
corpus = MmCorpus(file_path_corpus+"{}.mm".format("gabr_ibrahim"))

In [39]:
corpus.num_terms # the number of terms in our corpus!

224

In [44]:
corpus.num_docs # the number of documents, i.e. the number of tweets!

1708

# Now for the LDA part!

I will be using the LDAMulticore class from gensim!

I set the passes parameter to 100 and the chunksize to 2000.

The chunksize (2000) is larger than the number of documents, so the model uses all the documents at once, and the passes parameter makes it go over all the documents 100 times during training.

As I am using my ENTIRE tweet history, I will create 30 topics!

I will adjust this to 10 when running lda on 2nd degree connections, as I will only have 200 of their tweets!

In [122]:
# Train the topic model: 30 topics, 100 passes over the corpus, 2000 documents per chunk.
# - workers is left at gensim's default, which sizes the worker pool from the machine's core
#   count; the previous workers=100 requested 100 worker processes regardless of hardware.
# - random_state fixes the seed so that re-running the notebook gives comparable topics.
lda = LdaMulticore(corpus, num_topics=30, id2word=gensim_dictionary, chunksize=2000, passes=100, random_state=42)

I can then save this lda model!

In [123]:
# Persist the trained model; the target is file_path_corpus + "lda_model_gabr_ibrahim" (plain concatenation).
lda.save(file_path_corpus+"lda_model_{}".format("gabr_ibrahim"))

In [124]:
# Reload the model from disk; the cells below use this reloaded instance.
lda = LdaMulticore.load(file_path_corpus+"lda_model_{}".format("gabr_ibrahim"))

I now wish to extract all of the words that appear in each of the 30 topics that the LDA model was able to create.

For each word in a topic, I will ensure that it has a frequency not equal to 0.

I will place all these words into a list and then wrap a Counter object around it!


I am doing this as I want to see the distribution of words that appear across all topics for a particular user. The LDA process will highlight key words that a particular user often uses in their twitter feed, across all topics that a particular user discusses. As such, the words they use will be indicative of the topics a twitter user talks about!

The counter object will simply keep a count of how many times, out of a maximum of 30 (topics) a word appears, given it has a frequency greater than 0. That is, the word appears in a topic.

In [125]:
from collections import Counter

In [182]:
# Collect every term listed, with a non-zero weight, among the top 100 terms of each topic,
# then count how many topics each term was listed in.
word_list = []

# iterate over however many topics the model holds instead of repeating the literal 30
for topic_id in range(lda.num_topics):
    for term, frequency in lda.show_topic(topic_id, topn=100): #returns top 100 words for a topic
        if frequency != 0:
            word_list.append(term)
temp = Counter(word_list)

In [183]:
len(temp)

224

In [184]:
# This can be done later to help filter the important words.
# Keep terms counted in at least 10 topics that are not in the NLTK-based stopword list.
important_words = []
for word, n_topics in temp.items():
    if n_topics < 10 or word in nltk_stopwords:
        continue
    # the term is added once per spaCy token of it that is neither a stopword nor two characters long
    important_words.extend(
        word for tok in nlp(word) if not tok.is_stop and len(tok) != 2
    )

In [185]:
important_words

['foreign',
 'refugee',
 'expert',
 'age',
 'russian',
 'woman',
 'clinton',
 'syria',
 'official',
 'talk',
 'share',
 'court',
 'london',
 'security',
 'turkey',
 'military',
 'ask',
 'record',
 'airport',
 'year',
 'find',
 'coup',
 'end',
 'video',
 'office',
 'today',
 'president',
 'republican',
 'know',
 'blast',
 'man',
 'government',
 'russia',
 'british',
 'political',
 'free',
 'week',
 'important',
 'work',
 'plan',
 'hillary',
 'parliament',
 'control',
 'news',
 'public',
 'syrian',
 'remember',
 'late',
 'second',
 'student',
 'tell',
 'read',
 'fuck',
 'scotland',
 'police',
 'love',
 'islamic',
 'support',
 'update',
 'think',
 'datum',
 'policy',
 'time',
 'create',
 'far',
 'machine',
 'hit',
 'post',
 'follow',
 'cnn',
 'order',
 'come',
 'new',
 'open',
 'country',
 'change',
 'good',
 'right',
 'night',
 'die',
 'national',
 'confirm',
 'turnout',
 'isis',
 'medium',
 'live',
 'long',
 'tax',
 'issue',
 'hostage',
 'stop',
 'ankara',
 'like',
 'future',
 'leave',


In [186]:
len(important_words)

182

I will then place this LDA Counter Object back into our dictionary!

We will then pickle this object - we will use it again for our TF-IDF analysis!

Be sure to look at the file called lda.py to see how I structured the code to run through the 2nd degree connections!

In [215]:
clean_gabr_tweets[0]['gabr_ibrahim'].keys()

dict_keys(['favorite_count', 'content', 'hashtags', 'retweet_count', 'tokenized_tweets'])

In [216]:
# Attach the per-term topic counts (the Counter named temp) under a new 'LDA' key.
clean_gabr_tweets[0]['gabr_ibrahim']['LDA'] = temp

In [217]:
# Persist the whole structure with the pickle_object helper.
# NOTE(review): the second argument is presumably the output file name — confirm against the helper.
pickle_object(clean_gabr_tweets, "gabr_ibrahim_tweets_LDA_Complete")