## Explanatory analysis of CNN's tweets over the year using LDA

Gathering/collecting the tweets using selenium and chromedriver

Reference - https://github.com/bpb27/twitter_scraping

In [None]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from time import sleep
import json
import datetime


# edit these three variables
user = 'CNN'
start = datetime.datetime(2017, 1, 3)  # year, month, day
end = datetime.datetime(2018, 3, 1)  # year, month, day

# only edit these if you're having problems
delay = 1  # time to wait on each page load before reading the page
driver = webdriver.Chrome()  # options are Chrome() Firefox() Safari()


# don't mess with this stuff
twitter_ids_filename = 'all_ids.json'
days = (end - start).days + 1
id_selector = '.time a.tweet-timestamp'
tweet_selector = 'li.js-stream-item'
user = user.lower()
ids = []

def format_day(date):
    day = '0' + str(date.day) if len(str(date.day)) == 1 else str(date.day)
    month = '0' + str(date.month) if len(str(date.month)) == 1 else str(date.month)
    year = str(date.year)
    return '-'.join([year, month, day])

def form_url(since, until):
    p1 = 'https://twitter.com/search?f=tweets&vertical=default&q=from%3A'
    p2 =  user + '%20since%3A' + since + '%20until%3A' + until + 'include%3Aretweets&src=typd'
    return p1 + p2

def increment_day(date, i):
    return date + datetime.timedelta(days=i)

for day in range(days):
    d1 = format_day(increment_day(start, 0))
    d2 = format_day(increment_day(start, 1))
    url = form_url(d1, d2)
    print(url)
    print(d1)
    driver.get(url)
    sleep(delay)

    try:
        found_tweets = driver.find_elements_by_css_selector(tweet_selector)
        increment = 10

        while len(found_tweets) >= increment:
            print('scrolling down to load more tweets')
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
            sleep(delay)
            found_tweets = driver.find_elements_by_css_selector(tweet_selector)
            increment += 10

        print('{} tweets found, {} total'.format(len(found_tweets), len(ids)))

        for tweet in found_tweets:
            try:
                id = tweet.find_element_by_css_selector(id_selector).get_attribute('href').split('/')[-1]
                ids.append(id)
            except StaleElementReferenceException as e:
                print('lost element reference', tweet)

    except NoSuchElementException:
        print('no tweets on this day')

    start = increment_day(start, 1)


try:
    with open(twitter_ids_filename) as f:
        all_ids = ids + json.load(f)
        data_to_write = list(set(all_ids))
        print('tweets found on this scrape: ', len(ids))
        print('total tweet count: ', len(data_to_write))
except FileNotFoundError:
    with open(twitter_ids_filename, 'w') as f:
        all_ids = ids
        data_to_write = list(set(all_ids))
        print('tweets found on this scrape: ', len(ids))
        print('total tweet count: ', len(data_to_write))

with open(twitter_ids_filename, 'w') as outfile:
    json.dump(data_to_write, outfile)

print('all done here')
driver.close()

https://twitter.com/search?f=tweets&vertical=default&q=from%3Acnn%20since%3A2017-01-03%20until%3A2017-01-04include%3Aretweets&src=typd
2017-01-03


The above block captures all the tweet ids which would be then used to get the tweet information

In [None]:
import tweepy
import csv
import time

Initialize the credentials for the tweepy API
Storing all the tweets data in the list tweets

In [None]:
CONSUMER_KEY = ''
CONSUMER_SECRET = ""
OAUTH_TOKEN = ""
OAUTH_TOKEN_SECRET = ""

auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(OAUTH_TOKEN, OAUTH_TOKEN_SECRET)
api = tweepy.API(auth)
tweets = []

Puts the processor to sleep after every 15 mins as the limit is 15 mins for the tweepy api

In [None]:
for id_no in ids:         # store the ids in this list
    #tweet = api.get_status(int(id_no))
    #tweets.append([tweet.id,tweet.created_at, tweet.text.encode("utf-8"),tweet.favorite_count,tweet.retweet_count])
    try:
        tweet = api.get_status(int(id_no))
        tweets.append([tweet.id,tweet.created_at, tweet.text.encode("utf-8"),tweet.favorite_count,tweet.retweet_count])
        # Insert into db
    except tweepy.TweepError:
        time.sleep(60 * 15)
        continue
    except StopIteration:
        break

Storing all the tweets information in a csv file

In [None]:
import pandas as pd
with open("tweetdata4_CNN.csv","w",encoding='utf-8') as f:
    writer = csv.writer(f, delimiter=",")
    writer.writerows(tweets)

In [2]:
import re
import pandas as pd
df6 = pd.read_csv('tweetdata4_CNN.csv',header=None,encoding = 'ISO-8859-1')

Adding a new column called subject which would be determined by using LDA

In [3]:
df6['subject'] = 0

In [4]:
df6.head()

Unnamed: 0,0,1,2,3,subject
0,2017-02-08 07:00:01,"b'In 2012 interviews, Steve Bannon compared pr...",168,140,0
1,2017-01-10 03:41:30,b'Sanders: We need to focus less on polls and ...,597,199,0
2,2018-01-12 20:00:19,"b""After Trump cited disapproval with London's ...",1640,497,0
3,2018-02-02 21:42:18,"b'""I think this is an absolute disgrace. I thi...",2463,973,0
4,2017-03-21 23:13:57,b'@lance_evans1 may we have permission to feat...,0,0,0


Example of a tweet - 'b\'police officer stands to lose job after he was
caught on dashcam footage telling a woman "we only kill black people\\x
e2\\x80\\xa6 https://t.co/i6youjpcsr\''

As we can see there are some gargabe values in every tweet which has to be removed

In [7]:
pattern = '(\\\\xe2\\\\\w{3}\\\\\w{3}|https://[a-zA-Z0-9./]+)'

In [8]:
tweets = []
for row_no in range(0,len(df6)):
    df6.iloc[row_no,1] = df6.iloc[row_no,1].lower()        # read the tweets in lower case
    x = re.sub(pattern,'',df6.iloc[row_no,1])              # substitute the garbage value with null
    tweets.append(re.findall(r'\w+',x[1:]))

Preparing the lemmatizer function

In [9]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn

def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hsoni\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsoni\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Preparing the text for the LDA function which does stopword removal,lemmatization and stemmatization

In [11]:
def prepare_text_for_lda(text):
    tokens = text
    tokens = [token for token in tokens if len(token) > 2]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [12]:
text_data = []
for tweet in tweets:
    tokens = prepare_text_for_lda(tweet)
    text_data.append(tokens)

In [13]:
len(text_data)

22431

In [14]:
import gensim



Performing the LDA 
Topics selected = 10 and passes = 20

In [15]:
from gensim import corpora
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]
import pickle
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [16]:
import gensim
NUM_TOPICS = 10
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=20)
ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=7)
for topic in topics:
    print(topic)

(0, '0.026*"north" + 0.025*"korea" + 0.018*"hurricane" + 0.017*"florida" + 0.014*"irma" + 0.013*"south" + 0.010*"harvey"')
(1, '0.010*"new" + 0.009*"say" + 0.008*"analysis" + 0.006*"election" + 0.006*"death" + 0.005*"state" + 0.005*"share"')
(2, '0.027*"trump" + 0.019*"say" + 0.014*"president" + 0.014*"republican" + 0.012*"rep" + 0.010*"democrat" + 0.010*"obamacare"')
(3, '0.107*"trump" + 0.070*"president" + 0.020*"donald" + 0.014*"ban" + 0.013*"travel" + 0.013*"say" + 0.010*"administration"')
(4, '0.048*"trump" + 0.027*"president" + 0.021*"say" + 0.015*"former" + 0.008*"clinton" + 0.008*"russia" + 0.007*"analysis"')
(5, '0.016*"health" + 0.011*"care" + 0.011*"paul" + 0.009*"2018" + 0.008*"ryan" + 0.007*"california" + 0.006*"speech"')
(6, '0.029*"new" + 0.012*"via" + 0.011*"000" + 0.011*"wall" + 0.009*"year" + 0.009*"border" + 0.009*"long"')
(7, '0.027*"year" + 0.016*"old" + 0.013*"die" + 0.011*"life" + 0.010*"world" + 0.009*"people" + 0.009*"first"')
(8, '0.019*"say" + 0.012*"first" +

Example of the weights assigned to each of the topic

In [17]:
new_doc = prepare_text_for_lda(tweets[4])
new_doc_bow = dictionary.doc2bow(new_doc)
#print(new_doc_bow)
print(ldamodel.get_document_topics(new_doc_bow))

[(0, 0.014286679), (1, 0.01428697), (2, 0.8714238), (3, 0.014285952), (4, 0.014285869), (5, 0.014286297), (6, 0.014287031), (7, 0.014285715), (8, 0.014285975), (9, 0.014285715)]


In [18]:
max(ldamodel.get_document_topics(new_doc_bow),key=lambda item:item[1])[0]   # gives the max value amongst the tuples

2

Assigning topic number to each tweet

In [180]:
for row_no in range(0,len(df6)):
    new_doc = prepare_text_for_lda(tweets[row_no])
    new_doc_bow = dictionary.doc2bow(new_doc)
    df6.iloc[row_no,4] = max(ldamodel.get_document_topics(new_doc_bow),key=lambda item:item[1])[0]

In [181]:
df6.head()

Unnamed: 0,0,1,2,3,cleaned
0,2017-02-08 07:00:01,"b'in 2012 interviews, steve bannon compared pr...",168,140,9
1,2017-01-10 03:41:30,b'sanders: we need to focus less on polls and ...,597,199,7
2,2018-01-12 20:00:19,"b""after trump cited disapproval with london's ...",1640,497,2
3,2018-02-02 21:42:18,"b'""i think this is an absolute disgrace. i thi...",2463,973,7
4,2017-03-21 23:13:57,b'@lance_evans1 may we have permission to feat...,0,0,9


In [182]:
df6.groupby(['cleaned'])[2].mean()

cleaned
0    653.043818
1    773.872960
2    796.519191
3    821.427492
4    896.323757
5    667.778654
6    951.970683
7    635.010243
8    779.994447
9    684.984694
Name: 2, dtype: float64

Finding the time setting

In [183]:
df6['time_setting'] = 0

In [184]:
df6.head()

Unnamed: 0,0,1,2,3,cleaned,time_setting
0,2017-02-08 07:00:01,"b'in 2012 interviews, steve bannon compared pr...",168,140,9,0
1,2017-01-10 03:41:30,b'sanders: we need to focus less on polls and ...,597,199,7,0
2,2018-01-12 20:00:19,"b""after trump cited disapproval with london's ...",1640,497,2,0
3,2018-02-02 21:42:18,"b'""i think this is an absolute disgrace. i thi...",2463,973,7,0
4,2017-03-21 23:13:57,b'@lance_evans1 may we have permission to feat...,0,0,9,0


In [185]:
list(df6)

[0, 1, 2, 3, 'cleaned', 'time_setting']

In [186]:
for row_no in range(0,len(df6)):
    if int(df6.iloc[row_no,0].split()[1][0:2]) in range(8,12):
        df6.iloc[row_no,5] = 'morning'
    elif int(df6.iloc[row_no,0].split()[1][0:2]) in range(12,16):
        df6.iloc[row_no,5] = 'noon'
    elif int(df6.iloc[row_no,0].split()[1][0:2]) in range(16,20):
        df6.iloc[row_no,5] = 'evening'
    elif int(df6.iloc[row_no,0].split()[1][0:2]) in range(20,24):
        df6.iloc[row_no,5] = 'night'
    else:
        df6.iloc[row_no,5] = 'late_night'

In [187]:
df6.groupby(['time_setting'])[2].mean()

time_setting
evening       876.691026
late_night    824.358207
morning       514.733969
night         728.775941
noon          913.090755
Name: 2, dtype: float64

In [188]:
df6.groupby(['time_setting'])[3].mean()

time_setting
evening       418.363783
late_night    352.810618
morning       237.887405
night         335.206525
noon          404.830003
Name: 3, dtype: float64

In [189]:
df6['time_setting'].value_counts()

night         6958
late_night    6178
evening       3722
noon          2953
morning       2620
Name: time_setting, dtype: int64

Finding if the tweet mentions the word trump

In [190]:
df6['mentions_trump'] = 0

In [191]:
for row_no in range(0,len(df6)):
    if 'trump' in tweets[row_no]:
        df6.iloc[row_no,6] = 'yes'
    else:
        df6.iloc[row_no,6] = 'no'

In [192]:
df6.groupby(['mentions_trump'])[2].mean()

mentions_trump
no     757.095317
yes    855.686278
Name: 2, dtype: float64