In [3]:
# import required libraries

import tweepy as tw
import pandas as pd
import numpy as np
np.random.seed(2018)

%matplotlib inline
import matplotlib.pyplot as plt; plt.rcdefaults()

import warnings
warnings.filterwarnings('ignore')

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models

import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('wordnet')

import re
import functools


# Install Langdetect before running this code. Restart the kernel and run it again
# !pip install langdetect

from langdetect import DetectorFactory
DetectorFactory.seed = 0
from langdetect import detect

# ------------------------------------------- Tweets and LDA modelling -----------------------------------------------

# Twitter API credentials are read from environment variables so that secrets
# are never stored in the notebook itself.
# NOTE(review): keys that were previously committed in this file should be
# treated as leaked and revoked in the Twitter developer portal.
import os


def _require_env(name):
    """Return the value of environment variable *name*.

    Raises:
        RuntimeError: if the variable is unset or empty.
    """
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(
            'Missing Twitter credential: set the {} environment variable.'.format(name)
        )
    return value


CONSUMER_API_KEY = _require_env('TWITTER_CONSUMER_API_KEY')
CONSUMER_API_SECRET = _require_env('TWITTER_CONSUMER_API_SECRET')
# The bearer token is not used by the OAuth handler below, so it is optional.
BEARER_TOKEN = os.environ.get('TWITTER_BEARER_TOKEN', '')
ACCESS_TOKEN = _require_env('TWITTER_ACCESS_TOKEN')
ACCESS_TOKEN_SECRET = _require_env('TWITTER_ACCESS_TOKEN_SECRET')

# Create an authentication object from the consumer key pair
auth = tw.OAuthHandler(CONSUMER_API_KEY, CONSUMER_API_SECRET)

# Attach the user access token and secret
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

# Create the API object passing the auth object
api = tw.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)


def _is_english(text):
    """Return True when langdetect classifies *text* as English."""
    from langdetect.lang_detect_exception import LangDetectException

    try:
        return detect(text) == 'en'
    except LangDetectException:
        # langdetect raises when the text has no usable features
        # (for example a tweet that is only a link); skip such tweets.
        return False


def _clean_text(text_series, stop_words):
    """Normalise raw tweet text and drop stop words.

    Lower-cases the text, blanks a leading ``@``, URLs, every character that
    is not ``0-9``, ``a-z``, space or tab, words of three characters or fewer
    and stand-alone numbers, then removes the words found in *stop_words*.
    """
    # regex=True is passed explicitly: the pandas default for
    # Series.str.replace changed to literal matching in pandas 2.0.
    cleaned = (
        text_series.str.lower()
        .str.replace(r'^@', ' ', regex=True)
        .str.replace(r'http\S+', ' ', regex=True)
        .str.replace(r'[^0-9a-z \t]', ' ', regex=True)
        .str.replace(r'\b\w{1,3}\b', ' ', regex=True)
        .str.replace(r'\b\d+\b', ' ', regex=True)
    )
    return cleaned.apply(
        lambda text: ' '.join(word for word in text.split() if word not in stop_words)
    )


def _topic_terms(model, num_topics, num_words=10):
    """Return ``[(topic_id, [(weight, word), ...]), ...]`` for an LDA *model*.

    Weights are strings with three decimals, the same representation that the
    model's ``print_topics`` output uses.
    """
    return [
        (
            topic_id,
            [('%.3f' % prob, word)
             for word, prob in model.show_topic(topic_id, topn=num_words)],
        )
        for topic_id in range(num_topics)
    ]


def get_tweets(user_id):
    """Download a user's recent tweets and fit LDA topic models on them.

    Args:
        user_id: Twitter screen name whose timeline is fetched.

    Returns:
        A tuple ``(df, bow_topic_list, tfidf_topic_list)``:

        * ``df`` - DataFrame with columns ``id``, ``date``, ``text`` and
          ``cleaned_text``, one row per distinct English tweet.
        * ``bow_topic_list`` - topics of the bag-of-words LDA model as
          ``[(topic_id, [(weight, word), ...]), ...]``.
        * ``tfidf_topic_list`` - same structure for the TF-IDF LDA model.

    Raises:
        ValueError: if no usable words remain after cleaning, so no topic
            model can be trained.
    """
    tweets = api.user_timeline(
        screen_name=user_id,
        count=200,  # 200 is the maximum allowed count
        include_rts=False,
        tweet_mode="extended",
    )

    # Keep only English tweets
    english = [tweet for tweet in tweets if _is_english(tweet.full_text)]

    print('\n{} Tweets collected from @{}...\n'.format(len(english), user_id))
    df = pd.DataFrame({
        'id': [tweet.id for tweet in english],
        'date': [tweet.created_at for tweet in english],
        'text': [tweet.full_text for tweet in english],
    })
    df.drop_duplicates(subset='text', inplace=True)

    # ------------------------------------------- Data cleaning -----------------------------------------------

    stop_words = set(stopwords.words('english')) | {'rt', 'rts', 'retweet'}
    df['cleaned_text'] = _clean_text(df['text'], stop_words)
    # Different tweets can collapse to the same cleaned text; keep one of each.
    df.drop_duplicates(subset='cleaned_text', inplace=True)
    df.reset_index(drop=True, inplace=True)

    # ------------------------------------------- Data pre-processing -----------------------------------------------

    stemmer = SnowballStemmer('english')
    lemmatizer = WordNetLemmatizer()

    def preprocess(text):
        # Tokenise, drop gensim stop words and short tokens, then lemmatise
        # (as a verb) and stem each remaining token.
        return [
            stemmer.stem(lemmatizer.lemmatize(token, pos='v'))
            for token in gensim.utils.simple_preprocess(text)
            if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3
        ]

    processed_docs = df['cleaned_text'].map(preprocess)
    dictionary = gensim.corpora.Dictionary(processed_docs)
    if len(dictionary) == 0:
        raise ValueError(
            'No usable English text found for @{}; cannot build topics.'.format(user_id)
        )
    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

    tfidf = models.TfidfModel(bow_corpus)
    corpus_tfidf = tfidf[bow_corpus]

    print('Creating LDA models...\n')

    number_of_topics = 10

    # Bag of words (obs: not used in this project)
    lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=number_of_topics,
                                           id2word=dictionary, passes=2, workers=2)

    # TFIDF
    lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=number_of_topics,
                                                 id2word=dictionary, passes=2, workers=4)

    bow_topic_list = _topic_terms(lda_model, number_of_topics)
    tfidf_topic_list = _topic_terms(lda_model_tfidf, number_of_topics)

    return df, bow_topic_list, tfidf_topic_list


# ------------------------------------------- Layout -----------------------------------------------

from ipywidgets.widgets import Label, FloatProgress, FloatSlider, Button, Text, Tab, Output
from ipywidgets.widgets import Layout, HBox, VBox
from IPython.display import display, clear_output
import time

def get_lda(df, bow_topic_list, tfidf_topic_list):
    """Build the grid of clickable topic-word buttons.

    One row is created per entry of *tfidf_topic_list*, holding a "Topic N"
    label followed by one button per word of that topic. Clicking a button
    redraws the controls and prints every tweet whose cleaned text contains
    the word.

    Args:
        df: DataFrame with ``id``, ``date``, ``text`` and ``cleaned_text``
            columns.
        bow_topic_list: accepted for interface compatibility; not used here.
        tfidf_topic_list: ``[(topic_id, [(weight, word), ...]), ...]``.

    Returns:
        The ``VBox`` widget containing all topic rows.
    """
    button_style = 'warning'
    layout_width = '100px'
    custom_color = [
                    ['Navy', 'DarkBlue', 'MediumBlue', 'Blue', 'RoyalBlue',
                    'DodgerBlue', 'DeepSkyBlue', 'SkyBlue', 'PowderBlue', 'LightBlue' ],
                    ['Maroon', '#C62D42', '#CC3336', '#D92121', '#ED0A3F',
                    '#FD0E35', '#D92121', '#FF5349', '#FE4C40', '#FE6F5E' ],
                    ['#E77200', '#FF681F', '#FF7F49', '#FF8833', '#FFAE42',
                    '#F2BA49', '#F2C649', '#F8D568', '#FCD667', '#FED85D' ],
                    ['#00755E', '#5E8C31', '#4D8C57', '#7BA05B', '#5FA777',
                    '#3AA655', '#63B76C', '#00CC99', '#33CC99', '#93DFB8' ],
                    ['#3F26BF', '#6456B7', '#766EC8', '#7070CC', '#ACACE6',
                    '#9999CC', '#8C90C8', '#9999CC', '#9999CC', '#9999CC' ],
                    ['#A50B5E', '#E30B5C', '#BB3385', '#DA3287', '#C8509B',
                    '#FC80A5', '#F091A9', '#FF91A4', '#FFA6C9', '#FFB7D5' ]
                    ]

    # Printing Tweets according to the word selected in a topic
    def return_tweets(topic_index, word_index):
        word_search = tfidf_topic_list[topic_index][1][word_index][1]
        print('Word: {}\n'.format(word_search))
        rows = zip(df['id'], df['date'], df['text'], df['cleaned_text'])
        for tw_id, tw_date, tw_text, cleaned in rows:
            # Substring match against the cleaned text of each tweet.
            if word_search in cleaned:
                print("ID: {}".format(tw_id))
                print('Date: {}'.format(tw_date))
                print('Tweet: \n {}\n'.format(tw_text))

    def restart():
        # Redraw the controls and the button grid before printing new results.
        clear_output()
        display(user_id_input)
        display(start_bt)
        display(box)
        display(clear_out)

    def button_top(b, topic_index, word_index):
        restart()
        return_tweets(topic_index, word_index)

    # Creating one row of buttons per topic, one button per word
    hbox_list = []
    for topic, (_, terms) in enumerate(tfidf_topic_list):
        # Palettes are reused cyclically once there are more topics than palettes.
        palette = custom_color[topic % len(custom_color)]

        buttons_list = [
            Label('Topic {}'.format(topic + 1), layout=Layout(width='50px', height='20px'))
        ]
        for word_index, (_, word_item) in enumerate(terms):
            button = Button(description=word_item, button_style=button_style,
                            layout=Layout(width=layout_width))
            button.style.button_color = palette[word_index % len(palette)]
            button.on_click(
                functools.partial(button_top, topic_index=topic, word_index=word_index)
            )
            buttons_list.append(button)

        hbox_list.append(HBox(children=buttons_list, layout=Layout(margin='0 0 0 43px')))

    box = VBox(
        children=hbox_list,
        layout=Layout(border='solid 3px gray', width='1200px'),
        )

    return box


# Shared appearance of the two top-level control buttons.
button_style = 'warning'
layout_width = '150px'

# Instruction label and the text field that receives the Twitter user id.
description = Label(
    'Fill in the box with a valid Twitter user id and hit the "Start" button.',
    layout=Layout(width='500px', height='20px'),
)
user_id_input = Text()

# Control buttons; both are painted blue on top of the shared style.
start_bt = Button(
    description='Start',
    button_style=button_style,
    layout=Layout(width=layout_width),
)
clear_out = Button(
    description=' Clear / Restart ',
    button_style=button_style,
    layout=Layout(width=layout_width),
)
start_bt.style.button_color = 'blue'
clear_out.style.button_color = 'blue'



def clear(b):
    """Button callback that redraws the initial screen.

    The clicked widget ``b`` is not used.
    """
    start_process()

def start(b):
    """Button callback that runs the full analysis for the entered user.

    Reads the user id from ``user_id_input``. When it is blank, a message is
    shown and the initial screen is restored. Otherwise the tweets are
    fetched and modelled, two bar charts of per-topic weighted sums are drawn
    in a tab widget, and the topic-word button grid is displayed.

    Args:
        b: the clicked button (not used).
    """
    clear_output()
    display(user_id_input)
    display(start_bt)

    # Surrounding whitespace is ignored so a blank-looking entry is rejected.
    user_id = user_id_input.value.strip()
    if not user_id:
        print('Please, provide an valid user ID.')
        time.sleep(2)
        start_process()
        return

    print('Starting process...\n')
    df, bow_topic_list, tfidf_topic_list = get_tweets(user_id)
    box = get_lda(df, bow_topic_list, tfidf_topic_list)
    print('Process finished!\n')
    time.sleep(2)
    clear_output()
    display(description)
    display(user_id_input)
    display(start_bt)

    # -------------- Graphs plotting ----------------------
    # Score of a topic = sum of the weights of its words.
    count_list = [
        round(sum(round(float(weight), 4) for weight, _ in terms), 3)
        for _, terms in tfidf_topic_list
    ]

    # Labels follow the number of topics actually returned.
    topics_graph = ['Topic {}'.format(i + 1) for i in range(len(count_list))]
    topic_scores = pd.DataFrame({'topics': topics_graph, 'count': count_list})
    topic_scores.sort_values(by='count', ascending=False, inplace=True)

    top5_topics = topic_scores['topics'].iloc[:5].values
    top5_count = topic_scores['count'].iloc[:5].values
    y_pos = np.arange(len(top5_topics))

    all_topics = topic_scores['topics'].values
    all_count = topic_scores['count'].values
    all_y_pos = np.arange(len(all_topics))

    out1 = Output()
    out2 = Output()

    tab = Tab(children=[out1, out2])
    tab.set_title(0, 'Top-5 topics')
    tab.set_title(1, 'All topics')
    display(tab)

    with out1:
        # Explicitly sized figure for the first tab.
        plt.figure(figsize=(5, 4))
        plt.bar(y_pos, top5_count, align='center', alpha=0.5)
        plt.xticks(y_pos, tuple(top5_topics))
        plt.xlabel('Topics')
        plt.ylabel('Weighted sum')
        plt.title('Top-5 Topics')
        plt.show()

    with out2:
        # No explicit figure here: pyplot opens one at its default size.
        plt.bar(all_y_pos, all_count, align='center', alpha=0.5)
        plt.xticks(all_y_pos, tuple(all_topics))
        plt.xlabel('Topics')
        plt.xticks(rotation=45)
        plt.ylabel('Weighted sum')
        plt.title('All topics')
        plt.show()

    display(box)
    display(clear_out)

def start_process():
    """Clear the output area and show the initial controls.

    Displays, in order, the instruction label, the user id field, the
    "Start" button and the "Clear / Restart" button.
    """
    clear_output()
    for widget in (description, user_id_input, start_bt, clear_out):
        display(widget)

# Register the click handlers of both control buttons.
clear_out.on_click(clear)
start_bt.on_click(start)

# Draw the initial screen once the handlers are in place.
start_process()

Text(value='nytimes')



VBox(children=(HBox(children=(Label(value='Topic 1', layout=Layout(height='20px', width='50px')), Button(butto…



Word: women

ID: 1423290262609031179
Date: 2021-08-05 14:30:16
Tweet: 
 The 50-kilometer racewalking event will stride off for the final time at an Olympics in Tokyo.

Officials decided that the event, the only one without an approximate equivalent for women, does not fit with the IOC's mission of gender equality.
https://t.co/c93pym9MS9

ID: 1423287696705736704
Date: 2021-08-05 14:20:04
Tweet: 
 Nevin Harrison was the only American canoe or kayak sprinter to qualify for #Tokyo2020. She won a gold medal in the women’s canoe single 200 meters.

“I didn’t have anyone to follow,” she said. “I hope to be that person for the next generation.”
https://t.co/56SeSbujoJ

ID: 1423247420121620481
Date: 2021-08-05 11:40:02
Tweet: 
 A report detailing Gov. Andrew Cuomo’s sexual harassment of 11 women was worse than what people around him — and his critics — had expected.

What happens next? Listen to today’s episode of The Daily. https://t.co/lNTtZy93cB

ID: 1423238623579750400
Date: 2021-08-05 11: