# Assignment 3 Visual Network Analysis of Twitter Hashtags

### Objective: 

Build an interactive app that allows a user to input a hashtag and returns a network visualization of that hashtag's connections to other hashtags. 

In [2]:
import os
import tweepy as tw
import pandas as pd
import re
import sys
from datetime import datetime, timedelta
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import itertools
import networkx as nx
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly
import functions
from textblob import TextBlob, Word, Blobber
from textblob.classifiers import NaiveBayesClassifier
from textblob.taggers import NLTKTagger
from notebook_functions import *
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud, STOPWORDS

ModuleNotFoundError: No module named 'networkx'

In [None]:
# Pandas display options: limit how many columns/rows are shown when a frame
# is rendered in the notebook.
# The fully-qualified "display." names are used on purpose: a bare
# 'max_columns' / 'max_rows' is treated as a pattern over all registered
# options and raises OptionError as soon as more than one option matches it.
pd.set_option('display.max_columns', 25)
pd.set_option('display.max_rows', 10)

In [None]:
# Runs notebook with stored API keys
# NOTE(review): %run is an IPython magic, so this cell only works inside a
# notebook/IPython session. Presumably keys.ipynb defines the credentials
# that initialize() consumes later — confirm against that notebook.
%run ./keys.ipynb

In [None]:
%matplotlib inline
from pylab import rcParams
rcParams['figure.figsize'] = 12, 8
rcParams['font.size'] = 20
rcParams['axes.facecolor'] = 'white'

In [None]:
# Authentication to access the Twitter API
# NOTE(review): initialize() is not defined in this notebook; presumably it is
# provided by the `from notebook_functions import *` import and returns the
# API client used by the search cell below — confirm.
api = initialize()

After importing all necessary libraries and completing the authentication process to access the Twitter API, prompt the user for the hashtag they wish to explore. 

In [None]:
# User input
# Desired number of tweets to analyse for the query.
no_of_tweets = 100
# Blocks until the user types a search term; the raw string is used as the
# search query and in the figure title further down.
query = input("Please enter keyword or hashtag to search: ")

In [None]:
# Empty accumulators filled by the tweet-collection cell below.
# node_df: one row per hashtag occurrence with the polarity of its tweet.
node_df = pd.DataFrame(columns = ["tag","sentiment"])
# edge_df: one row per pair of hashtags that appear together in a tweet.
edge_df = pd.DataFrame(columns = ["tag","associated_tag"])
# place_df: tweet text alongside the author's profile location.
place_df = pd.DataFrame(columns = ["tweet", "place"])
# sentiments: list of per-tweet dicts (text, date, TextBlob polarity).
sentiments = []

Iterate through the JSON results returned by the search and save the necessary information to data frames.

In [None]:
# Get tweets: page lazily through the search results for the query. The limit
# comes from the user-input cell (no_of_tweets) rather than a second literal.
public_tweets = tw.Cursor(
    api.search,
    q=query,
    show_user=True,
    tweet_mode="extended",
).items(no_of_tweets)


def _append_rows(frame, rows):
    """Return *frame* extended with *rows*, a list of dicts keyed by its columns.

    Rows are added in one batch because DataFrame.append was removed in
    pandas 2.0 and copied the whole frame on every call.
    """
    if not rows:
        return frame
    addition = pd.DataFrame(rows, columns=frame.columns)
    if frame.empty:
        return addition
    return pd.concat([frame, addition], ignore_index=True)


# Save tweets into data frames. Rows are buffered in plain lists and flushed
# once; the `finally` keeps whatever was collected if the API call fails
# part-way through (e.g. a rate limit), as the row-by-row version did.
place_rows, node_rows, edge_rows = [], [], []
try:
    for tweet in public_tweets:
        # Polarity is a property of the whole tweet, so compute it once.
        polarity = TextBlob(tweet.full_text).polarity
        place_rows.append({"tweet": tweet.full_text, "place": tweet.user.location})
        sentiments.append({
            'text': tweet.full_text,
            'date': tweet.created_at,
            'sentiment_TB': polarity,
        })
        try:
            # .get() returns None when the 'hashtags' key is absent; treat
            # that the same as a tweet without hashtags.
            tags = [tag["text"] for tag in tweet.entities.get('hashtags') or []]
        except (AttributeError, KeyError, TypeError) as inst:
            # Best effort: report the malformed tweet and keep going.
            print(type(inst))    # the exception instance
            print(inst.args)     # arguments stored in .args
            print(inst)
            continue
        node_rows.extend({"tag": tag, "sentiment": polarity} for tag in tags)
        # Every unordered pair of hashtags in the same tweet becomes an edge.
        edge_rows.extend(
            {"tag": first, "associated_tag": second}
            for first, second in itertools.combinations(tags, 2)
        )
finally:
    place_df = _append_rows(place_df, place_rows)
    node_df = _append_rows(node_df, node_rows)
    edge_df = _append_rows(edge_df, edge_rows)

In [None]:
# Turn the per-tweet dicts collected in `sentiments` into a frame
# (columns follow the dict keys: text, date, sentiment_TB).
tweets_df = pd.DataFrame.from_dict(sentiments)
# NOTE(review): clean_tweets is not defined in this notebook; presumably it
# comes from notebook_functions and returns the cleaned text column — confirm.
tweets_df['text'] = clean_tweets(tweets_df['text'])
# Bare expression so the notebook displays the frame.
tweets_df

Using VADER, analyze the tweets, save the results to a data frame, and compare them with the TextBlob polarity results.

In [None]:
# Score every cleaned tweet with VADER. The analyzer is created once and each
# text is scored once; the four values are read from that single result
# instead of building a new analyzer and rescoring for every field.
vader_analyzer = SentimentIntensityAnalyzer()
scores = []
for tweet_text in tweets_df['text']:
    polarity = vader_analyzer.polarity_scores(tweet_text)
    scores.append({
        'compound': polarity['compound'],
        'positive': polarity['pos'],
        'neutral': polarity['neu'],
        'negative': polarity['neg'],
    })


In [None]:
# Label every tweet by passing its text to analyze() and store the results
# as one column assignment. This avoids per-cell .at writes inside iterrows()
# and guarantees the "analysis" column exists even when tweets_df is empty.
# NOTE(review): analyze() is defined outside this notebook (presumably in
# notebook_functions); it is called with the tweet text exactly as before.
tweets_df["analysis"] = [analyze(tweet_text) for tweet_text in tweets_df["text"]]
tweets_df

In [None]:
# Build a frame from `scores` (the list of per-tweet score dicts) and join it
# onto tweets_df. DataFrame.join aligns on the index, so this relies on both
# frames having been built in the same tweet order.
scores_df = pd.DataFrame.from_dict(scores)
combo_df = tweets_df.join(scores_df)
# Bare expression so the notebook displays the combined frame.
combo_df

In [None]:
# Per-hashtag aggregates of node_df: how often each tag occurs and the mean
# of its sentiment values.
node_count = node_df.groupby(["tag"]).count()
node_sent = node_df.groupby(["tag"]).mean()

# Map each tag to the value in the first (only) aggregated column, i.e. its
# occurrence count.
node_with_count_dict = dict(
    zip(node_count.index, node_count.iloc[:, 0].to_numpy())
)

Analyze the sentiment for a certain hashtag

In [None]:
# Mean sentiment per hashtag; display only the tags whose mean is not
# exactly zero.
avg_sentiment = node_df.groupby('tag').mean()
avg_sentiment.loc[avg_sentiment["sentiment"].ne(0)]

Find the top 5 locations the tweets came from

In [None]:
# Bare expression so the notebook displays the tweet/location frame.
place_df

In [None]:
# Treat empty location strings as missing. The result is assigned back to the
# column: calling replace(..., inplace=True) on a column selection operates on
# an intermediate object and is not guaranteed to update place_df itself.
place_df["place"] = place_df["place"].replace("", np.nan)

# Count tweets per location and keep the five most frequent ones
# (head() defaults to 5 rows).
x = (
    place_df.dropna()
    .groupby("place")
    .count()
    .sort_values(by="tweet", ascending=False)
    .head()
)
x

In [None]:
# Map each tag to the value in the first (only) column of node_sent, i.e. its
# mean sentiment.
node_with_sent_dict = dict(
    zip(node_sent.index, node_sent.iloc[:, 0].to_numpy())
)

# Number of times each (tag, associated_tag) pair was recorded.
edge_count = edge_df.groupby(['tag', 'associated_tag']).size()

In [None]:
# Undirected hashtag graph: one node per tag carrying its occurrence count and
# mean sentiment, one edge per co-occurring pair recorded in edge_df
# (repeated pairs collapse into a single edge in nx.Graph).
G = nx.Graph()
G.add_nodes_from(
    (tag, {"count": occurrences, "sentiment": node_with_sent_dict[tag]})
    for tag, occurrences in node_with_count_dict.items()
)
G.add_edges_from(zip(edge_df["tag"], edge_df["associated_tag"]))

In [None]:
# Compute a force-directed layout and store each node's coordinates on the
# node itself under the 'pos' attribute, where the plotting cell reads them.
pos = nx.spring_layout(G, k=0.1, iterations=50)
nx.set_node_attributes(G, pos, 'pos')

In [None]:
# Edge trace: a single line trace in which consecutive edges are separated by
# None so that no segment is drawn between unrelated edges. Coordinates are
# collected in lists first and handed to the trace once, instead of growing
# the trace's tuples with += on every edge.
edge_x, edge_y = [], []
for source, target in G.edges():
    x0, y0 = G.nodes[source]['pos']
    x1, y1 = G.nodes[target]['pos']
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines')

# Node trace: marker colour encodes the tag's mean sentiment, marker size its
# occurrence count (offset by 15 so rare tags stay visible). Loop variables
# are named so they do not overwrite the notebook-level `x`.
node_x, node_y, node_text, node_color, node_size = [], [], [], [], []
for node, attrs in G.nodes(data=True):
    x_pos, y_pos = attrs['pos']
    node_x.append(x_pos)
    node_y.append(y_pos)
    node_color.append(attrs['sentiment'])
    node_size.append(15 + attrs['count'])
    node_text.append(
        f"<b>{node} count is {attrs['count']}, "
        f"avg sentiment is {attrs['sentiment']}</b>"
    )

node_trace = go.Scatter(
    x=node_x,
    y=node_y,
    text=node_text,
    mode='markers',
    hoverinfo='text',
    marker=dict(
        showscale=True,
        colorscale='Viridis',
        reversescale=True,
        color=node_color,
        size=node_size,
        colorbar=dict(
            thickness=10,
            # The colour scale shows sentiment, so it is labelled as such.
            # title.side replaces the legacy `titleside` property.
            title=dict(text='Average Sentiment', side='right'),
            xanchor='left'
        ),
        line=dict(width=0)))

In [None]:
# Assemble the network figure from the edge and node traces. Axes are stripped
# of grid, zero line and tick labels because layout coordinates carry no
# meaning. The title font is set through title.font, which replaces the
# legacy `titlefont` layout property.
fig = go.Figure(
    data=[edge_trace, node_trace],
    layout=go.Layout(
        title=dict(
            text='<br>Twitter hashtags of {}'.format(query),
            font=dict(size=16),
        ),
        showlegend=False,
        hovermode='closest',
        margin=dict(b=20, l=5, r=5, t=40),
        annotations=[dict(
            text="Tweets",
            showarrow=False,
            xref="paper", yref="paper")],
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)))

fig.show()

In [None]:
# fig = tweets_df.analysis.hist(bins=5)
# plt.title("Tweets Sentiment", alpha=0.5)
# plt.grid(False)
# plt.xlabel('Tweets', alpha=0.5)
# plt.ylabel("Percentage, %", alpha=0.5)

In [None]:
# Histogram of tweets per sentiment label.
# The labels get their own name: rebinding `sentiments` here would replace the
# list that the tweet-collection cell appends to with an ndarray and break
# that cell on re-run.
sentiment_labels = tweets_df.analysis.unique()
label_groups = [
    tweets_df.loc[tweets_df.analysis == label, 'analysis']
    for label in sentiment_labels
]
# Weight every tweet by 100/N so the bars show the share of all tweets,
# matching the "Percentage, %" axis label (unweighted bars are raw counts).
total_tweets = len(tweets_df)
percent_weights = [
    np.full(len(group), 100.0 / total_tweets) for group in label_groups
]
plt.hist(label_groups, bins=3, weights=percent_weights)
plt.xlabel('Tweets', alpha=0.5)
plt.ylabel("Percentage, %", alpha=0.5)
plt.title("Tweets sentiment", alpha=0.6)
plt.legend(sentiment_labels)

In [None]:
def word_cloud(wd_list):
    """Draw a word cloud from the strings in *wd_list*.

    All strings are joined with single spaces into one text, the wordcloud
    package's STOPWORDS are excluded, and the resulting image is shown on a
    new matplotlib figure with the axes hidden.
    """
    corpus = ' '.join(wd_list)
    cloud_options = dict(
        background_color='white',
        stopwords=set(STOPWORDS),
        width=1600,
        height=800,
        random_state=1,
        colormap='jet',
        max_words=80,
        max_font_size=200,
    )
    cloud = WordCloud(**cloud_options).generate(corpus)

    plt.figure(figsize=(12, 10))
    plt.axis('off')
    plt.imshow(cloud, interpolation="bilinear")


word_cloud(tweets_df['text'])

# Summary and Key Findings

We found the dashboard to have both strengths and limitations, in terms of being a useful analytical tool. These are summarised below.

## Strengths

* The network graph is an effective tool for displaying linkages between different hashtags and groups of hashtags. Encoding of additional features using size and color allows the user to quickly determine the significance of a hashtag within the broader context.
* Interactive features assist the user in exploring the data.
* Through analysis of the network graph and statistical displays, it is possible to determine general sentiment associated with a search topic.
* The dashboard could be used to determine which hashtags a twitter user could add to a tweet in order to increase engagement on a particular topic.
* The dashboard could be used to determine a group's support for one topic, based on their attitude and interest toward another topic. Without the network visualisation, it may not be obvious that these two topics are somehow related. This understanding would be useful in a variety of social, business or political contexts.

## Limitations

* The average sentiment of tweets associated with a given hashtag tends to be close to 0. This could be a problem when a hashtag is polarising: the average of the highly positive and highly negative sentiments is neutral, which is not an accurate representation.
* When displaying data associated with a large number of tweets (>1000 approx.), the visual tool becomes rather overwhelming and difficult to make sense of. Having the ability to filter hashtags that have appeared less than a certain number of times in a search could be a potential solution for effectively visualising larger amounts of data. 
* Some tweets included many hashtags that are related to a similar topic. Relationships between these hashtags tend to clutter the display, rather than drawing links between disparate ideas (as we had hypothesised).
* Accurate location information is not available for all tweets, limiting the effectiveness of location based analysis.