# Text Analytics | BAIS:6100
# Module 11: Keyword Network Analysis (Exercises)

Instructor: Kang-Pyo Lee 

Twitter hashtag options:
- ai
- bitcoin
- blacklivesmatter
- bts
- covid19
- fakenews
- innovation
- mentalhealth
- metoo
- startup

Choose a Twitter hashtag you're interested in and save it in the `hashtag` variable below.

In [None]:
# Your answer here
# Pick one of the hashtag options listed above. The value is interpolated into the
# tweet file names ("classdata/tweets/tweets_<hashtag>_<month>.csv") read in the loading cell.
hashtag = "mentalhealth"

In [None]:
# Maximum number of tweets kept per monthly file; a file with more than N rows
# is randomly down-sampled to exactly N rows when it is loaded.
N = 500

In [None]:
import pandas as pd
pd.set_option('display.max_colwidth', 150)

# Monthly tweet files to load, listed from December 2020 back to January 2020.
months = ["202012", "202011", "202010", "202009", "202008", "202007",
          "202006", "202005", "202004", "202003", "202002", "202001"]

# Collect one (possibly down-sampled) frame per month and concatenate them once
# after the loop. Calling pd.concat on a growing DataFrame inside the loop would
# re-copy every previously loaded row on each iteration.
monthly_frames = []
for month in months:
    # Tab-separated file; quoting=3 is csv.QUOTE_NONE, so quote characters in the
    # tweets are treated as ordinary text rather than field delimiters.
    dftmp = pd.read_csv("classdata/tweets/tweets_{}_{}.csv".format(hashtag, month), sep="\t", quoting=3)

    ##############################################
    # Create a random sample of N rows.
    ##############################################
    # No random_state is passed, so the sample differs from run to run.
    if len(dftmp) > N:
        dftmp = dftmp.sample(n=N)
    ##############################################

    monthly_frames.append(dftmp)
    print("{}: {:,}".format(month, len(dftmp)))

df = pd.concat(monthly_frames)
print("Total number of tweets in df: {:,}\n".format(len(df)))

# Force both columns to plain strings so that missing values (NaN) cannot break
# the tokenizer applied to `text` later on.
df["user_name"] = df["user_name"].astype(str)
df["text"] = df["text"].astype(str)

# Keep the first occurrence of each distinct tweet text, then renumber the rows 0..len(df)-1.
df = df.drop_duplicates(["text"]).reset_index(drop=True)

df

## Step 1: Calculate the frequencies of keywords and the co-occurrence frequencies among keywords

In [None]:
import nltk

# Split every tweet into a list of tokens with NLTK's word tokenizer and keep
# the result in a new "words" column.
df["words"] = df["text"].apply(nltk.word_tokenize)

# Show the raw text next to its token list for a quick sanity check.
df[["text", "words"]]

In [None]:
from nltk.corpus import stopwords
import string

# General-purpose English stopwords shipped with NLTK.
global_stopwords = stopwords.words("english")

# Tokens that are noise for this corpus: typographic quotes and ellipses,
# contraction fragments produced by the tokenizer, and Twitter/URL residue.
noise_tokens = ['’', '``', '…', '...', "''", '‘', '“', '”', "'m", "'re", "'s", "'ve",
                'amp', 'https', "n't", 'rt', 'a…', 'co', 'i…', 't…']

# Every single ASCII punctuation character, followed by the noise tokens above.
local_stopwords = list(string.punctuation) + noise_tokens

In [None]:
from collections import Counter

###################################################################################
# The 'counter' object will have all the word count information.
# The 'co_counter' object will have all the co-occurrence count information.
#
# Both are counted per tweet: a word contributes at most 1 to 'counter' for each
# tweet it appears in, and a pair of words contributes at most 1 to 'co_counter'.
# 'co_counter' is a plain dict of dicts, co_counter[word1][word2] -> count, and is
# symmetric because every unordered pair is recorded in both directions.
###################################################################################
counter = Counter()
co_counter = dict()

# Build the stopword lookup once. Concatenating the two lists inside the loop
# would allocate a new list and scan it linearly for every single token.
stopword_set = set(global_stopwords) | set(local_stopwords)

for tokens in df.words:
    # Distinct lower-cased tokens of this tweet that are not stopwords.
    word_set = {
        word
        for word in (token.lower() for token in tokens)
        if word not in stopword_set
    }

    counter.update(word_set)

    ###################################################################################
    # Calculate co-occurrence count of two words and save it in 'co_counter'
    ###################################################################################
    words = list(word_set)
    for word1 in words:
        # Create the entry even when the tweet has only one keyword, so that every
        # word present in 'counter' also has a (possibly empty) dict in 'co_counter'.
        neighbors = co_counter.setdefault(word1, dict())

        for word2 in words:

            ######################################
            # Skip if the two words are the same.
            ######################################
            if word1 == word2:
                continue

            neighbors[word2] = neighbors.get(word2, 0) + 1

Check the top-30 most common words. If you have any words that you would not want to see, you can remove them by including them in the `local_stopwords` list above. 

In [None]:
# List the 30 words with the highest counts as (word, count) pairs, highest first.
counter.most_common(30)

Check the co-occurrence frequency of any two words you are interested in. 

In [None]:
# Replace "WORD1" and "WORD2" with two lower-case keywords of interest. As written, the
# placeholders raise KeyError unless those exact strings occur in the data.
# Both lookups should return the same number, since each pair is counted in both directions.
co_counter["WORD1"]["WORD2"], co_counter["WORD2"]["WORD1"]

## Step 2: Create a graph object

In [None]:
import networkx as nx

# Start from an empty undirected graph; nodes and edges are added in the following steps.
G = nx.Graph()

## Step 3: Decide the number of nodes in the graph 

In [None]:
# Number of most frequent keywords that become nodes of the graph.
num_nodes = 30

## Step 4: Define nodes and their weights for network visualization 

You may need to adjust the node weights after actually seeing the graph.

In [None]:
# Fetch the top `num_nodes` (word, count) pairs once, then split them into two
# parallel lists: the node labels and the counts used as node sizes when plotting.
top_words = counter.most_common(num_nodes)
nodes = [word for word, _ in top_words]
node_weights = [count for _, count in top_words]

## Step 5: Add nodes to the graph

In [None]:
# Register every selected keyword as a node whose "weight" attribute is its count.
G.add_nodes_from((word, {"weight": counter.get(word)}) for word in nodes)

In [None]:
# Show each node together with its attribute dict (the "weight" stored when the node was added).
G.nodes.data()     # Check what nodes there are in G

## Step 6: Add edges to the graph

In [None]:
# Connect every pair of selected keywords that co-occur in at least one tweet and
# store the co-occurrence count as the edge weight.
for word1 in nodes:
    neighbors = co_counter[word1]
    for word2 in nodes:
        # A word never forms an edge with itself.
        if word1 == word2:
            continue

        # G is undirected, so the reverse pair (word2, word1) reaches the same edge
        # again and simply rewrites the same weight (co_counter is symmetric).
        if word2 in neighbors:
            G.add_edge(word1, word2, weight=neighbors[word2])

In [None]:
# Show each edge as (word1, word2, attribute dict); the dict holds the co-occurrence "weight".
G.edges.data()     # Check what edges there are in G

## Step 7: Define edges and their weights for network visualization 

In [None]:
# get_edge_attributes returns a dict mapping each edge (word1, word2) to its "weight";
# the keys are therefore the edges themselves, later passed to the plot as `edgelist`.
edges = nx.get_edge_attributes(G, "weight").keys()
edges

In [None]:
# The values of the same edge -> weight dict, in the same order as `edges`.
# This is a dict view; it is turned into a regular list in the next cell.
edge_weights = nx.get_edge_attributes(G, "weight").values()
edge_weights

You may need to adjust the edge weights after actually seeing the graph.

In [None]:
# Materialize the weights as a plain list so they can be passed as edge widths.
# To rescale them, replace this with a comprehension that transforms each weight.
edge_weights = list(edge_weights)
edge_weights

## Step 8: Plot the graph

Types of layouts
- circular
- random
- spectral
- spring
- shell

In [None]:
from matplotlib import pyplot as plt

You will probably see nodes that are too small and edges that are too thick. Adjust the node weights and edge weights defined above for better visualization. Also, try different types of layouts for your keyword network analysis.

In [None]:
# Node positions on a circle; swap in another nx.*_layout function to try a different layout.
layout = nx.circular_layout(G)

plt.figure(figsize=(10, 10))
nx.draw_networkx(
    G,
    pos=layout,
    nodelist=nodes,
    node_size=node_weights,   # one size per entry of `nodes`
    edgelist=edges,
    width=edge_weights,       # one line width per entry of `edges`
    node_color="yellow",
    with_labels=True,
    font_size=9,
)
plt.draw()