In [None]:
import networkx as nx
import pandas as pd
import plotly.express as px
import plotly.offline as py

from itertools import combinations
from os import mkdir
from os.path import exists
from re import findall
from string import punctuation
from time import time

# Initialise plotly's offline notebook mode.
py.init_notebook_mode(connected=True)

# Start time; the last cell subtracts it from time() to report the total run time.
t0 = time()

### Cleanup functions

In [None]:
def remove_punct(word):
    ''' Remove leading and trailing punctuation from word.

    Punctuation inside the word (e.g. the apostrophe in "can't") is kept.
    '''
    # str.strip treats its argument as a set of characters, so a single call
    # removes any mix of punctuation from both ends. Stripping one character
    # at a time depends on the order of string.punctuation and leaves
    # '?!word!?' as '!word!'.
    return word.strip(punctuation)

def is_number(word):
    ''' Return True if word is a number.

    "Number" means anything float() accepts: besides plain digits this covers
    signed, decimal and exponent forms and the special strings 'nan', 'inf'
    and 'infinity' (case-insensitive).
    '''
    try:
        float(word)
    except (TypeError, ValueError, OverflowError):
        # TypeError: not a string or number (e.g. None).
        # ValueError: text that does not parse as a float.
        # OverflowError: an int too large to convert.
        # Anything else (KeyboardInterrupt, SystemExit, ...) must propagate.
        return False
    return True

def find_mentions(sentence):
    ''' Return list of mentions from string.

    A word qualifies when it is longer than one character, is not a number,
    contains at least one uppercase character and is written either
    Capitalized or UPPERCASE. Consecutive qualifying words are joined with a
    space into a single mention. Mentions are returned in order of appearance.
    '''
    mentions = []
    run = []  # qualifying words of the mention currently being built

    for word in sentence.split():
        if len(word) > 1\
        and not is_number(word)\
        and word != word.lower()\
        and word in (word.capitalize(), word.upper()):
            # `word != word.lower()` rejects tokens without any uppercase
            # character ('--', '...', '#tag', '2nd'): capitalize() and upper()
            # leave those unchanged, so the membership test alone lets them in.
            run.append(word)
        elif run:
            # A non-qualifying word ends the current run.
            mentions.append(' '.join(run))
            run = []

    if run:
        mentions.append(' '.join(run))

    return mentions

### Load data

Build a `pandas.DataFrame` from the desired `columns` only (smaller memory footprint).

In [None]:
# Columns to read from the CSV export; passed as `usecols` below so the
# remaining columns of the file are not loaded.
columns = ['Angry', 'Care', 'Comments', 'Description',
           'Haha', 'Likes', 'Link Text', 'Link', 'Love',
           'Message', 'Page Name', 'Sad', 'Shares',
           'Total Interactions', 'Wow']

# The file name is resolved relative to the notebook's working directory.
df = pd.read_csv('2021-01-11-09-44-34-BRT-search-csv-export.csv',
                 usecols=columns)

# Report the size of the loaded frame.
print(f'Loaded {df.shape[0]} rows and {df.shape[1]} columns.')

### Page stats

In [None]:
# Create the output directory used by every to_csv/write_gml call below.
# Attempting the mkdir and tolerating FileExistsError avoids the gap between
# checking for the directory and creating it.
try:
    mkdir('RESULTS')
except FileExistsError:
    pass

In [None]:
# Interaction counts, one row per row of `df`. The explicit copy makes `stats`
# an independent frame, so adding a column needs no warning suppression.
stats = df[['Likes', 'Comments', 'Shares', 'Love', 'Wow', 'Haha', 'Sad', 'Angry', 'Care']].copy()

# Row-wise sum of all interaction columns.
stats['Total'] = stats.sum(axis=1)

# Index by page name; a page appears once for every row it has in `df`.
stats.index = df['Page Name']
stats.to_csv('RESULTS/stats.csv')

stats.sort_values('Total', ascending=False).head(25) # <-- top 25

In [None]:
# Summary statistics for the columns of `stats`.
stats.describe()

### Build graphs

Prepare edge lists `E{1,2,3}` and build directed graphs `G{1,2,3,4}` using `networkx.DiGraph`.

#### Graph from links

In [None]:
# Edge list for the link graph: source page and the link in the same row.
E1 = df.loc[:, ['Page Name', 'Link']]
E1.to_csv('RESULTS/E1_Links.csv')
E1.head()

In [None]:
G1 = nx.DiGraph()

# One edge per row, from the page to its link. Rows with a missing value are
# skipped so that NaN never becomes a node (the mentions graph does the same).
G1.add_edges_from(E1.dropna().values)

# set_node_attributes expects {node: {attribute: value}}. Passing the `stats`
# DataFrame directly makes it iterate over column names instead of pages, so
# no page node receives attributes. `stats` can hold several rows per page,
# so the counts are summed per page before converting to a dict of dicts.
nx.set_node_attributes(G1, stats.groupby(level=0).sum().to_dict('index'))
nx.write_gml(G1, 'RESULTS/G1_Links.gml')

print(f'Graph from links has {G1.order()} nodes and {G1.size()} edges.')

#### Graph from mentions in link text

In [None]:
# Explicit copy: E2 gets its own data, so adding and filtering columns needs
# neither warning suppression nor in-place mutation of a slice of `df`.
E2 = df[['Page Name', 'Link Text']].copy()

# Mentions found in each link text; non-string values (e.g. NaN) give None.
E2['Mentions'] = E2['Link Text'].apply(lambda x: find_mentions(x) if isinstance(x, str) else None)

# Discard rows whose first mention starts with the word 'Timeline'.
# NOTE(review): presumably auto-generated link text rather than a real mention - confirm.
E2['Mentions'] = [None if x and x[0].split()[0] == 'Timeline' else x for x in E2['Mentions']]

# Discard rows whose only mention is the single word 'This'.
E2['Mentions'] = [None if x == ['This'] else x for x in E2['Mentions']]

# Rows marked None above are removed; rows with an empty mention list are kept.
E2 = E2.dropna(subset=['Mentions'])
E2.to_csv('RESULTS/E2_Mentions.csv')

E2.head()

In [None]:
G2 = nx.DiGraph()

# One edge per (page, mention) pair; explode() turns each list of mentions
# into separate rows and dropna() removes rows that had no mention.
G2.add_edges_from(E2[['Page Name', 'Mentions']].explode('Mentions').dropna().values)

# set_node_attributes expects {node: {attribute: value}}. Passing the `stats`
# DataFrame directly makes it iterate over column names ('Likes', 'Love', ...)
# instead of pages, so page nodes get no attributes and a mention that equals
# a column name gets a whole column as its attributes. `stats` can hold
# several rows per page, so the counts are summed per page first.
nx.set_node_attributes(G2, stats.groupby(level=0).sum().to_dict('index'))
nx.write_gml(G2, 'RESULTS/G2_Mentions.gml')

print(f'Graph from mentions has {G2.order()} nodes and {G2.size()} edges.')
pd.DataFrame(nx.degree(G2)).sort_values(1, ascending=False).head(25) # <-- top 25

#### Graph from hashtags in message and description

In [None]:
# Explicit copy: E3 gets its own data, so adding and filtering columns needs
# neither warning suppression nor in-place mutation of a slice of `df`.
E3 = df[['Page Name', 'Message', 'Description']].copy()

# Searchable text = message + description, with missing values as ''.
# Using str() on a whole row instead would search the repr of a numpy array:
# strings get wrapped in quotes and newlines become the two characters '\n',
# so hashtags at the start of a text or after a line break would be missed,
# and the page name would be searched as well.
E3['text'] = E3['Message'].fillna('').astype(str) + ' ' + E3['Description'].fillna('').astype(str)

# A hashtag must start the text or follow whitespace, which keeps URL
# fragments such as 'example.com/#top' from matching.
E3['hashtags'] = [findall(r'(?:^|\s)(#[\w-]+)', text) for text in E3['text']]

# Keep only rows that contain at least one hashtag.
E3 = E3[E3['hashtags'].map(len) > 0]
E3.to_csv('RESULTS/E3_Hashtags.csv')

E3.head()

In [None]:
G3 = nx.DiGraph()

# One edge per (page, hashtag) pair; explode() turns each list of hashtags
# into separate rows.
G3.add_edges_from(E3[['Page Name', 'hashtags']].explode('hashtags').values)

# set_node_attributes expects {node: {attribute: value}}. Passing the `stats`
# DataFrame directly makes it iterate over column names instead of pages, so
# no page node receives attributes. `stats` can hold several rows per page,
# so the counts are summed per page before converting to a dict of dicts.
nx.set_node_attributes(G3, stats.groupby(level=0).sum().to_dict('index'))
nx.write_gml(G3, 'RESULTS/G3_Hashtags.gml')

print(f'Graph from hashtags has {G3.order()} nodes and {G3.size()} edges.')
pd.DataFrame(nx.degree(G3)).sort_values(1, ascending=False).head(25) # <-- top 25

In [None]:
G4 = nx.DiGraph()

# Connect every pair of hashtags that occur in the same row. The edge
# direction follows the order in which the two hashtags appear in the text.
for hashtags in E3['hashtags'].values.tolist():
    # dict.fromkeys drops repeated hashtags while keeping first-seen order;
    # without it a hashtag used twice in one text would get a self-loop.
    G4.add_edges_from(combinations(dict.fromkeys(hashtags), 2))

# set_node_attributes expects {node: {attribute: value}}, so `stats` is summed
# per page and converted to a dict of dicts. The nodes of G4 are hashtags, so
# this only attaches data to a node whose name equals a page name.
nx.set_node_attributes(G4, stats.groupby(level=0).sum().to_dict('index'))
nx.write_gml(G4, 'RESULTS/G4_Hashtags-Hashtags.gml')

print(f'Graph from concurrent hashtags has {G4.order()} nodes and {G4.size()} edges.')
pd.DataFrame(nx.degree(G4)).sort_values(1, ascending=False).head(25) # <-- top 25

___

In [None]:
# Seconds elapsed since `t0` was set in the first cell.
print(f'done in {time()-t0:.3f}s')