# Imports and other setup

In [None]:
# data processing
import pandas as pd
import numpy as np
from numpy import genfromtxt

# graphs / figures
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import networkx as nx

# sklearn / models
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, SpectralClustering, AgglomerativeClustering, FeatureAgglomeration
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# nltk
import nltk
from nltk.corpus import stopwords

# others
import os
import sys
sys.path.append('../')
from embedding_functions_hugo.embedding_functions import *
from tqdm import tqdm
from bs4 import BeautifulSoup
import re
from happytransformer import HappyTextClassification
import praw

def shorten_and_clean_dataset(comment_csv, comment_column: str, desired_comment_length: int):
    '''Load a comment CSV and attach cleaned and shortened text columns.

    comment_csv is passed to pd.read_csv, comment_column names the column handed
    to prep_pipeline, and desired_comment_length is forwarded to shorten_sens.
    Returns the loaded dataframe with two extra columns: 'cleaned_text' (the
    output of prep_pipeline) and 'short' (the output of shorten_sens applied to
    'cleaned_text').
    '''
    comments = pd.read_csv(comment_csv)

    cleaned = prep_pipeline(comments, comment_column)
    comments['cleaned_text'] = cleaned

    # shorten the stored column (not the raw pipeline output)
    shortened = shorten_sens(comments['cleaned_text'], desired_comment_length)
    comments['short'] = shortened

    return comments

# Data Processing

### Function for getting post_title, post_author, post_id, and the embedded title texts

In [None]:
def do_post_titles(df):
    '''Build a dataframe with one row per distinct (title, author, post id) triple.

    The distinct triples are taken from the 'post_title', 'post_author' and
    'post_id' columns; their order comes from set iteration and is therefore
    arbitrary. The titles are passed to embed_comments and the result is stored
    in the 'embeddings' column next to 'post_id', 'post_author' and 'post_title'.
    '''
    # de-duplicate the triples, then freeze the order so all columns line up
    unique_posts = list(set(zip(df['post_title'], df['post_author'], df['post_id'])))

    texts = [title for title, _, _ in unique_posts]
    authors = [author for _, author, _ in unique_posts]
    post_ids = [post_id for _, _, post_id in unique_posts]

    title_embeddings = embed_comments(texts)

    # rows of the intermediate frame become columns after the transpose
    by_field = pd.DataFrame(data=[post_ids, authors, texts, title_embeddings])
    column_names = {0 : 'post_id', 1: 'post_author', 2: 'post_title', 3: 'embeddings'}
    return by_field.T.rename(columns=column_names)

### filtering out unwanted users

In [None]:
def remove_nan(df):
    '''Takes dataframe as input, removes (in place) rows where a username is not a string.

    The two usernames are read by position from the third and fifth columns
    (positions 2 and 4) -- presumably comment_author and post_author; confirm
    against the scrape's column order. Any row where either value is not a str
    (for example NaN from an empty CSV cell) is dropped.

    Rows are dropped by index label, so the index is assumed to be unique
    (as it is straight out of pd.read_csv).

    Returns the same dataframe object that was passed in.
    '''
    # explicit positional access; integer keys on a label-indexed row are deprecated in pandas
    first_users = df.iloc[:, 2]
    second_users = df.iloc[:, 4]

    rows_to_drop = [
        label
        for label, usr1, usr2 in zip(df.index, first_users, second_users)
        if not (isinstance(usr1, str) and isinstance(usr2, str))
    ]

    # a single drop call instead of one per offending row
    df.drop(index=rows_to_drop, inplace=True)

    return df

### Getting final dataframe that we can use

In [None]:
def shorten_comment_text(df):
    '''Takes dataframe as input and returns a filtered copy with a short_text column.

    Only rows whose comment_author also appears in the post_author column are
    kept. Their comment_text is run through prep_pipeline and shorten_sens
    (with a length argument of 50), the result is stored as short_text, and the
    comment_text column is removed. The input dataframe is left untouched.
    '''
    # keep commenters that have also made a post
    also_posted = df['comment_author'].isin(df['post_author'])
    commenters_df = df[also_posted].copy()

    # clean the surviving comments, then shorten them into the new column
    cleaned = prep_pipeline(commenters_df, 'comment_text', loud=False)
    commenters_df['short_text'] = shorten_sens(cleaned, 50)

    return commenters_df.drop(columns='comment_text')

In [None]:
def get_embed_pairs(df):
    '''Takes dataframe as input and combines each author's post titles into one text.

    Returns a dict keyed by post_author, in order of first appearance, whose
    value is that author's post_title values joined by single spaces in row
    order. An author with a single row keeps that title value unchanged.
    '''
    # gather every title per poster first, in row order
    titles_by_author = {}
    for title, author in zip(df['post_title'], df['post_author']):
        titles_by_author.setdefault(author, []).append(title)

    # a lone title is passed through as-is; several are glued with spaces
    return {
        author: titles[0] if len(titles) == 1 else ' '.join(titles)
        for author, titles in titles_by_author.items()
    }

def get_title_embeds(embed_pairs):
    '''Takes embed_pairs (author -> combined title text) and returns reduced embeddings.

    The combined texts are passed to embed_comments, the results are re-keyed by
    author, and that dict is handed to reduce_to_one_dimension_kmeans. The last
    element of what that helper returns is returned here -- presumably the
    one-dimensional embeddings in author order; confirm against the helper.
    '''
    authors = embed_pairs.keys()
    combined_titles = list(embed_pairs.values())

    # pair each author with the embedding of their combined titles
    author_embeddings = dict(zip(authors, embed_comments(combined_titles)))

    # only the final element of the helper's result is needed
    return reduce_to_one_dimension_kmeans(author_embeddings)[-1]

In [None]:
def reduce_to_poster_commentors(df):
    '''Repeatedly drop (in place) rows whose commentor is not also a post author.

    Each pass keeps only the rows whose commentor appears in the current
    post_author column. Dropping rows can remove post authors, which may in turn
    orphan other commentors, so passes repeat until the number of distinct
    surviving commentors stops changing. Afterwards every remaining commentor
    is also a remaining post author (the reverse is not enforced).

    The commentor is read by position from the third column (position 2) --
    presumably comment_author; confirm against the scrape's column order.
    Rows are dropped by index label, so the index is assumed to be unique.
    NaN usernames are matched by pandas isin semantics; in full_pipeline they
    have already been removed by remove_nan.

    Prints one progress line per pass and returns the same dataframe object.
    '''
    # distinct commentors kept by the previous pass (0 before the first pass)
    before = 0

    while True:
        # snapshot of who is a poster at the start of this pass
        commentors = df.iloc[:, 2]
        is_poster = commentors.isin(df['post_author']).to_numpy()

        # one vectorised drop per pass instead of one per row
        df.drop(index=df.index[~is_poster], inplace=True)

        after = df.iloc[:, 2].nunique(dropna=False)
        diff = after - before

        if diff == 0:
            print(f'before = {before} and after = {after}, so diff = {diff}, done!')
            return df

        print(f'before = {before} and after = {after}, so diff = {diff}, relooping...')
        before = after

# Creating Network

In [None]:
def create_network(df):
    '''Takes dataframe as input, creates a directed networkx network, draws it and returns it.

    Nodes: one per post author, with attribute `polarity` set to the first
    element of that author's entry in the output of get_title_embeds.
    Edges: one per (comment_author -> post_author) row, skipping self-comments,
    with attributes `sentiment` (the classifier score) and `sign`
    (+1 for a POSITIVE label, -1 for NEGATIVE). Because the graph is a DiGraph,
    a later comment between the same ordered pair overwrites the earlier edge.

    The dataframe must have post_title, post_author, comment_author and
    short_text columns. No filtering is done here; full_pipeline filters the
    rows before calling this function.

    Raises ValueError if the classifier returns a label other than
    POSITIVE / NEGATIVE, so that edges can never be paired with the wrong
    comment's sentiment.
    '''
    # classifier label -> edge sign
    label_signs = {'POSITIVE': 1, 'NEGATIVE': -1}

    # Initializing text classification model
    model = HappyTextClassification(model_type='DISTILBERT', model_name='distilbert-base-uncased-finetuned-sst-2-english', num_labels=2)

    # initializing networkx directed graph
    G = nx.DiGraph()

    print('doing embed_pairs')

    # combining users title texts into one
    embed_pairs = get_embed_pairs(df)

    print(len(embed_pairs))

    print('doing embeds')

    # embedding those combined title texts
    embeds = get_title_embeds(embed_pairs)

    print(len(embeds))

    # adding each post author as a node to network, with the first element of their title embedding as an attribute (named polarity)
    print('\n===== adding nodes ======\n')
    for author, embed in zip(embed_pairs, embeds):
        print(f'added {author} with polarity {embed[0]}')
        G.add_node(author, polarity=embed[0])

    # getting list of user pairs for comments left on posts
    pairings = list(zip(df['comment_author'], df['post_author']))
    print(len(pairings))

    print('doing polarities')

    # classifying sentiments of comment texts; score and sign are always appended together
    # so both lists stay aligned with `pairings`
    polarities = []
    signs = []
    for comment in df['short_text']:
        classification = model.classify_text(comment)

        if classification.label not in label_signs:
            raise ValueError(f'unexpected sentiment label: {classification.label!r}')

        polarities.append(classification.score)
        signs.append(label_signs[classification.label])

    print(len(polarities))

    print('\n===== adding edges =====\n')
    # adding edges to graph, where each edge is a comment left by user a to user b, and its attributes are the sentiment score and sign of the comment
    for (commenter, poster), polarity, sign in zip(pairings, polarities, signs):
        if commenter != poster: # avoiding self edge connections
            print(f'adding edge from {commenter} to {poster} with sentiment {polarity} and label {sign}')
            G.add_edge(u_of_edge=commenter, v_of_edge=poster, sentiment=polarity, sign=sign)

    # drawing network
    nx.draw(G)

    return G

In [None]:
def full_pipeline(df):
    '''Runs every processing step on a scraped dataframe and returns the network.

    The steps run in order, each receiving the previous step's result:
    remove_nan, reduce_to_poster_commentors, shorten_comment_text and finally
    create_network (which also draws the graph). A progress message is printed
    before each step.
    '''
    # (progress message, step) in execution order; the last step builds the network
    stages = (
        ('removing nan...', remove_nan),
        ('reducing to poster commentors...', reduce_to_poster_commentors),
        ('shortening comment text...', shorten_comment_text),
        ('creating network...', create_network),
    )

    result = df
    for message, stage in stages:
        print(message)
        result = stage(result)

    return result

## Creating, initially inspecting and saving subreddits

Example filepath: '../data/19march/graphs/politics_network.gexf'

Make sure the output folder in the filepath exists before running (create it manually if needed); otherwise the pipeline will run for ~12 mins and then fail with an error when saving the graph.

### r/politics - takes ~11 mins with 19march politics

In [None]:
# # grabbing scraped df
# df_politics = pd.read_csv('../data/23march_chur/scrapes/politics.csv')

# G_politics = full_pipeline(df_politics)

# # saving network
# nx.write_gexf(G_politics, '../data/23march_chur/graphs/politics_nan.gexf')

### r/music - takes ~8 mins with 2april music

In [None]:
# # grabbing scraped df
# df_music = pd.read_csv('../data/2april/scrapes/2apr_2Music.csv')

# G_music = full_pipeline(df_music)

# # saving network
# nx.write_gexf(G_music, '../data/2april/graphs/music.gexf')

### r/antiwork

In [None]:
# # grabbing scraped df
# df_antiwork = pd.read_csv('../data/17april/scrapes/antiwork.csv')

# G_antiwork = full_pipeline(df_antiwork)

# # saving network
# nx.write_gexf(G_antiwork, '../data/17april/graphs/antiwork.gexf')

### r/gaming - takes ~20 mins with 4april gaming

In [None]:
# # grabbing scraped df
# df_gaming = pd.read_csv('../data/4april/scrapes/gaming.csv')

# G_gaming = full_pipeline(df_gaming)

# # saving network
# nx.write_gexf(G_gaming, '../data/4april/graphs/gaming.gexf')

# Recreating networks without rounding title embeds

In [None]:
# Each scrape CSV paired with the .gexf file its network is saved to.
# Keeping input and output side by side makes a mismatched pair easy to spot.
_scrape_and_save_paths = [
    ('../data/date_folders/april_24/scrapes/communism.csv',
     '../data/date_folders/april_24/graphs/communism.gexf'),
    ('../data/date_folders/april_23/scrapes/PoliticalDiscussion.csv',
     '../data/date_folders/april_23/graphs/PoliticalDiscussion.gexf'),
    ('../data/date_folders/april_23/scrapes/Republican.csv',
     '../data/date_folders/april_23/graphs/Republican.gexf'),
    ('../data/date_folders/april_23/scrapes/democrats.csv',
     '../data/date_folders/april_23/graphs/democrats.gexf'),
    ('../data/date_folders/april_23/scrapes/ukpolitics.csv',
     '../data/date_folders/april_23/graphs/ukpolitics.gexf'),
    ('../data/date_folders/april_23/scrapes/worldnews.csv',
     '../data/date_folders/april_23/graphs/worldnews.gexf'),
    ('../data/date_folders/april_18/scrapes/CallOfDuty.csv',
     '../data/date_folders/april_18/graphs/CallOfDuty.gexf'),
    ('../data/date_folders/april_18/scrapes/FIFA.csv',
     '../data/date_folders/april_18/graphs/FIFA.gexf'),
    ('../data/date_folders/april_17/scrapes/antiwork.csv',
     '../data/date_folders/april_17/graphs/antiwork.gexf'),
    ('../data/date_folders/april_4/scrapes/gaming.csv',
     '../data/date_folders/april_4/graphs/gaming.gexf'),
    ('../data/date_folders/march_23/scrapes/politics.csv',
     '../data/date_folders/march_23/graphs/politics.gexf'),
    # note: the Tarkov graph file uses a shortened name
    ('../data/date_folders/march_19/scrapes/EscapefromTarkov.csv',
     '../data/date_folders/march_19/graphs/tarkov.gexf'),
]

# Parallel lists, in the same order as the pairs above
scrape_list = [scrape_path for scrape_path, _ in _scrape_and_save_paths]

save_paths = [save_path for _, save_path in _scrape_and_save_paths]

In [None]:
# Rebuild and save the network for every (scrape, save) pair listed above.
for scrape_path, save_path in zip(scrape_list, save_paths):
    print(scrape_path)
    print(save_path)

    # Create the output folder before the long pipeline run, so a missing
    # directory cannot make the final write fail after minutes of work.
    os.makedirs(os.path.dirname(save_path) or '.', exist_ok=True)

    df = pd.read_csv(scrape_path)
    G = full_pipeline(df)
    nx.write_gexf(G, save_path)

In [None]:
# grabbing scraped df
# NOTE: the *_antiwork variable names are reused here; this cell processes the communism scrape
df_antiwork = pd.read_csv('../data/date_folders/april_24/scrapes/communism.csv')

# making sure the output folder exists before the long pipeline run,
# so the save at the end cannot fail on a missing directory
os.makedirs('../data/date_folders/april_24/graphs', exist_ok=True)

G_antiwork = full_pipeline(df_antiwork)

# saving network
nx.write_gexf(G_antiwork, '../data/date_folders/april_24/graphs/communism.gexf')

In [None]:
# grabbing scraped df
# NOTE: the *_antiwork variable names are reused here; this cell processes the CallOfDuty scrape
df_antiwork = pd.read_csv('../data/date_folders/april_18/scrapes/CallOfDuty.csv')

# making sure the output folder exists before the long pipeline run,
# so the save at the end cannot fail on a missing directory
os.makedirs('../data/date_folders/april_18/graphs', exist_ok=True)

G_antiwork = full_pipeline(df_antiwork)

# saving network
nx.write_gexf(G_antiwork, '../data/date_folders/april_18/graphs/CallOfDuty.gexf')

In [None]:
# grabbing scraped df
# NOTE: the *_antiwork variable names are reused here; this cell processes the FIFA scrape
df_antiwork = pd.read_csv('../data/date_folders/april_18/scrapes/FIFA.csv')

# making sure the output folder exists before the long pipeline run,
# so the save at the end cannot fail on a missing directory
os.makedirs('../data/date_folders/april_18/graphs', exist_ok=True)

G_antiwork = full_pipeline(df_antiwork)

# saving network
nx.write_gexf(G_antiwork, '../data/date_folders/april_18/graphs/FIFA.gexf')