In [1]:
import warnings
warnings.filterwarnings("ignore")
import os
import pandas as pd
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from collections import Counter
from statistics import mean
import math

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

from detoxify import Detoxify

In [2]:
nltk.download('vader_lexicon') # get lexicons data
nltk.download('punkt') # for tokenizer
nltk.download('stopwords')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Melissa\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Melissa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Melissa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Helpers

In [3]:
def analyze(df):
    # Time range of posts:
    times = [int(x) for x in df['created_utc'].tolist()]
    bins = range(min(times)-1, max(times)+1, 86400)
    counts, bins = np.histogram(times, bins)
    plt.figure()
    plt.title(f"UTC chunked by day from {datetime.fromtimestamp(min(times))} to {datetime.fromtimestamp(max(times))}")
    plt.stairs(counts, bins)
    print(f"Ranking most common subreddit: {df['subreddit'].value_counts()}")
    print(f"Ranking most common user: {df['author'].value_counts()}")
    

In [4]:
def cleanup(df):
    print(f"Original df size: {df.shape}")
    # Remove rows that don't make sense (eg. UTC not an integer)
    df['created_utc'] = pd.to_numeric(df['created_utc'], errors='coerce')
    df = df.dropna(subset=['created_utc'])
    print(f"After removing messed up rows: {df.shape}")
    # Removes rows that are too old (before 2020)
    # df['created_utc'] = df['created_utc'].astype('int')
    # df = df[df['created_utc'] > datetime(2020, 1, 1).timestamp()]
    # Assign post ID's to link_id, remove the starting 't3_'
    df.loc[df['type'] == 'post', 'link_id'] = df.loc[df['type'] == 'post', 'id']
    df['link_id'] = df['link_id'].str.replace('t3_', '')
    df['parent_id'] = df['parent_id'].str.replace('t3_', '')
    # Removing known bots
    authors_to_remove = ["AutoModerator", "#NAME?"]
    df = df[~df['author'].isin(authors_to_remove)]
    print(f"After removing known bots df size: {df.shape}")
    # Shrink set to desired features. Drop na author/subreddit/id/link_id
    df['num_comments'].fillna(0, inplace=True)
    df = df[['created_utc', 'type', 'author', 'subreddit', 'id', 'link_id', 'score', 'num_comments', 'upvote_ratio', 'neg', 'neu', 'pos', 'compound']].dropna(subset=['author', 'subreddit', 'id', 'link_id'])
    df = df.drop_duplicates(subset=["author", "id"]).reset_index(drop=True)
    print(f"After drop dup author/link id and NaN df size: {df.shape}")
    return df

In [5]:
def filter_min_commenters(df, x=10):
    # Drop duplicate ID's: each post/comment should have unique id:
    df = df.drop_duplicates(subset=['id'])
    # Filter out min commenters
    print(f"Before filtering out authors less than {x} comments: {df.shape}")
    filter_comment_authors = df["type"] != "comment"
    value_counts = df['author'].value_counts()
    mask = df['author'].map(value_counts) > x
    df = df[mask | filter_comment_authors]
    print(f"After filtering out authors less than {x} comments: {df.shape}")
    return df

In [6]:
def most_common(lst):
    counter = Counter(lst)
    return counter.most_common(1)[0][0] if counter else None

In [7]:
# Define a function to filter None and empty strings from a list
def filter_list(lst):
    return ' '.join([item for item in lst if (item is not None) and (item != '') and not (isinstance(item, float) and math.isnan(item))])

In [8]:
def run_sentiment_analysis(df):
    sia = SIA()
    cols = ['title', 'selftext', 'body']
    df[cols] = df[cols].fillna('')
    print("Concat content..")
    df["selftext_title_body"] = df[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
    print("Obtaining polarity scores")
    res = [*df['selftext_title_body'].apply(sia.polarity_scores)]
    print("Creating new dataframe...")
    sentiment_df = pd.DataFrame.from_records(res)
    df = pd.concat([df, sentiment_df], axis=1, join='inner')
    return df

In [14]:
def run_toxic_analysis(df):
    pred = Detoxify('multilingual')
    # pred = Detoxify('unbiased')
    cols = ['title', 'selftext', 'body']
    df[cols] = df[cols].fillna('')
    print("Concat content..")
    df["selftext_title_body"] = df[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
    users = pd.read_csv("all_users_check_susp.csv", index_col=0)
    filt_df = df[df['author'].isin(users['User'])].reset_index(drop=True)
    print(f"Obtaining toxicity scores on size {filt_df.shape}")
    res = [*filt_df['selftext_title_body'].apply(pred.predict)]
    print("Creating new dataframe...")
    sentiment_df = pd.DataFrame.from_records(res).reset_index(drop=True)
    filt_df = pd.concat([filt_df, sentiment_df], axis=1, join='inner').reset_index(drop=True)
    df = pd.merge(df, filt_df[['id', 'toxicity']], on='id', how='left').drop_duplicates()
    return df#, sentiment_df, filt_df

In [70]:
import Levenshtein

# print(Levenshtein.jaro_winkler("This is the best video ever", "It's the best video"))
def dist(x, all_comments):
    return mean([Levenshtein.jaro_winkler(x, y) for y in all_comments])
    

def find_comment_similarity(df):
    users = pd.read_csv("all_users_check_susp_parrot.csv", index_col=0)
    df_comments = df[df['type'] == 'comment'][['author', 'body']].reset_index(drop=True)
    df_comments = df_comments[df_comments['author'] != 'AutoModerator']
    all_comments = df_comments['body'].tolist()
    df_comments_rel_users = df_comments[df_comments['author'].isin(users['User'])].reset_index(drop=True)
    print(f"Calculating similarity on size {df_comments_rel_users.shape}")
    df_comments_rel_users['jw_sim'] = df_comments_rel_users['body'].apply(lambda x: dist(x, all_comments))
    return df_comments_rel_users[['author', 'jw_sim']].groupby(['author']).mean().reset_index()

df = find_comment_similarity(pd.read_csv(os.path.join("data_scraping", "political_subreddits_new.csv"))[:10000])
df

Calculating similarity on size (2855, 2)


Unnamed: 0,author,jw_sim
0,-Quothe-,0.508074
1,0U8124X,0.538679
2,120GoHogs120,0.592851
3,1984rip,0.597691
4,1bir,0.543934
...,...,...
1134,zihuatapulco,0.595900
1135,zombi-roboto,0.603211
1136,zoot_boy,0.521284
1137,zubaz69,0.574121


# Network Functions

In [9]:
sr_dir = os.path.join('data_scraping', 'subreddits')
os.listdir(sr_dir)

['get_subreddit_info.ipynb',
 'political_center.csv',
 'political_communism.csv',
 'political_left.csv',
 'political_libertarian.csv',
 'political_other.csv',
 'political_right.csv',
 'popular_by_post_votes.csv',
 'popular_by_subscribers.csv',
 'save',
 'sus_usrs_subreddits.csv']

In [10]:
# Create usr_df, sr_df that has corresponding node attributes
# - usr_df['usr', 'top_sr', 'sr_set', 'num_sr', 'num_posts', 'num_comments', 'num_popular_sr', 'num_political_sr', 'num_misc_sr', 'ratio_popular_sr', 'ratio_political_sr', 'ratio_misc_sr', 'avg_upvote_ratio', 'content_list']
# - sr_df = the csv scraped
def get_node_attr(df, no_set=True):
    ### Put together sr_df
    print("=== Making SR DF ===")
    sr_dir = os.path.join('data_scraping', 'subreddits')
    political_df_list = []
    popular_df_list = []
    misc_df_list = []
    for file in os.listdir(sr_dir):
        if file.endswith(".csv"):
            print(f"Parsing {file}")
            cur_df = pd.read_csv(os.path.join(sr_dir, file))
            if 'political' in file:
                political_df_list.append(cur_df)
            elif 'popular' in file:
                popular_df_list.append(cur_df)
            else:
                misc_df_list.append(cur_df)
    political_df = pd.concat(political_df_list)
    political_df['topic_type'] = 'political'
    popular_df = pd.concat(popular_df_list)
    popular_df['topic_type'] = 'popular'
    misc_df = pd.concat(misc_df_list)
    misc_df['topic_type'] = 'misc'
    sr_df = pd.concat([political_df, popular_df, misc_df]).drop_duplicates(subset='Subreddit').reset_index(drop=True)[['Subreddit', 'Title', 'Public Description', 'Description', 'topic_type']]
    sr_df['node_type'] = 'subreddit'
    
    ### Put together usr_df
    print("=== Setting up for USR DF ===")
    df = df[['type', 'author', 'subreddit', 'upvote_ratio', 'neg', 'neu', 'pos', 'compound']]
    df = df.groupby(['author']).agg(list).reset_index()
    df['subreddit_set'] = df['subreddit'].apply(lambda x: set(x))
    df['popular_sr'] = [set(sr_df[sr_df['topic_type'] == 'popular']['Subreddit']).intersection(x) for x in df['subreddit_set']]
    df['political_sr'] = [set(sr_df[sr_df['topic_type'] == 'political']['Subreddit']).intersection(x) for x in df['subreddit_set']]
    df['misc_sr'] = [set(sr_df[sr_df['topic_type'] == 'misc']['Subreddit']).intersection(x) for x in df['subreddit_set']]
    print("=== Making USR DF ===")
    usr_df = df[['author']].rename(columns={'author': 'usr'}).drop_duplicates(subset='usr').reset_index(drop=True)
    usr_df['top_sr'] = df['subreddit'].apply(most_common)
    if no_set:
        usr_df['sr_set'] = df['subreddit_set'].apply(lambda x: ' '.join(x))
    else:
        usr_df['sr_set'] = df['subreddit_set']
    print("Counting...")
    usr_df['num_sr'] = df['subreddit_set'].apply(lambda x: len(x))
    usr_df['num_posts'] = df['type'].apply(lambda x: x.count('post'))
    usr_df['num_comments'] = df['type'].apply(lambda x: x.count('comment'))
    usr_df['num_popular_sr'] = df['popular_sr'].apply(lambda x: len(x))
    usr_df['num_political_sr'] = df['political_sr'].apply(lambda x: len(x))
    usr_df['num_misc_sr'] = df['misc_sr'].apply(lambda x: len(x))
    print("Adding ratios...")
    usr_df['ratio_popular_sr'] = usr_df['num_popular_sr'] / usr_df['num_sr']
    usr_df['ratio_political_sr'] = usr_df['num_political_sr'] / usr_df['num_sr']
    usr_df['ratio_misc_sr'] = usr_df['num_misc_sr'] / usr_df['num_sr']
    usr_df['avg_upvote_ratio'] = df['upvote_ratio'].apply(lambda x: np.nanmean(x))
    print("Adding sentiment analysis...")
    usr_df['avg_neg'] = df['neg'].apply(lambda x: np.nanmean(x))
    usr_df['avg_neu'] = df['neu'].apply(lambda x: np.nanmean(x))
    usr_df['avg_pos'] = df['pos'].apply(lambda x: np.nanmean(x))
    usr_df['avg_compound'] = df['compound'].apply(lambda x: np.nanmean(x))
    # usr_df['content_dump'] = df['selftext'] + df['title'] + df['body']
    # if no_set:
    #     usr_df['content_dump'] = usr_df['content_dump'].apply(filter_list)
    usr_df['node_type'] = 'user'

    return sr_df, usr_df, df

In [11]:
def label_sus_usr(df):
    sus_usrs_df = pd.read_csv("sus_users.csv", index_col=False)
    df["is_sus"] = df['usr'].isin(sus_usrs_df['name'])
    return df

In [12]:
# Nodes: subreddits, users
# edges: posters connected to subreddits, commenters connected to posters
# return: df['node_x'(subreddit/poster), 'node_y'(poster/commenter), edge_attr='weight']
def make_bipartite_graph(df):
    ### First, connect subreddits to posters
    df_1 = df[df['type'] == 'post']
    df_1 = df_1[['subreddit', 'author']]
    df_1['weight'] = df_1.groupby(['subreddit', 'author'])['subreddit'].transform('count')
    df_1 = df_1.drop_duplicates(subset=['subreddit', 'author']).reset_index(drop=True)
    df_1.rename(columns={'subreddit': 'node_x', 'author': 'node_y'}, inplace=True)

    ### Second, connect posters to commenters:
    df = df[['type', 'subreddit', 'author', 'id', 'link_id']].drop_duplicates(subset=['id']) # make sure we don't have multiple of same ID (each comment/post should have unique)
    df = df.merge(df,  left_on='id', right_on='link_id', how='inner')
    df['id'] = df['id_x']
    df['subreddit'] = df['subreddit_x']
    df = df[['author_x', 'id', 'subreddit', 'author_y']]
    df = df.drop_duplicates(subset=["author_x", "author_y", 'id']).reset_index()
    # No usr to self pairing
    df = df[df['author_x'] != df['author_y']]
    # Sort pairs so we can squash
    df.loc[df['author_x'] > df['author_y'], ['author_x', 'author_y']] = df.loc[df['author_x'] > df['author_y'], ['author_y', 'author_x']]
    # Squash all same usr-usr pairings link_ids into list
    df = df.groupby(['author_x', 'author_y']).agg(list).reset_index()
    # df.to_csv("check0.csv")
    df['weight'] = df['id'].str.len()
    # Do we want?
    # df['top_subreddit'] = df['subreddit'].apply(most_common)
    # Sort by weight, only keep necessary columns
    df = df[['author_x', 'author_y', 'weight']] #, 'top_subreddit']]
    df.rename(columns={'author_x': 'node_x', 'author_y': 'node_y'}, inplace=True)

    print(f"Size of subreddit-poster: {df_1.shape}")
    print(f"Size of poster-commenter: {df.shape}")
    return pd.concat([df_1, df]).sort_values(by='weight', ascending=False).reset_index(drop=True)

In [68]:
def add_node_attr(sr_df, usr_df, G):
    attrs = ['node_type',
    'usr',
    'top_sr',
    'sr_set',
    'num_sr',
    'num_posts',
    'num_comments',
    'num_popular_sr',
    'num_political_sr',
    'num_misc_sr',
    'ratio_popular_sr',
    'ratio_political_sr',
    'ratio_misc_sr',
    'avg_upvote_ratio',
    'avg_neg',
    'avg_neu',
    'avg_pos',
    'avg_compound',
    'is_sus',
    'Subreddit',
    'Title',
    'Public Description',
    'Description',
    'topic_type']
    attrs_dict =  dict.fromkeys(attrs, 0)
    attrs_dict['node_type'] = 1
    print("Setting subreddit attributes...")
    sr_df = sr_df.set_index('Subreddit')
    nx.set_node_attributes(G, sr_df.to_dict('index'))
    print("Setting user attributes...")
    usr_df = usr_df.set_index('usr')
    nx.set_node_attributes(G, usr_df.to_dict('index'))
    print("Setting default 0 values for nodes that didn't have attributes, marking node_type as subreddit...")
    for node in G.nodes(data=True):
        if len(node[1]) == 0:
            G.add_nodes_from([(node[0], attrs_dict)])
    return G

# Make Network(s)

In [45]:
data_path = "data_scraping\\data_1month" # Change this to path where .csv are located
out_path = "output"
mega_df = pd.DataFrame()
combo_category_df = {
    "political": pd.DataFrame(),
    "popular": pd.DataFrame(),
    "sus_usr": pd.DataFrame(),
}

## Update mega attributes

### Don't run again

In [16]:
# Just get all nodes info, no need to make graph
# Separate per csv
for fn in os.listdir(data_path):
    if fn.endswith(".csv"):
        name = fn.split('.')[0]
        type_sr = fn.split('_subreddits')[0]
        df = pd.read_csv(os.path.join(data_path, fn))
        print(f"=== Parsing {fn} {df.shape} ===")
        df_clean = cleanup(df)
        # analyze(df_clean)
        mega_df = pd.concat([mega_df, df_clean])
        # combo_category_df[type_sr] = pd.concat([combo_category_df[type_sr], df_clean])


=== Parsing political_subreddits_new.csv (282927, 22) ===
Original df size: (282927, 22)
After removing messed up rows: (282927, 22)
After removing known bots df size: (278127, 22)
After drop dup author/link id and NaN df size: (269620, 13)
=== Parsing popular_subreddits_new.csv (375710, 22) ===
Original df size: (375710, 22)
After removing messed up rows: (375710, 22)
After removing known bots df size: (374114, 22)
After drop dup author/link id and NaN df size: (366073, 13)
=== Parsing sus_usr_subreddits_0_new.csv (687008, 22) ===
Original df size: (687008, 22)
After removing messed up rows: (687008, 22)
After removing known bots df size: (679397, 22)
After drop dup author/link id and NaN df size: (664478, 13)
=== Parsing sus_usr_subreddits_10_new.csv (49622, 22) ===
Original df size: (49622, 22)
After removing messed up rows: (49622, 22)
After removing known bots df size: (48988, 22)
After drop dup author/link id and NaN df size: (48150, 13)
=== Parsing sus_usr_subreddits_1_new.csv (

In [17]:
sr_df, usr_df, df = get_node_attr(mega_df, no_set=False)
print("Saving node attr to csv")
sr_df.to_csv("sr_node_attr.csv")
usr_df.to_csv("usr_node_attr.csv")
df.to_csv("df_attr.csv")

=== Making SR DF ===
Parsing political_center.csv
Parsing political_communism.csv
Parsing political_left.csv
Parsing political_libertarian.csv
Parsing political_other.csv
Parsing political_right.csv
Parsing popular_by_post_votes.csv
Parsing popular_by_subscribers.csv
Parsing sus_usrs_subreddits.csv
=== Setting up for USR DF ===


KeyboardInterrupt: 

In [15]:
usr_df = pd.read_csv("usr_node_attr.csv")
usr_df = label_sus_usr(usr_df)
usr_df.to_csv("usr_node_attr.csv")
usr_df.drop(labels=["Unnamed: 0"], axis=1, inplace=True)
usr_df

Unnamed: 0,Unnamed: 0.1,usr,top_sr,sr_set,num_sr,num_posts,num_comments,num_popular_sr,num_political_sr,num_misc_sr,ratio_popular_sr,ratio_political_sr,ratio_misc_sr,avg_upvote_ratio,avg_neg,avg_neu,avg_pos,avg_compound,node_type,is_sus
0,0,*polhold00797,AskReddit,{'AskReddit'},1,0,1,1,0,0,1.0,0.0,0.0,,0.000000,0.000000,1.000000,0.623900,user,False
1,1,*polhold01103,politics,{'politics'},1,0,1,1,0,0,1.0,0.0,0.0,,0.000000,1.000000,0.000000,0.000000,user,False
2,2,*polhold02060,science,{'science'},1,0,4,1,0,0,1.0,0.0,0.0,,0.050750,0.838250,0.110750,0.318000,user,False
3,3,-------------------7,nyc,{'nyc'},1,0,1,0,0,1,0.0,0.0,1.0,,0.076000,0.665000,0.260000,0.796400,user,False
4,4,------------------GL,canadaguns,"{'therewasanattempt', 'canadaguns'}",2,0,26,1,0,1,0.5,0.0,0.5,,0.059346,0.776769,0.163923,0.200531,user,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681289,1681289,zzzzzxx,nvidia,{'nvidia'},1,2,1,0,0,1,0.0,0.0,1.0,0.3,0.022333,0.862000,0.115333,0.600333,user,False
1681290,1681290,zzzzzzzz33bbbbbbb12,mildlyinteresting,{'mildlyinteresting'},1,0,4,1,0,0,1.0,0.0,0.0,,0.000000,0.648250,0.351750,0.367800,user,False
1681291,1681291,zzzzzzzzzz555,classiccars,{'classiccars'},1,0,1,0,0,1,0.0,0.0,1.0,,0.391000,0.609000,0.000000,-0.670500,user,False
1681292,1681292,zzzzzzzzzzzzzzz69,atheism,"{'Firearms', 'atheism'}",2,2,4,0,0,2,0.0,0.0,1.0,0.9,0.062333,0.841500,0.096167,-0.002833,user,False


In [16]:
usr_df[usr_df["is_sus"] == True]

Unnamed: 0,Unnamed: 0.1,usr,top_sr,sr_set,num_sr,num_posts,num_comments,num_popular_sr,num_political_sr,num_misc_sr,ratio_popular_sr,ratio_political_sr,ratio_misc_sr,avg_upvote_ratio,avg_neg,avg_neu,avg_pos,avg_compound,node_type,is_sus
7555,7555,1488reasons,newzealand,"{'newzealand', 'NorthAmerican', 'environment'}",3,0,183,0,0,3,0.000000,0.000000,1.000000,,0.112120,0.796858,0.090995,-0.064072,user,True
29722,29722,Abena_Tau,Blackpeople,"{'blackculture', 'Blackpeople', 'worldnews', '...",19,35,0,2,1,15,0.105263,0.052632,0.789474,0.909429,0.208629,0.743514,0.047857,-0.283026,user,True
39784,39784,AdoraronDoomworker,funny,"{'pics', 'funny', 'gifs', 'Bad_Cop_No_Donut'}",4,8,0,3,1,0,0.750000,0.250000,0.000000,0.761250,0.208500,0.661750,0.129875,-0.093150,user,True
43712,43712,AgaluneMalordred,Bad_Cop_No_Donut,"{'PoliticalHumor', 'Bad_Cop_No_Donut', 'funny'...",5,9,0,2,2,1,0.400000,0.400000,0.200000,0.964444,0.066889,0.772333,0.161000,0.097422,user,True
43713,43713,AgamagelvTozshura,gifs,"{'obama', 'PoliticalHumor', 'MRW', 'WTF', 'tod...",8,19,0,4,2,2,0.500000,0.250000,0.250000,0.652105,0.125789,0.804947,0.069263,-0.099168,user,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1640892,1640892,vsrruslan,police,"{'test', 'police'}",2,2,0,0,0,2,0.000000,0.000000,1.000000,0.665000,0.000000,1.000000,0.000000,0.000000,user,True
1640893,1640893,vsruslan,test,"{'test', 'elections', 'lgbt'}",3,4,0,0,0,3,0.000000,0.000000,1.000000,1.000000,0.000000,0.924500,0.075500,0.095450,user,True
1641937,1641937,wadeharriot,Conservative,"{'television', 'worldnews', 'promos', 'Jokes',...",14,8,17,10,2,1,0.714286,0.142857,0.071429,0.856250,0.058560,0.798080,0.143400,0.151320,user,True
1643104,1643104,walterwhite1962,snapleaks,{'snapleaks'},1,1,0,0,0,1,0.000000,0.000000,1.000000,0.850000,0.486000,0.000000,0.514000,0.051600,user,True


### Run again

In [69]:
usr_df = pd.read_csv("usr_node_attr.csv")
try:
    usr_df.drop(labels=["Unnamed: 0"], axis=1, inplace=True)
except:
    pass
usr_df

Unnamed: 0,usr,top_sr,sr_set,num_sr,num_posts,num_comments,num_popular_sr,num_political_sr,num_misc_sr,ratio_popular_sr,ratio_political_sr,ratio_misc_sr,avg_upvote_ratio,avg_neg,avg_neu,avg_pos,avg_compound,node_type,is_sus
0,*polhold00797,AskReddit,{'AskReddit'},1,0,1,1,0,0,1.0,0.0,0.0,,0.000000,0.000000,1.000000,0.623900,user,False
1,*polhold01103,politics,{'politics'},1,0,1,1,0,0,1.0,0.0,0.0,,0.000000,1.000000,0.000000,0.000000,user,False
2,*polhold02060,science,{'science'},1,0,4,1,0,0,1.0,0.0,0.0,,0.050750,0.838250,0.110750,0.318000,user,False
3,-------------------7,nyc,{'nyc'},1,0,1,0,0,1,0.0,0.0,1.0,,0.076000,0.665000,0.260000,0.796400,user,False
4,------------------GL,canadaguns,"{'therewasanattempt', 'canadaguns'}",2,0,26,1,0,1,0.5,0.0,0.5,,0.059346,0.776769,0.163923,0.200531,user,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681289,zzzzzxx,nvidia,{'nvidia'},1,2,1,0,0,1,0.0,0.0,1.0,0.3,0.022333,0.862000,0.115333,0.600333,user,False
1681290,zzzzzzzz33bbbbbbb12,mildlyinteresting,{'mildlyinteresting'},1,0,4,1,0,0,1.0,0.0,0.0,,0.000000,0.648250,0.351750,0.367800,user,False
1681291,zzzzzzzzzz555,classiccars,{'classiccars'},1,0,1,0,0,1,0.0,0.0,1.0,,0.391000,0.609000,0.000000,-0.670500,user,False
1681292,zzzzzzzzzzzzzzz69,atheism,"{'Firearms', 'atheism'}",2,2,4,0,0,2,0.0,0.0,1.0,0.9,0.062333,0.841500,0.096167,-0.002833,user,False


In [70]:
usr_cols = [col for col in usr_df.columns]
usr_cols.remove("node_type")
usr_cols

['usr',
 'top_sr',
 'sr_set',
 'num_sr',
 'num_posts',
 'num_comments',
 'num_popular_sr',
 'num_political_sr',
 'num_misc_sr',
 'ratio_popular_sr',
 'ratio_political_sr',
 'ratio_misc_sr',
 'avg_upvote_ratio',
 'avg_neg',
 'avg_neu',
 'avg_pos',
 'avg_compound',
 'is_sus']

In [71]:
sr_df = pd.read_csv("sr_node_attr.csv")
try:
    sr_df.drop(labels=["Unnamed: 0"], axis=1, inplace=True)
except:
    pass
sr_df

Unnamed: 0,Subreddit,Title,Public Description,Description,topic_type,node_type
0,NeutralPolitics,Neutral Politics: Evidence. Logic. Respect.,Neutral Politics is a community dedicated to e...,##What is Neutral Politics?\n\nNeutral Politic...,political,subreddit
1,Centrist,Centrist Reddit,A subreddit for those who gravitate to the mid...,"Finally, a Reddit for those of us in the middl...",political,subreddit
2,ModeratePolitics,Restore Sanity in Politics!,This is NOT a politically moderate subreddit! ...,"Started by u/sockthepuppetry in 2011, this sub...",political,subreddit
3,PeoplesParty,People's Party,A sub to discuss Maxime Bernier's People's Par...,Welcome to r/PeoplesParty. We're a subreddit d...,political,subreddit
4,Communism,COMMUNISM,For the theory and practice of Marxism.,### [Please read the rules **before** posting....,political,subreddit
...,...,...,...,...,...,...
1349,redditrequest,RedditRequest - Adopt an unmoderated community!,This subreddit is for requesting moderation pr...,**The current review time for requests is 5 da...,misc,subreddit
1350,toosoon,,,,misc,subreddit
1351,dogemarket,DogeMarket,buy and sell things with dogecoin!\n\nRead the...,> **Subreddit Style**\n\n> You are not using t...,misc,subreddit
1352,dogemining,Dogecoin Mining,Dogecoin Mining Subreddit. Many digs. Much Dog...,### Guides\n* [General Mining Guide](/r/dogemi...,misc,subreddit


In [72]:
sr_cols = [col for col in sr_df.columns]
sr_cols.remove("node_type")
sr_cols

['Subreddit', 'Title', 'Public Description', 'Description', 'topic_type']

In [73]:
ordered_cols = ['node_type'] + usr_cols + sr_cols
ordered_cols

['node_type',
 'usr',
 'top_sr',
 'sr_set',
 'num_sr',
 'num_posts',
 'num_comments',
 'num_popular_sr',
 'num_political_sr',
 'num_misc_sr',
 'ratio_popular_sr',
 'ratio_political_sr',
 'ratio_misc_sr',
 'avg_upvote_ratio',
 'avg_neg',
 'avg_neu',
 'avg_pos',
 'avg_compound',
 'is_sus',
 'Subreddit',
 'Title',
 'Public Description',
 'Description',
 'topic_type']

In [74]:
### Add all attributes to both node types, but values will be NaN for the non-node type attributes
for col in usr_cols:
    if col not in sr_df.columns:
        sr_df[col] = 0 
sr_df['topic_type'].replace('political', 0, inplace=True)
sr_df['topic_type'].replace('popular', 1, inplace=True)
sr_df['topic_type'].replace('misc', 2, inplace=True)
sr_df['node_type'].replace('user', 0, inplace=True)
sr_df['node_type'].replace('subreddit', 1, inplace=True)
sr_df = sr_df.reindex(ordered_cols, axis=1)
sr_df

Unnamed: 0,node_type,usr,top_sr,sr_set,num_sr,num_posts,num_comments,num_popular_sr,num_political_sr,num_misc_sr,...,avg_neg,avg_neu,avg_pos,avg_compound,is_sus,Subreddit,Title,Public Description,Description,topic_type
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,NeutralPolitics,Neutral Politics: Evidence. Logic. Respect.,Neutral Politics is a community dedicated to e...,##What is Neutral Politics?\n\nNeutral Politic...,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,Centrist,Centrist Reddit,A subreddit for those who gravitate to the mid...,"Finally, a Reddit for those of us in the middl...",0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,ModeratePolitics,Restore Sanity in Politics!,This is NOT a politically moderate subreddit! ...,"Started by u/sockthepuppetry in 2011, this sub...",0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,PeoplesParty,People's Party,A sub to discuss Maxime Bernier's People's Par...,Welcome to r/PeoplesParty. We're a subreddit d...,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,Communism,COMMUNISM,For the theory and practice of Marxism.,### [Please read the rules **before** posting....,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1349,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,redditrequest,RedditRequest - Adopt an unmoderated community!,This subreddit is for requesting moderation pr...,**The current review time for requests is 5 da...,2
1350,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,toosoon,,,,2
1351,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,dogemarket,DogeMarket,buy and sell things with dogecoin!\n\nRead the...,> **Subreddit Style**\n\n> You are not using t...,2
1352,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,dogemining,Dogecoin Mining,Dogecoin Mining Subreddit. Many digs. Much Dog...,### Guides\n* [General Mining Guide](/r/dogemi...,2


In [75]:
### Add all attributes to both node types, but values will be NaN for the non-node type attributes
for col in sr_cols:
    if col not in usr_df.columns:
        usr_df[col] = 0
usr_df['node_type'].replace('user', 0, inplace=True)
usr_df['node_type'].replace('subreddit', 1, inplace=True)
usr_df['is_sus'].replace(False, 0, inplace=True)
usr_df['is_sus'].replace(True, 1, inplace=True)
usr_df = usr_df.reindex(ordered_cols, axis=1)
usr_df

Unnamed: 0,node_type,usr,top_sr,sr_set,num_sr,num_posts,num_comments,num_popular_sr,num_political_sr,num_misc_sr,...,avg_neg,avg_neu,avg_pos,avg_compound,is_sus,Subreddit,Title,Public Description,Description,topic_type
0,0,*polhold00797,AskReddit,{'AskReddit'},1,0,1,1,0,0,...,0.000000,0.000000,1.000000,0.623900,0,0,0,0,0,0
1,0,*polhold01103,politics,{'politics'},1,0,1,1,0,0,...,0.000000,1.000000,0.000000,0.000000,0,0,0,0,0,0
2,0,*polhold02060,science,{'science'},1,0,4,1,0,0,...,0.050750,0.838250,0.110750,0.318000,0,0,0,0,0,0
3,0,-------------------7,nyc,{'nyc'},1,0,1,0,0,1,...,0.076000,0.665000,0.260000,0.796400,0,0,0,0,0,0
4,0,------------------GL,canadaguns,"{'therewasanattempt', 'canadaguns'}",2,0,26,1,0,1,...,0.059346,0.776769,0.163923,0.200531,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681289,0,zzzzzxx,nvidia,{'nvidia'},1,2,1,0,0,1,...,0.022333,0.862000,0.115333,0.600333,0,0,0,0,0,0
1681290,0,zzzzzzzz33bbbbbbb12,mildlyinteresting,{'mildlyinteresting'},1,0,4,1,0,0,...,0.000000,0.648250,0.351750,0.367800,0,0,0,0,0,0
1681291,0,zzzzzzzzzz555,classiccars,{'classiccars'},1,0,1,0,0,1,...,0.391000,0.609000,0.000000,-0.670500,0,0,0,0,0,0
1681292,0,zzzzzzzzzzzzzzz69,atheism,"{'Firearms', 'atheism'}",2,2,4,0,0,2,...,0.062333,0.841500,0.096167,-0.002833,0,0,0,0,0,0


In [76]:
# Re-apply node attributes if new node attributes defined
name = "mega"
#name = "sus_usr_subreddits_posts"

out_dir = os.path.join(data_path, out_path, name)
mega_net_df = pd.read_csv(os.path.join(out_dir, f"{name}_bip.csv"))
# G = nx.from_pandas_edgelist(mega_net_df, 'node_x', 'node_y', edge_attr=['weight'])
# G = add_node_attr(sr_df, usr_df, G)
# nx.write_graphml(G, os.path.join(out_dir, f"{name}_bip.graphml"))
mega_net_df_top100000 = mega_net_df.iloc[:100000,:]
G_top100000 = nx.from_pandas_edgelist(mega_net_df_top100000, 'node_x', 'node_y', edge_attr=['weight'])
G_top100000 = add_node_attr(sr_df, usr_df, G_top100000)
nx.write_graphml(G_top100000, os.path.join(out_dir, f"{name}_bip_top100000.graphml"))

Setting subreddit attributes...
Setting user attributes...
Setting default 0 values for nodes that didn't have attributes, marking node_type as subreddit...


In [67]:
usr_df[usr_df['usr'] == 'shomyo']

Unnamed: 0,node_type,usr,top_sr,sr_set,num_sr,num_posts,num_comments,num_popular_sr,num_political_sr,num_misc_sr,...,avg_neg,avg_neu,avg_pos,avg_compound,is_sus,Subreddit,Title,Public Description,Description,topic_type
1532837,0,shomyo,uncen,"{'linux', 'witcher', 'rule34', 'cringepics', '...",91,996,398,19,1,66,...,0.188055,0.718294,0.092225,-0.206623,1,,,,,


## Make all from scratch

In [11]:
def make_networks(df_clean, name):
        print(f"~~~ Making {name} network ~~~~")
        out_dir = os.path.join(data_path, out_path, name)
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)
        sr_df, usr_df, df = get_node_attr(df_clean)
        # Bipartite
        df = make_bipartite_graph(df_clean)
        df.to_csv(os.path.join(out_dir, f"{name}_bip.csv"), index=False)
        G = nx.from_pandas_edgelist(df, 'node_x', 'node_y', edge_attr=['weight'])
        G = add_node_attr(sr_df, usr_df, G)
        nx.write_graphml(G, os.path.join(out_dir, f"{name}_bip.graphml"))
        if df.shape[0] > 100000:
            df_top100000 = df.iloc[:100000,:]
            G_top100000 = nx.from_pandas_edgelist(df_top100000, 'node_x', 'node_y', edge_attr=['weight'])
            G_top100000 = add_node_attr(sr_df, usr_df, G_top100000)
            nx.write_graphml(G_top100000, os.path.join(out_dir, f"{name}_bip_top100000.graphml"))

# Separate per csv
for fn in os.listdir(data_path):
    if fn.endswith(".csv"):
        name = fn.split('.')[0]
        type_sr = fn.split('_subreddits')[0]
        df = pd.read_csv(os.path.join(data_path, fn))
        print(f"=== Parsing {fn} {df.shape} ===")
        df_clean = cleanup(df)
        # analyze(df_clean)
        mega_df = pd.concat([mega_df, df_clean])
        combo_category_df[type_sr] = pd.concat([combo_category_df[type_sr], df_clean])
        if os.path.exists(os.path.join(data_path, out_path, name)):
            continue
        # df_clean = filter_min_commenters(df_clean)
        make_networks(df_clean, name)
        print("\n")


# Separate per category
for sr,df in combo_category_df.items():
    # df_clean = filter_min_commenters(df)
    make_networks(df_clean, sr)
    print("\n")
     
# Merge all
mega_df = filter_min_commenters(mega_df)
name = "mega"
make_networks(mega_df, name)

=== Parsing political_subreddits_new.csv (282927, 16) ===
Original df size: (282927, 16)
After removing messed up rows: (282927, 16)
After removing known bots df size: (278127, 16)
After drop dup author/link id and NaN df size: (269620, 12)
=== Parsing popular_subreddits_new.csv (375710, 16) ===
Original df size: (375710, 16)
After removing messed up rows: (375710, 16)
After removing known bots df size: (374114, 16)
After drop dup author/link id and NaN df size: (366073, 12)


ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

# Sentiment Analysis

In [21]:
data_path = "data_scraping\\data_1month" # Change this to path where .csv are located
# Separate per csv
for fn in os.listdir(data_path):
    if fn.endswith(".csv"):
        name = fn.split('.')[0]
        df = pd.read_csv(os.path.join(data_path, fn))
        print(f"=== Parsing {fn} {df.shape} ===")
        df_sia = run_sentiment_analysis(df)
        df_sia.to_csv(os.path.join(data_path, fn))

=== Parsing political_subreddits_new.csv (282927, 16) ===
Concat content..
Obtaining polarity scores
Creating new dataframe...
=== Parsing popular_subreddits_new.csv (375710, 16) ===
Concat content..
Obtaining polarity scores
Creating new dataframe...
=== Parsing sus_usr_subreddits_0_new.csv (687008, 16) ===
Concat content..
Obtaining polarity scores
Creating new dataframe...
=== Parsing sus_usr_subreddits_10_new.csv (49622, 16) ===
Concat content..
Obtaining polarity scores
Creating new dataframe...
=== Parsing sus_usr_subreddits_1_new.csv (667336, 16) ===
Concat content..
Obtaining polarity scores
Creating new dataframe...
=== Parsing sus_usr_subreddits_2_new.csv (238142, 16) ===
Concat content..
Obtaining polarity scores
Creating new dataframe...
=== Parsing sus_usr_subreddits_3_new.csv (995680, 16) ===
Concat content..
Obtaining polarity scores
Creating new dataframe...
=== Parsing sus_usr_subreddits_4_new.csv (847003, 16) ===
Concat content..
Obtaining polarity scores
Creating new

# Toxicity Analysis

In [18]:
data_path = "data_scraping\\data_1month" # Change this to path where .csv are located
# Separate per csv
for fn in os.listdir(data_path):
    if fn.endswith(".csv"):
        name = fn.split('.')[0]
        df = pd.read_csv(os.path.join(data_path, fn), index_col=0)
        df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
        print(f"=== [{datetime.now()}] Parsing {fn} {df.shape} ===")
        df_toxic = run_toxic_analysis(df)
        df_toxic.to_csv(os.path.join(data_path, fn))

=== [2023-11-26 13:52:37.523143] Parsing political_subreddits_new.csv (282927, 20) ===
Concat content..
Obtaining toxicity scores on size (282927, 20)


KeyboardInterrupt: 

# Similarity Analysis

In [71]:
data_path = "data_scraping\\data_1month" # Change this to path where .csv are located
out_path = os.path.join(data_path, 'usr_comment_similarity.csv')
# Separate per csv
for fn in os.listdir(data_path):
    if fn.endswith(".csv"):
        name = fn.split('.')[0]
        df = pd.read_csv(os.path.join(data_path, fn), index_col=0)
        df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
        print(f"=== [{datetime.now()}] Parsing {fn} {df.shape} ===")
        df = find_comment_similarity(df)
        if os.path.isfile(out_path):
            df.to_csv(out_path, mode='a', header=False, index=False)
        else:
            df.to_csv(out_path, header=False, index=False)

=== [2023-11-26 17:41:30.822883] Parsing political_subreddits_new.csv (282927, 20) ===
Calculating similarity on size (103488, 2)
