# Part 2: Find Influencers from Reddit

### Select the '0sanitymemes' as our subreddit

In [1]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import zstandard as zstd
import numpy as np

# Function to read a .zst compressed file into a pandas DataFrame
def read_zst(file_path):
    with open(file_path, 'rb') as fh:
        dctx = zstd.ZstdDecompressor(max_window_size=2147483648)
        with dctx.stream_reader(fh) as reader:
            return pd.read_json(reader, lines=True)

filepath_to_submissions = '0sanitymemes_submissions.zst'
filepath_to_comments = '0sanitymemes_comments.zst'

df_submissions = read_zst(filepath_to_submissions)
df_comments = read_zst(filepath_to_comments)


In [5]:
df_submissions.shape

(15451, 102)

In [6]:
df_comments.shape

(188828, 59)

In [7]:
df_submissions.columns

Index(['all_awardings', 'allow_live_comments', 'archived', 'author',
       'author_created_utc', 'author_flair_background_color',
       'author_flair_css_class', 'author_flair_template_id',
       'author_flair_text', 'author_flair_text_color',
       ...
       'is_gallery', 'awarders', 'banned_by', 'can_mod_post', 'removed_by',
       'retrieved_on', 'subreddit_name_prefixed', 'view_count', 'collections',
       'call_to_action'],
      dtype='object', length=102)

In [8]:
df_submissions[:10].head()

Unnamed: 0,all_awardings,allow_live_comments,archived,author,author_created_utc,author_flair_background_color,author_flair_css_class,author_flair_template_id,author_flair_text,author_flair_text_color,...,is_gallery,awarders,banned_by,can_mod_post,removed_by,retrieved_on,subreddit_name_prefixed,view_count,collections,call_to_action
0,[],True,True,[deleted],,,,,,dark,...,,,,,,,,,,
1,[],False,True,szechein,1585930000.0,,,,,,...,,,,,,,,,,
2,[],False,True,UnderpaidMook,1591861000.0,#cc5289,,dffa0ec2-afb3-11ea-8542-0ea276052b53,#JusticeForKevin,light,...,,,,,,,,,,
3,[],False,True,Bombywolf,1539660000.0,#0079d3,,e674a67c-afb3-11ea-82d4-0e3ee59aea75,"""Dokutah, is there something under the desk?""",light,...,,,,,,,,,,
4,[],False,True,Infinitale,1500000000.0,#ea0027,,0289e858-dcfb-11ea-bb9f-0e65462c93c9,The Talulah of 0SanityMemes,light,...,,,,,,,,,,


In [9]:
df_submissions.head()

Unnamed: 0,all_awardings,allow_live_comments,archived,author,author_created_utc,author_flair_background_color,author_flair_css_class,author_flair_template_id,author_flair_text,author_flair_text_color,...,is_gallery,awarders,banned_by,can_mod_post,removed_by,retrieved_on,subreddit_name_prefixed,view_count,collections,call_to_action
0,[],True,True,[deleted],,,,,,dark,...,,,,,,,,,,
1,[],False,True,szechein,1585930000.0,,,,,,...,,,,,,,,,,
2,[],False,True,UnderpaidMook,1591861000.0,#cc5289,,dffa0ec2-afb3-11ea-8542-0ea276052b53,#JusticeForKevin,light,...,,,,,,,,,,
3,[],False,True,Bombywolf,1539660000.0,#0079d3,,e674a67c-afb3-11ea-82d4-0e3ee59aea75,"""Dokutah, is there something under the desk?""",light,...,,,,,,,,,,
4,[],False,True,Infinitale,1500000000.0,#ea0027,,0289e858-dcfb-11ea-bb9f-0e65462c93c9,The Talulah of 0SanityMemes,light,...,,,,,,,,,,


In [10]:
df_comments.columns

Index(['all_awardings', 'associated_award', 'author', 'author_created_utc',
       'author_flair_background_color', 'author_flair_css_class',
       'author_flair_richtext', 'author_flair_template_id',
       'author_flair_text', 'author_flair_text_color', 'author_flair_type',
       'author_fullname', 'author_patreon_flair', 'author_premium', 'awarders',
       'body', 'can_gild', 'can_mod_post', 'collapsed',
       'collapsed_because_crowd_control', 'collapsed_reason', 'comment_type',
       'controversiality', 'created_utc', 'distinguished', 'edited', 'gilded',
       'gildings', 'id', 'is_submitter', 'link_id', 'locked', 'no_follow',
       'parent_id', 'permalink', 'quarantined', 'removal_reason',
       'retrieved_on', 'score', 'send_replies', 'stickied', 'subreddit',
       'subreddit_id', 'subreddit_name_prefixed', 'subreddit_type',
       'top_awarded_type', 'total_awards_received', 'treatment_tags',
       'author_cakeday', 'archived', 'collapsed_reason_code', 'name',
       

In [11]:
df_comments.head()

Unnamed: 0,all_awardings,associated_award,author,author_created_utc,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,author_flair_text_color,...,archived,collapsed_reason_code,name,retrieved_utc,score_hidden,author_is_blocked,unrepliable_reason,editable,media_metadata,Link Type
0,[],,szechein,,,,[],,,,...,,,,,,,,,,respond to a comment
1,[],,UnderpaidMook,,#cc5289,,[],dffa0ec2-afb3-11ea-8542-0ea276052b53,#JusticeForKevin,light,...,,,,,,,,,,respond to a comment
2,[],,Infinitale,1500000000.0,#ea0027,,[],0289e858-dcfb-11ea-bb9f-0e65462c93c9,The Talulah of 0SanityMemes,light,...,,,,,,,,,,respond to a comment
3,[],,UnderpaidMook,,#cc5289,,[],dffa0ec2-afb3-11ea-8542-0ea276052b53,#JusticeForKevin,light,...,,,,,,,,,,respond to a comment
4,[],,Infinitale,1500000000.0,#ea0027,,[],0289e858-dcfb-11ea-bb9f-0e65462c93c9,The Talulah of 0SanityMemes,light,...,,,,,,,,,,respond to a comment


### Merge the submissions and comments data

In [2]:
# Create a dictionary to quickly check if a parent_id is a submission
submission_id_set = set(df_submissions['id'])

# Function to categorize the link type
def categorize_link(parent_id):
    # Prefix 't3_' is for submissions, 't1_' for comments in Reddit's API
    if parent_id in submission_id_set:
        return 'respond to a submission'
    else:
        return 'respond to a comment'

# Apply the function to the comments DataFrame
df_comments['Link Type'] = df_comments['parent_id'].apply(categorize_link)

# Construct the data for network analysis
network_data = df_comments[['id', 'parent_id', 'Link Type']]
network_data.rename(columns={'id': 'Child ID', 'parent_id': 'Parent ID'}, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  network_data.rename(columns={'id': 'Child ID', 'parent_id': 'Parent ID'}, inplace=True)


### Network Analysis

In [3]:
# Create a graph for the network analysis
G = nx.from_pandas_edgelist(network_data, source='Parent ID', target='Child ID', edge_attr='Link Type', create_using=nx.DiGraph())

# Information about the graph
print("Number of nodes:", G.number_of_nodes())
print("Number of edges:", G.number_of_edges())
print("Network density:", nx.density(G))

Number of nodes: 273680
Number of edges: 188828
Network density: 2.5210523141837023e-06
