# Data Collection - Wildfires & Climate Change 

- developer: name
- personal-use-script: A
- secret: ABC

In [1]:
import glob
import os
import pickle
from datetime import datetime

import matplotlib
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import praw
import pydot
from networkx.drawing.nx_pydot import graphviz_layout

In [2]:
# Credentials are read from environment variables so that the client secret and
# the account password never end up in the notebook file or its history.
# Set all five variables before starting the kernel; a missing one raises
# KeyError right here.
reddit = praw.Reddit(client_id=os.environ['REDDIT_CLIENT_ID'],
                     client_secret=os.environ['REDDIT_CLIENT_SECRET'],
                     user_agent=os.environ['REDDIT_USER_AGENT'],
                     username=os.environ['REDDIT_USERNAME'],
                     password=os.environ['REDDIT_PASSWORD']
                    )

In [3]:
# Names of the subreddits that the collection cells below loop over.
sub_list = ['climatechange', 'climateskeptics']

## Comment Dataframe for multiple Subreddits

* Iterating through a list of subreddits
* Iterating through an unlimited number of hot submissions that contain the keyword "fire" in each subreddit
* Getting the comments for each submission
* Saving the author (if not deleted)
* Saving the comment body (if not deleted)
* Append dataframe-list with comment id, body, author name, upvotes, timestamp, comment level (depth) and parent id and subreddit name

In [None]:
# Build one row per comment for every hot submission whose title contains
# "fire", across all subreddits in sub_list.
#
# The row list is created here so the cell runs on a fresh kernel and does not
# pile up duplicate rows when it is executed again.
df_list = []

for sub in sub_list:
    subreddit = reddit.subreddit(sub)
    for submission in subreddit.hot(limit=None):
        # Case-sensitive title filter, identical to the graph-building cell.
        if "fire" not in submission.title:
            continue

        # NOTE(review): unlike the graph-building cell, replace_more() is not
        # called here, so the listing presumably still contains unexpanded
        # "more comments" placeholders; the AttributeError guards below skip
        # them. Confirm whether the two cells should collect the same set.
        for comment in submission.comments.list():
            # Comments persist after their author's account is deleted; in
            # that case there is no author object to read a name from.
            try:
                authorname = comment.author.name
            except AttributeError:
                authorname = "[deleted]"

            # Stored as a plain string (the earlier trailing comma wrapped the
            # body in a 1-tuple).
            try:
                comment_body = comment.body
            except AttributeError:
                comment_body = "[deleted]"

            try:
                df_list.append([
                    comment.id,
                    comment_body,
                    authorname,
                    comment.ups,
                    comment.created_utc,
                    comment.depth,
                    comment.parent_id[3:],  # drop the 3-character type prefix
                    sub,                    # subreddit name, not the praw object
                ])
            except AttributeError:
                # Entries lacking the regular comment attributes are skipped.
                continue

reddit_df = pd.DataFrame(
    df_list,
    columns=["id", "body", "authorname", "ups", "created_utc", "depth", "parent_id", "subreddit"],
)

In [None]:
# Turn the epoch-seconds column into timestamps. pd.to_datetime works on the
# whole column at once and yields timezone-naive UTC values, which is what the
# per-row datetime.utcfromtimestamp call (deprecated since Python 3.12) produced.
reddit_df["date"] = pd.to_datetime(reddit_df["created_utc"], unit="s")

display(reddit_df.head(10))
reddit_df.shape

This collection was run in April 2020 for the Australian wildfires and again in October 2020 for the Californian wildfires (with the same keyword settings).

In [None]:
# Persist the collected comments in two formats: a pickle for reloading in
# Python and a CSV (without the index column) for use elsewhere.
with open('fire_reddit_df.pickle', 'wb') as pickle_file:
    pickle.dump(reddit_df, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

reddit_df.to_csv('fire_reddit_comments_df.csv', index=False, encoding='utf-8')

## Create comment tree (acyclic graph) for each subreddit


In [None]:
# Build one directed comment graph per subreddit and write it to "<name>.gexf".
# Every comment of a "fire" submission becomes a node carrying its depth and
# author name; every edge points from a reply to the comment it replies to.
for sub in sub_list:

    G = nx.DiGraph()
    subreddit = reddit.subreddit(sub)

    for submission in subreddit.hot(limit=None):
        # Case-sensitive title filter, identical to the comment-table cell.
        if "fire" not in submission.title:
            continue

        submission.comments.replace_more(limit=None)

        # Flatten the comment forest once and reuse it for both passes.
        comments = submission.comments.list()

        # Pass 1: nodes. A comment without an author object gets the same
        # "[deleted]" placeholder that the comment table uses.
        for comment in comments:
            author = comment.author
            name = author.name if author is not None else "[deleted]"
            G.add_node(comment.id, depth=comment.depth, name=name)

        # Pass 2: edges. Only link to parents that are themselves nodes, so
        # top-level comments (whose parent is not a comment node) stay roots.
        for comment in comments:
            if not comment.parent_id:
                continue
            parent = comment.parent_id[3:]  # drop the 3-character type prefix
            if G.has_node(parent):
                G.add_edge(comment.id, parent)

    print(sub, G.number_of_nodes(), 'nodes')

    nx.write_gexf(G, '%s.gexf' % sub)

## Combine Australia and California Data

In [None]:
# Load the comment tables of both collection waves, tag each row with its wave
# and stack them into one table.
# NOTE: pickle.load can execute arbitrary code contained in the file; only load
# pickles that were written by this notebook.
with open('aus_fire_reddit_df.pickle', 'rb') as handle:
    reddit_df_aus = pickle.load(handle)

with open('cal_fire_reddit_df.pickle', 'rb') as handle:
    reddit_df_cal = pickle.load(handle)


reddit_df_aus['wave'] = 'australia'
reddit_df_cal['wave'] = 'california'

# DataFrame.append was removed in pandas 2.0. pd.concat stacks the rows the
# same way and likewise keeps each frame's original index labels.
reddit_df = pd.concat([reddit_df_aus, reddit_df_cal])

In [13]:
# Write the combined two-wave table to disk: once as a pickle, once as a CSV
# without the index column.
with open('reddit_df.pickle', 'wb') as pickle_file:
    pickle.dump(reddit_df, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

reddit_df.to_csv('reddit_df.csv', index=False, encoding='utf-8')

## Find original submission to comment trees

In [None]:
# Reload the combined comment table from its pickle and show it.
with open('reddit_df.pickle', 'rb') as pickle_file:
    reddit_df = pickle.load(pickle_file)

display(reddit_df)

## Climatechange Australia

In [None]:
# Find the submission that started each comment thread in the
# climatechange / Australia graph, fetch its metadata and save the table.
g = nx.read_gexf("climatechange_aus.gexf")

# One subgraph per weakly connected component. The former helper
# nx.weakly_connected_component_subgraphs was removed in networkx 2.4.
threads = [g.subgraph(nodes) for nodes in nx.weakly_connected_components(g)]

subreddit_threads = []

for thread in threads:
    # Rows of the comment table lying on the thread's longest reply chain,
    # shallowest comment first.
    comment_branch = list(nx.dag_longest_path(thread))
    lpcb_df = reddit_df[reddit_df["id"].isin(comment_branch)]
    temp_sorted = lpcb_df.sort_values("depth", ascending=True)

    # No comment of this chain is present in the table: nothing to look up.
    if temp_sorted.empty:
        continue

    # Only a depth-0 comment has the submission itself as parent; a deeper
    # first row would hand a comment id to the submission lookup below.
    first_row = temp_sorted.iloc[0]
    if first_row["depth"] == 0:
        subreddit_threads.append(first_row["parent_id"])

# Fetch every distinct submission exactly once, in first-seen order. (This loop
# used to sit inside the thread loop, which re-fetched all earlier submissions
# for each thread and appended duplicated rows.)
subs = {}
for cs in dict.fromkeys(subreddit_threads):
    s = reddit.submission(id='{}'.format(cs))
    title = s.title
    author = s.author
    # Keep the author's name rather than the praw object so the table holds
    # plain values; a submission without an author keeps None.
    author_name = author.name if author is not None else None
    subs[cs] = (s.id, title, author_name, s.ups, s.num_comments)

# Indexed by submission id; also valid (empty) when no thread matched.
all_trigger_subs_df = pd.DataFrame(
    list(subs.values()),
    index=list(subs.keys()),
    columns=["id", "title", "author", "ups_sub", "n_comments_sub"],
)

display(all_trigger_subs_df)

# Store data (serialize)
with open('trigger_climatechange_aus.pickle', 'wb') as handle:
    pickle.dump(all_trigger_subs_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

# write to csv
all_trigger_subs_df.to_csv(r'trigger_climatechange_aus.csv', index = True, header=True)

## Climateskeptics Australia

In [None]:
# Find the submission that started each comment thread in the
# climateskeptics / Australia graph, fetch its metadata and save the table.
g = nx.read_gexf("climateskeptics_aus.gexf")

# One subgraph per weakly connected component. The former helper
# nx.weakly_connected_component_subgraphs was removed in networkx 2.4.
threads = [g.subgraph(nodes) for nodes in nx.weakly_connected_components(g)]

subreddit_threads = []

for thread in threads:
    # Rows of the comment table lying on the thread's longest reply chain,
    # shallowest comment first.
    comment_branch = list(nx.dag_longest_path(thread))
    lpcb_df = reddit_df[reddit_df["id"].isin(comment_branch)]
    temp_sorted = lpcb_df.sort_values("depth", ascending=True)

    # No comment of this chain is present in the table: nothing to look up.
    if temp_sorted.empty:
        continue

    # Only a depth-0 comment has the submission itself as parent; a deeper
    # first row would hand a comment id to the submission lookup below.
    first_row = temp_sorted.iloc[0]
    if first_row["depth"] == 0:
        subreddit_threads.append(first_row["parent_id"])

# Fetch every distinct submission exactly once, in first-seen order. (This loop
# used to sit inside the thread loop, which re-fetched all earlier submissions
# for each thread and appended duplicated rows.)
subs = {}
for cs in dict.fromkeys(subreddit_threads):
    s = reddit.submission(id='{}'.format(cs))
    title = s.title
    author = s.author
    # Keep the author's name rather than the praw object so the table holds
    # plain values; a submission without an author keeps None.
    author_name = author.name if author is not None else None
    subs[cs] = (s.id, title, author_name, s.ups, s.num_comments)

# Indexed by submission id; also valid (empty) when no thread matched.
all_trigger_subs_df = pd.DataFrame(
    list(subs.values()),
    index=list(subs.keys()),
    columns=["id", "title", "author", "ups_sub", "n_comments_sub"],
)

display(all_trigger_subs_df)

# Store data (serialize)
with open('trigger_climateskeptics_aus.pickle', 'wb') as handle:
    pickle.dump(all_trigger_subs_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

# write to csv
all_trigger_subs_df.to_csv(r'trigger_climateskeptics_aus.csv', index = True, header=True)

## Climatechange California

In [None]:
# Find the submission that started each comment thread in the
# climatechange / California graph, fetch its metadata and save the table.
g = nx.read_gexf("climatechange_cal.gexf")

# One subgraph per weakly connected component. The former helper
# nx.weakly_connected_component_subgraphs was removed in networkx 2.4.
threads = [g.subgraph(nodes) for nodes in nx.weakly_connected_components(g)]

subreddit_threads = []

for thread in threads:
    # Rows of the comment table lying on the thread's longest reply chain,
    # shallowest comment first.
    comment_branch = list(nx.dag_longest_path(thread))
    lpcb_df = reddit_df[reddit_df["id"].isin(comment_branch)]
    temp_sorted = lpcb_df.sort_values("depth", ascending=True)

    # No comment of this chain is present in the table: nothing to look up.
    if temp_sorted.empty:
        continue

    # Only a depth-0 comment has the submission itself as parent; a deeper
    # first row would hand a comment id to the submission lookup below.
    first_row = temp_sorted.iloc[0]
    if first_row["depth"] == 0:
        subreddit_threads.append(first_row["parent_id"])

# Fetch every distinct submission exactly once, in first-seen order. (This loop
# used to sit inside the thread loop, which re-fetched all earlier submissions
# for each thread and appended duplicated rows.)
subs = {}
for cs in dict.fromkeys(subreddit_threads):
    s = reddit.submission(id='{}'.format(cs))
    title = s.title
    author = s.author
    # Keep the author's name rather than the praw object so the table holds
    # plain values; a submission without an author keeps None.
    author_name = author.name if author is not None else None
    subs[cs] = (s.id, title, author_name, s.ups, s.num_comments)

# Indexed by submission id; also valid (empty) when no thread matched.
all_trigger_subs_df = pd.DataFrame(
    list(subs.values()),
    index=list(subs.keys()),
    columns=["id", "title", "author", "ups_sub", "n_comments_sub"],
)

display(all_trigger_subs_df)

# Store data (serialize)
with open('trigger_climatechange_cal.pickle', 'wb') as handle:
    pickle.dump(all_trigger_subs_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

# write to csv
all_trigger_subs_df.to_csv(r'trigger_climatechange_cal.csv', index = True, header=True)

## Climateskeptics California

In [None]:
# Find the submission that started each comment thread in the
# climateskeptics / California graph, fetch its metadata and save the table.
g = nx.read_gexf("climateskeptics_cal.gexf")

# One subgraph per weakly connected component. The former helper
# nx.weakly_connected_component_subgraphs was removed in networkx 2.4.
threads = [g.subgraph(nodes) for nodes in nx.weakly_connected_components(g)]

subreddit_threads = []

for thread in threads:
    # Rows of the comment table lying on the thread's longest reply chain,
    # shallowest comment first.
    comment_branch = list(nx.dag_longest_path(thread))
    lpcb_df = reddit_df[reddit_df["id"].isin(comment_branch)]
    temp_sorted = lpcb_df.sort_values("depth", ascending=True)

    # No comment of this chain is present in the table: nothing to look up.
    if temp_sorted.empty:
        continue

    # Only a depth-0 comment has the submission itself as parent; a deeper
    # first row would hand a comment id to the submission lookup below.
    first_row = temp_sorted.iloc[0]
    if first_row["depth"] == 0:
        subreddit_threads.append(first_row["parent_id"])

# Fetch every distinct submission exactly once, in first-seen order. (This loop
# used to sit inside the thread loop, which re-fetched all earlier submissions
# for each thread and appended duplicated rows.)
subs = {}
for cs in dict.fromkeys(subreddit_threads):
    s = reddit.submission(id='{}'.format(cs))
    title = s.title
    author = s.author
    # Keep the author's name rather than the praw object so the table holds
    # plain values; a submission without an author keeps None.
    author_name = author.name if author is not None else None
    subs[cs] = (s.id, title, author_name, s.ups, s.num_comments)

# Indexed by submission id; also valid (empty) when no thread matched.
all_trigger_subs_df = pd.DataFrame(
    list(subs.values()),
    index=list(subs.keys()),
    columns=["id", "title", "author", "ups_sub", "n_comments_sub"],
)

display(all_trigger_subs_df)

# Store data (serialize)
with open('trigger_climateskeptics_cal.pickle', 'wb') as handle:
    pickle.dump(all_trigger_subs_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

# write to csv
all_trigger_subs_df.to_csv(r'trigger_climateskeptics_cal.csv', index = True, header=True)