In [33]:
# Import code libraries to send requests, to clean data, convert dates, etc...
import requests
import re
from pprint import pprint
from datetime import datetime, timedelta

# Function for date conversion (pushshift.io reddit database server requires posix style date)
def date2posix(date):
    return int(datetime.strptime(date, '%Y-%m-%d').strftime('%s'))

def posix2date(posix):
    return datetime.utcfromtimestamp(posix).strftime('%Y-%m-%d')

def clean(text):
    return re.sub('[^a-zA-Z0-9 .,@!?%&\t\n;:\-())]', ' ', text)

# Question 1:

## 1) Retrieve the data

In [34]:
# Prepare request to send download the reddit comments from the website database
query_arguments = {'q':         'global warming',
                   'subreddit': 'politics',
                   'sort':      'asc',
                   'size':      '3000',
                   'after':     date2posix('2019-1-1'),
                   'before':    date2posix('2019-4-1')}

# Send request
pol_response = requests.get('https://api.pushshift.io/reddit/search/comment', params=query_arguments)

# Retrieve the 'data' field from request
pol_comments = pol_response.json().get('data')

query_arguments = {'q':         'global warming',
                   'subreddit': 'The_Donald',
                   'sort':      'asc',
                   'size':      '3000',
                   'after':     date2posix('2019-1-1'),
                   'before':    date2posix('2019-4-1')}

# Send request
td_response = requests.get('https://api.pushshift.io/reddit/search/comment', params=query_arguments)

# Retrieve the 'data' field from request
td_comments = td_response.json().get('data')

# e.g the first comment in this list of comments:
print('First comment:')
print(td_comments[1].get('body'))
print(pol_comments[1].get('body'))

First comment:
Canada is turning into a socialist hellhole.  Free speech is done. Global warming is our burden somehow.  Immigrants leaking in like the Titantic.
In terms of priority, yes. In terms of importance, no.

Let me explain. I deeply care about healtcare, war and stopping global warming... but with a Congress that’s accountable to corporations before people, we will not get the solutions we need.  

Basically what I’m saying is: let’s fix Congress so that they actually do what is right. Otherwise it’s like adding water to a pool that has a huge crack in its base. Sure, it will be usable for longer, but sooner or later it’s unsustainable


## 2) Named Entity Relation Extraction

In [37]:
import pandas as pd
import spacy
from collections import Counter
sp = spacy.load("en_core_web_sm")


entities_td = []

# for each comment:
for comment in td_comments:
    
    # extract the named entities
    entities = sp(comment['body'])
    
    # For each entity add it to the list to be counted
    for word in entities.ents:
        #print(word.text, word.label_)
        
        # Filter for person only entities 
        if word.label_ == 'PERSON':
            entities_td.append(word.text + ' ' + word.label_)

# Count the different entities in the list
counted_entities = Counter(entities_td)

# Print the 10 most common found entities
counted_entities.most_common(10)

[('Al Gore PERSON', 22),
 ('Trump PERSON', 9),
 ('Global PERSON', 9),
 ('Maurice Strong PERSON', 6),
 ('Crichton PERSON', 5),
 ('Hillary PERSON', 5),
 ('Obama PERSON', 4),
 ('Soros PERSON', 4),
 ('Left PERSON', 4),
 ('Agenda 21 PERSON', 3)]

To create a word tree form text https://www.jasondavies.com/wordtree/

# Question 2

## 1) Retrieve the data

In [49]:
# Prepare request to send download the reddit comments from the website database
query_arguments = {'q':         'global warming',
                   'subreddit': 'politics',
                   'sort':      'asc',
                   'size':      '100',
                   'after':     date2posix('2019-1-1'),
                   'before':    date2posix('2019-4-1')}

# Send request
pol_response = requests.get('https://api.pushshift.io/reddit/search/comment', params=query_arguments)

# Retrieve the 'data' field from request
pol_comments = pol_response.json().get('data')

query_arguments = {'q':         'global warming',
                   'subreddit': 'The_Donald',
                   'sort':      'asc',
                   'size':      '100',
                   'after':     date2posix('2019-1-1'),
                   'before':    date2posix('2019-4-1')}

# Send request
td_response = requests.get('https://api.pushshift.io/reddit/search/comment', params=query_arguments)

# Retrieve the 'data' field from request
td_comments = td_response.json().get('data')

# e.g the first comment in this list of comments:
print('First comment:')
print(td_comments[1].get('body'))
print(pol_comments[1].get('body'))

First comment:
Canada is turning into a socialist hellhole.  Free speech is done. Global warming is our burden somehow.  Immigrants leaking in like the Titantic.
In terms of priority, yes. In terms of importance, no.

Let me explain. I deeply care about healtcare, war and stopping global warming... but with a Congress that’s accountable to corporations before people, we will not get the solutions we need.  

Basically what I’m saying is: let’s fix Congress so that they actually do what is right. Otherwise it’s like adding water to a pool that has a huge crack in its base. Sure, it will be usable for longer, but sooner or later it’s unsustainable


## 2) extract causal relations

In [None]:
# Prepare request to send this comment(s) to the causal relation extractor

# prepare a list of texts to send (but clean it first)
texts_td = [clean(comment['body']) for comment in td_comments]
texts_pol = [clean(comment['body']) for comment in pol_comments]

# Send the request
json_data = {'texts': texts_td, 'frames': ['Causation']}
response  = requests.post('https://penelope.vub.be/semantic-frame-extractor/texts-extract-causes-effects', json=json_data)
data_td      = response.json()['causalRelations']

# Send the request
json_data = {'texts': texts_pol, 'frames': ['Causation']}
response  = requests.post('https://penelope.vub.be/semantic-frame-extractor/texts-extract-causes-effects', json=json_data)
data_pol      = response.json()['causalRelations']

# Create a list of causes and a list of effects
causes_td = []
effects_td = []
for frame in data_td:
    causes_td.append(frame['cause'])
    effects_td.append(frame['effect'])
    
# Create a list of causes and a list of effects
causes_pol = []
effects_pol = []
for frame in data_pol:
    causes_pol.append(frame['cause'])
    effectspol.append(frame['effect'])
    

# get the most occuring causes
# Instead of just counting sentences, you could cluster the different according to their similarity
counted_causes_td = Counter(causes_td).most_common(20)
counted_effects_td = Counter(effects_td).most_common(20)
print(counted_causes_td)

counted_causes_pol = Counter(causes_pol).most_common(20)
counted_effects_pol = Counter(effects_pol).most_common(20)
print(counted_causes_pol)

## 3) Plot the most frequent effects or causes

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# create a barchart of the most frequent effects of subreddit the_donald

keys = counted_effects_td.keys()
y_pos = np.arange(len(keys))
counts = [counted_effects_td[k] for k in keys]
error = np.random.rand(len(keys))

plt.barh(y_pos, counts, alpha=0.4)
plt.yticks(y_pos, keys)

plt.show()

In [None]:
# create a barchart of the most frequent effects of subreddit politics

keys = counted_effects_pol.keys()
y_pos = np.arange(len(keys))
counts = [counted_effects_pol[k] for k in keys]
error = np.random.rand(len(keys))

plt.barh(y_pos, counts, alpha=0.4)
plt.yticks(y_pos, keys)

plt.show()