# Sentence Extraction

In [53]:
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/MyDrive/COLX_523')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Getting Started

In [54]:
import time
import numpy as np
import pandas as pd
import pickle
import os.path as path
import csv
import random, os, json, csv
random.seed(523)

import nltk
nltk.download('stopwords', quiet=True) | nltk.download('punkt', quiet=True)
from nltk import sent_tokenize, word_tokenize
from sklearn.feature_extraction import DictVectorizer
from collections import Counter

In [55]:
def load_master(master_dir):
    """Returns three master files present in `master_dir`. If such directory does
    not exist, create one and three master files, and then return them. 
    """    
    with open(os.path.join(master_dir, 'submission.json'), 'r', encoding='utf-8') as f:
        submission_dict = json.load(f)
    with open(os.path.join(master_dir, 'delta_thread.json'), 'r', encoding='utf-8') as f:
        delta_thread_dict = json.load(f)
    with open(os.path.join(master_dir, 'log.json'), 'r') as f:
        log_dict = json.load(f)

    return submission_dict, delta_thread_dict, log_dict

In [56]:
MASTER_DIR = 'reddit_corpus' 
submission_dict, delta_thread_dict, log_dict = load_master(MASTER_DIR)

# Extract `climate change` submissions and threads

In [57]:
KEYWORD = 'climate change'
s_df = pd.DataFrame(submission_dict)
cc_idx = s_df[s_df['submission_title'].str.contains(KEYWORD, case=False)].index
cc_submission_ids = s_df.loc[cc_idx]['submission_id']
#print(cc_submission_ids.values)  # f0y5af  g0tkib  p7h645
cc_thread = []
for key in delta_thread_dict:
    s_id, _ = key.split('-')
    if s_id in cc_submission_ids.values:
        cc_thread.append(key)
print(len(cc_submission_ids), 'topics', len(cc_thread), 'threads')  # 15 (feb26)  --> 32 topics 41 threads (mar4)

76 topics 117 threads


# Extract the sentences for annotation

In [58]:
def get_submission_title(submission_id):
    idx = submission_dict['submission_id'].index(submission_id)
    return submission_dict['submission_title'][idx]

In [59]:
def get_submission_body(submission_id):
    idx = submission_dict['submission_id'].index(submission_id)
    return submission_dict['submission_body'][idx]

In [60]:
comment = delta_thread_dict['f0y5af-fh0f5n6'][0]
comment

{'author': 'yyzjertl',
 'comment': 'I mean, I know several Trump supporters who believe climate change is happening and is human caused, and think what Trump says about this issue is technically wrong.',
 'comment_id': 'fgzuyrk',
 'created_utc': 1581197912.0,
 'parent_id': 't3_f0y5af',
 'timestamp': '2020-02-08 21:38:32'}

In [62]:
# Extract the sentences for annotations
KEYWORDS = [
            'carbon emission', 'co2 emission', 'carbon footprint', 'responsibility', 'responsibilities',   # 'climate change', 
            'control', 'action', 'supporter', 'solution', 'ozone hole', 'green energy', 'nuclear energy',
            'fossil fuel',

            'trump', 'biden', 'country', 'countries', 'government', 'policy', 'bill', 'conservative', 
            'democrat', 'republican', 'state', 

            'consumer', 'people', 'individual', 'human', 'group', 'generation',

            'experiment', 'researcher', 'research', 'academia', 'study', 'studies', 'data',

            'company', 'corporation', 'industry', 'economy',
            ]

def contain_keyword(sent):
    flag = False
    for keyword in KEYWORDS:
        if keyword in sent:
            print(keyword)
            flag = True
            break
    return flag

In [63]:
MIN, MAX = 15, 200
max_length = 0  # max number of words within a sentence
word_count_raw1, word_count_filtered1 = 0, 0

with open('climate_change_raw.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['id', 'title', 'focus_sentence'])
    sent_count = 0
    for thread_id in cc_thread:
        s_idx, _ = thread_id.split('-')  # submissionID, commentID
        for comment_dict in delta_thread_dict[thread_id]:
            comment = comment_dict['comment'].replace('\n\n', ' ')
            for sent in sent_tokenize(comment):
                words = word_tokenize(sent)
                word_count_raw1 += len(words)
                if (MIN <= len(words) < MAX) and (contain_keyword(sent)):
                    writer.writerow([
                                    thread_id, 
                                    get_submission_title(s_idx), 
                                    sent,
                    ])
                    sent_count += 1
                    word_count_filtered1 += len(words)
                    if len(words) > max_length:
                        max_length = len(words)
    print(word_count_raw1, "raw words in the threads")
    print(word_count_filtered1, "words remained after filtering by sentence length and keywords")
    print("Maximum length of a sentence in the threads", max_length, "words")

    max_length = 0
    word_count_raw2, word_count_filtered2 = 0, 0
    for submission_id in cc_submission_ids.values:
        submission_body = get_submission_body(submission_id)
        for sent in sent_tokenize(submission_body):
              words = word_tokenize(sent)
              word_count_raw2 += len(words)
              if (MIN <= len(words) < MAX) and (contain_keyword(sent)):
                  writer.writerow([
                                  submission_id,
                                  get_submission_title(submission_id),
                                  sent
                  ])
                  sent_count += 1
                  word_count_filtered2 += len(words)
                  if len(words) > max_length:
                      max_length = len(words)
    print(word_count_raw2, "raw words in the submissions")
    print(word_count_filtered2, "words remained after filtering by sentence length and keywords")
    print("Maximum length of a sentence in the topic bodies", max_length, "words")

print(sent_count)

supporter
supporter
republican
trump
research
conservative
data
control
control
action
conservative
data
government
data
study
conservative
conservative
individual
individual
action
people
control
conservative
action
conservative
corporation
countries
countries
government
people
country
carbon emission
people
people
solution
policy
individual
fossil fuel
economy
fossil fuel
fossil fuel
action
consumer
responsibility
consumer
government
consumer
government
solution
human
human
bill
countries
state
fossil fuel
fossil fuel
human
bill
bill
bill
fossil fuel
action
responsibility
people
human
control
bill
people
bill
action
countries
countries
fossil fuel
fossil fuel
countries
control
action
carbon footprint
fossil fuel
fossil fuel
bill
carbon emission
human
people
consumer
company
company
people
consumer
green energy
people
people
consumer
fossil fuel
consumer
consumer
people
control
people
people
people
people
action
people
people
responsibility
responsibility
responsibility
responsibility