## Subsetting Data
by Santiago Segovia

Note: Run this notebook on Colab

In [1]:
import pandas as pd

from google.colab import drive

In [2]:
# Mount GDrive
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
# Load data (takes 2 mins to load `comments`)
data_path = "/content/drive/Shareddrives/adv-ml-project/Data/"
comments = pd.read_csv(data_path + "the-reddit-climate-change-dataset-comments.csv")
posts = pd.read_csv(data_path + "the-reddit-climate-change-dataset-posts.csv")

In [4]:
# Define date
comments['date'] = pd.to_datetime(comments['created_utc'], unit='s')
posts['date'] = pd.to_datetime(posts['created_utc'], unit='s')

In [6]:
# Define label
def create_label(sentiment):
    if sentiment < -0.05:
        return -1
    elif sentiment >= -0.05 and sentiment <= 0.05:
        return 0
    else:
        return 1

In [28]:
comments['label'] = comments['sentiment'].apply(create_label)

In [9]:
initial_comments_shape = comments.shape
initial_posts_shape = posts.shape
print("Number of records in comments df:", initial_comments_shape[0])
print("Number of records in posts df:", initial_posts_shape[0])

Number of records in comments df: 4600698
Number of records in posts df: 620908


In [13]:
# We keep columns we'll use in the analysis
comments = comments[['subreddit.name','date','body','sentiment','label']]
posts = posts[['subreddit.name','date','title']]

In [16]:
# Subset by date (keep every record from 2015 onwards)
comments = comments[comments['date']>='2015-01-01']
posts = posts[posts['date']>='2015-01-01']

In [18]:
mid_comments_shape = comments.shape
mid_posts_shape = posts.shape
print("Number of records in comments df:", mid_comments_shape[0])
print(" Reduction of", round((initial_comments_shape[0] - mid_comments_shape[0]) * 100 / initial_comments_shape[0] - 1,2), "% vs. original")
print("Number of records in posts df:", mid_posts_shape[0])
print(" Reduction of", round((initial_posts_shape[0] - mid_posts_shape[0]) * 100 / initial_posts_shape[0] - 1,2), "% vs. original")

Number of records in comments df: 4338011
 Reduction of 4.71 % vs. original
Number of records in posts df: 566808
 Reduction of 7.71 % vs. original


In [19]:
# Subset by number of subreddits that have 5000 or more comments
def count_categories(categories):
    category_counts = {}
    for category in categories:
        if category in category_counts:
            category_counts[category] += 1
        else:
            category_counts[category] = 1

    return list(category_counts.items())

In [20]:
subreddits = count_categories(comments['subreddit.name'])
sorted_subreddits = sorted(subreddits, key=lambda x: x[1], reverse=True)

In [21]:
sorted_subreddits[:5]

[('politics', 339167),
 ('worldnews', 332778),
 ('askreddit', 240389),
 ('collapse', 92702),
 ('news', 89337)]

In [22]:
#Dropping subreddits that have less than "threshold" posts (e.g. 3K, 10K, etc)
def drop_tuples_below_threshold(tuples_list, threshold):
    to_keep = []
    cat_num = []
    for name, count in tuples_list:
        if count >= threshold:
          to_keep.append(name)
          cat_num.append((name, count))

    return to_keep, cat_num

In [23]:
categories, counts_categories  = drop_tuples_below_threshold(sorted_subreddits, 100000)

In [24]:
comments = comments[comments['subreddit.name'].isin(categories)]
posts = posts[posts['subreddit.name'].isin(categories)]

In [25]:
end_comments_shape = comments.shape
end_posts_shape = posts.shape
print("Number of records in comments df:", end_comments_shape[0])
print(" Reduction of", round((initial_comments_shape[0] - end_comments_shape[0]) * 100 / initial_comments_shape[0] - 1,2),"% vs. original")
print("Number of records in posts df:", end_posts_shape[0])
print(" Reduction of", round((initial_posts_shape[0] - end_posts_shape[0]) * 100 / initial_posts_shape[0] - 1,2),"% vs. original")

Number of records in comments df: 912334
 Reduction of 79.17 % vs. original
Number of records in posts df: 36416
 Reduction of 93.14 % vs. original


In [30]:
# Export files
comments.to_csv(data_path + 'comments_filtered.csv', index=False)
posts.to_csv(data_path + 'posts_filtered.csv', index=False)