# Project 3 - Quantifying TV Laughter: A Data-Backed Guide for Brooklyn Nine-Nine and Big Bang Theory Investment

### Contents:
- [Background](#Background)
- [Scraping Reddit using API](#Scraping-Reddit-using-API)
- [Data Import & Cleaning](#Data-Import-and-Cleaning)
- [Exploratory Data Analysis](#Exploratory-Data-Analysis)

# Background

# Scraping Reddit using API

In [1]:
from pprint import pprint
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import time
import random
import os
import ipywidgets as widgets
from IPython.display import display


brooklynninenine has 113 columns while bigbangtheory has 11 columns. For both datasets, we will only extract the columns present in bigbangtheory.
['title', 'selftext', 'author_flair_text', 'link_flair_text', 'score', 'upvote_ratio', 'distinguished', 'is_original_content', 'is_self', 'num_comments', 'subreddit']
    
FYI: Initially I wanted to extract these columns for brooklynninenine (b99):
- #subreddit
- #title
- #selftext
- #link_flair_css_class (Eg. meme, discussion etc)
- #upvote_ratio
- #ups
- #link_flair_text (Eg. Humour, Discussion etc)
- #post_hint (image, NaN)
- #author
- #num_comments
- #subreddit_subscribers

In [2]:
def scrape(subreddit_name, limit_val):
    """Scrape a subreddit listing page by page and save it as a CSV file.

    Requests ``https://www.reddit.com/r/<subreddit_name>.json`` repeatedly,
    following the ``after`` cursor returned with each page, and writes the
    columns listed in ``col_to_save`` to ``../data/<subreddit_name>.csv``
    (relative to the current working directory). The file is extended after
    every page, so an interrupted run still leaves the pages fetched so far.

    Args:
        subreddit_name: Subreddit name, used in both the URL and the file name.
        limit_val: Number of posts wanted. Each request only gives 25 rows,
            so ``int(limit_val / 25)`` pages are requested.

    Returns:
        None. The result is the CSV file on disk.
    """
    url = 'https://www.reddit.com/r/{}.json'.format(subreddit_name)

    # Save next to the notebook's parent folder, in '../data/<subreddit>.csv'.
    target_directory = os.path.join(os.getcwd(), '..', 'data')
    os.makedirs(target_directory, exist_ok=True)
    file_path = os.path.join(target_directory, '{}.csv'.format(subreddit_name))

    col_to_save = ['title', 'selftext', 'author_flair_text', 'link_flair_text', 'score', 'upvote_ratio', 'distinguished',
                   'is_original_content', 'is_self', 'num_comments', 'subreddit']

    # Number of pages to request (each page only gives 25 rows).
    range_val = int(limit_val / 25)

    print(subreddit_name)
    # The widget is created and updated but not rendered; call
    # display(progress_bar) here to show it in the notebook.
    progress_bar = widgets.IntProgress(min=0, max=range_val, value=0)

    after = None

    for a in range(range_val):
        # requests omits query parameters whose value is None, so the first
        # request goes to the bare listing URL.
        res = requests.get(
            url,
            params={'after': after},
            headers={'User-agent': 'Pony Inc 1.0'},
            timeout=30,
        )

        if res.status_code != 200:
            print('Status error', res.status_code)
            break

        listing = res.json()['data']
        current_posts = [p['data'] for p in listing['children']]
        after = listing['after']

        # reindex (rather than df[col_to_save]) keeps the column layout stable
        # even if a page is empty or lacks one of the columns.
        page_df = pd.DataFrame(current_posts).reindex(columns=col_to_save)
        first_page = a == 0
        # Overwrite on the first page, append afterwards: no need to re-read
        # and rewrite everything that was already saved.
        page_df.to_csv(
            file_path,
            mode='w' if first_page else 'a',
            header=first_page,
            index=False,
        )

        # Update the progress bar value
        progress_bar.value = a + 1

        # No cursor means the listing is exhausted; requesting again would
        # restart from the first page and save duplicate posts.
        if after is None:
            break

        # Pause a random 2-6 seconds between requests to look more 'natural'.
        if a + 1 < range_val:
            time.sleep(random.randint(2, 6))

In [3]:
# Uncomment the following to scrape data again

#scrape('dataanalyst', 1500)

# Data Import and Cleaning

In [4]:
# Load the two saved subreddit listings from ../data (relative to the
# current working directory) into one DataFrame each.
# NOTE(review): scrape() saves '<subreddit>.csv'; the '_hot' suffix used here
# is presumably from a manual rename - confirm the files exist under this name.
current_directory = os.getcwd()
frames = []
for relative_path in ('../data/brooklynninenine_hot.csv',
                      '../data/bigbangtheory_hot.csv'):
    file_path = os.path.join(current_directory, relative_path)
    frames.append(pd.read_csv(file_path))
df_b99, df_bbt = frames

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\LENOVO YOGA CORE I5\\Documents\\2023\\1 Data Science Immersive\\Projects\\project-3-quantifying-laughter\\code\\../data/brooklynninenine_hot.csv'

In [None]:
# Print the column summary of the brooklynninenine data, then show its first rows.
df_b99.info()
display(df_b99.head())

In [None]:
# Print the column summary of the bigbangtheory data, then show its first rows.
df_bbt.info()
display(df_bbt.head())

In [None]:
# Build a single 'posts' text column (title + ' ' + selftext) for each
# subreddit; only this merged text is used from here on.
for frame in (df_bbt, df_b99):
    # A missing selftext would turn the whole concatenation into NaN,
    # so replace NaN with an empty string first.
    frame['selftext'] = frame['selftext'].fillna('')
    frame['posts'] = frame['title'] + ' ' + frame['selftext']
    display(frame.head(5))

In [None]:
# Stack both subreddits into the single dataset used for EDA, keeping only
# the merged text and its subreddit label (bigbangtheory rows first).
eda_columns = ['posts', 'subreddit']
stacked = pd.concat([df_bbt[eda_columns], df_b99[eda_columns]])
# NOTE(review): reset_index() without drop=True keeps each frame's old row
# labels as an extra 'index' column - confirm whether that column is wanted.
df = stacked.reset_index()

In [None]:
# Count the non-null entries of every column per subreddit (class balance check).
df.groupby('subreddit').count()

# Exploratory Data Analysis

Interesting features to look for in the posts:
- Exclamation marks (!)
- Capital letters / words written in all uppercase
- Emojis (open question: how to detect them?)