# Project 3: Web API and NLP Data Importing (Universal Studios)

## Importing Libraries

In [28]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
from time import sleep
import json, os


## Instantiate webdriver

In [35]:
## instantiate driver
## check the version of Google Chrome and download the matching version of chromedriver
## `driver` is the browser session that the later cells use to load and scroll pages
driver = webdriver.Chrome()

In [36]:
## Open the "social grep" search page, which lists old posts of a subreddit.
## (The subreddit itself lives at https://www.reddit.com/r/<subreddit>/.)

subreddit = 'universalstudios' # subreddit to search -- choose by yourself
start_date = '2010-01-01' # value passed to the search's `after:` filter -- choose by yourself

url = f'https://socialgrep.com/search?query=%2Fr%2F{subreddit}%2Cafter%3A{start_date}&order_by=oldest'

driver.get(url)

repeat_time = 4   # how many times to scroll to the bottom
waiting_time = 2  # seconds to pause after each scroll

## scroll to the bottom of the page, pausing after every scroll
scroll_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"
for scroll_step in range(repeat_time):
    driver.execute_script(scroll_to_bottom)
    sleep(waiting_time)

## Scraping function and an example post

In [31]:
## function to scrape
def get_content(post, subreddit):
    """Extract vote, title, text and date from one search-result card.

    Parameters
    ----------
    post
        One element of `soup.select('div.card-body')`; only `select_one`,
        `.a` and the `.text` / `get_text` of the selected nodes are used.
    subreddit : str
        Subreddit name, used to recognise the card whose title is just
        "/r/<subreddit>" and which carries no text.

    Returns
    -------
    dict or None
        {"vote": int, "title": str, "text": str or None, "date": str},
        or None when the card has no link, has no parsable date, or is the
        text-less "/r/<subreddit>" card.
    """
    ## vote: a missing or non-numeric counter counts as 0
    try:
        vote = int(post.select_one('span.text-info').text)
    except (AttributeError, TypeError, ValueError):
        vote = 0

    ## title: a card without a link is not a post
    try:
        title = post.a.text
    except AttributeError:
        return None

    ## text: optional -- a missing or whitespace-only body becomes None
    try:
        text = post.select_one('div.post_content').get_text(separator='\n').strip() or None
    except AttributeError:
        text = None

    ## date: second comma-separated field of the subtitle; a card without it is
    ## skipped, because the scraping loop resumes from the date of the last kept post
    try:
        date = post.select_one('h6.card-subtitle').text.split(',')[1].strip()
    except (AttributeError, IndexError):
        return None

    if text is None and title == f"/r/{subreddit.lower()}":
        return None

    return {
        "vote" : vote,
        "title" : title,
        "text" : text,
        "date" : date
    }

In [37]:
## Parse the rendered page with an explicitly named parser: without one, BeautifulSoup
## picks whichever parser happens to be installed and emits GuessedAtParserWarning.
soup = BeautifulSoup(driver.page_source, 'html.parser')
posts = soup.select('div.card-body') # content is under here

get_content(posts[1], subreddit) # show the second card as an example

{'vote': 2,
 'title': 'IOACentral, what happened to you?',
 'text': None,
 'date': '2011-07-22'}

## Scraping loop (paginated by post date)

In [38]:
## Decide where scraping starts: from `start_date`, or from the date of the last post
## saved by an earlier run. `url` is rebuilt in both cases, so this cell does not
## depend on whatever value `url` was left with.
## NOTE(review): the file is opened with the platform default encoding, like the writer
## in the scraping loop; consider passing encoding='utf-8' on both sides.
json_path = f'{subreddit}.json'

scraped_data = []
new_date = start_date
if os.path.exists(json_path):
    ## resume scraping from the last date in the json file
    with open(json_path, 'r') as f:
        scraped_data = json.load(f)
    ## an existing but empty list has no last post to resume from
    if scraped_data:
        new_date = scraped_data[-1]['date']

url = f'https://socialgrep.com/search?query=%2Fr%2F{subreddit}%2Cafter%3A{new_date}&order_by=oldest'

In [39]:
## scrape and append to `scraped_data`
## RUN THIS CELL AGAIN AND AGAIN until getting the latest post.
## The loop ends early once a page no longer advances the date of the last post,
## because the next request would be identical and only add duplicates.
## `repeat_time` / `waiting_time` are the scroll settings defined where the first page is loaded.
## NOTE(review): the file is written with the platform default encoding while
## ensure_ascii=False keeps non-ASCII characters; consider encoding='utf-8' here and
## in the cell that reloads the file.

max_pages = 800 # upper bound on result pages fetched per run of this cell
json_path = f'{subreddit}.json'
tmp_path = f'{json_path}.tmp'
previous_date = None # date of the last post after the preceding page (None before the first)

for _ in tqdm(range(max_pages)):

    ## load the page, then scroll to the bottom and wait
    driver.get(url)
    for i in range(repeat_time):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sleep(waiting_time)

    ## get HTML (parser named explicitly so the result does not depend on installed parsers)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    posts = soup.select('div.card-body')

    ## iterate each post
    for post in posts:
        one_post_dict = get_content(post, subreddit)
        if one_post_dict is not None:
            scraped_data.append(one_post_dict)

    ## nothing has been scraped at all: there is no date to continue from
    if not scraped_data:
        break

    ## save to json: write a temporary file first and swap it in, so an interrupt
    ## during the write cannot leave a truncated file behind
    ## (indent=0 puts every item on its own line without leading spaces)
    with open(tmp_path, 'w') as f:
        json.dump(scraped_data, f, indent=0, ensure_ascii=False)
    os.replace(tmp_path, json_path)

    ## set new date; stop when it did not move since the previous page
    new_date = scraped_data[-1]['date']
    if new_date == previous_date:
        break
    previous_date = new_date
    url = f'https://socialgrep.com/search?query=%2Fr%2F{subreddit}%2Cafter%3A{new_date}&order_by=oldest'


  8%|▊         | 62/800 [14:49<2:56:26, 14.34s/it]


KeyboardInterrupt: 

## Convert to a DataFrame and drop duplicates

In [21]:
## load every scraped post, then keep one copy of rows that are identical in all columns
scraped_posts = pd.read_json(f'{subreddit}.json')
df = scraped_posts.drop_duplicates()
df

Unnamed: 0,vote,title,text,date
0,0,Harry Potter premiere guests get free access t...,,2011-07-22
1,2,"IOACentral, what happened to you?",,2011-07-22
2,2,"My First Trip to Halloween Horror Nights, Orla...",Any tips from seasoned pros for my first trip ...,2011-08-17
3,2,Halloween Horror Nights 2011 at Universal Orla...,,2011-09-09
4,0,Hollywood Blvd Tour – Beverly Hills Tours and ...,,2013-05-18
...,...,...,...,...
3993,1,Anyone know when the Jurassic Park 30th annive...,I’m planning a at trip in the next two months ...,2023-06-21
3994,1,I like DreamWorks,[removed],2023-06-30
3995,0,Express Passes needed around Chirstmas time?,Planning to spend 2 days at USJ. I recently re...,2023-10-23
3996,2,Upgrading to a season pass at the park,Anyone know how much they tend to charge at th...,2023-11-03


In [22]:
## missing value in text
## number of missing values in each column (a post scraped without body text has a missing `text`)
df.isna().sum()

vote        0
title       0
text     1657
date        0
dtype: int64

In [23]:
## rows whose text is exactly the placeholder [removed] or [deleted]
placeholder_texts = ['[removed]', '[deleted]']
df[df['text'].isin(placeholder_texts)]

Unnamed: 0,vote,title,text,date
21,1,Reminder: Universal Orlando Resort’s Holidays ...,[deleted],2013-12-04
37,1,First Time Universal Studio Trip Planning,[deleted],2014-09-03
42,2,"annual pass holders, help me out.",[deleted],2014-09-22
43,0,Universal Studios Hollywood City Walk - Random...,[deleted],2014-09-30
70,2,The awkward moment...,[deleted],2015-03-23
...,...,...,...,...
3938,1,SNW TOP Coin Gain 5/14-5/31,[deleted],2023-06-01
3945,1,Toadstool reservations,[deleted],2023-06-02
3961,1,USJ Dilemma,[deleted],2023-06-09
3971,1,Trip Notes: 6/11/23,[deleted],2023-06-11


In [26]:
## keep posts that have text other than the [removed] / [deleted] placeholders,
## then keep the first row of every distinct text
has_text = df['text'].notna()
is_placeholder = df['text'].isin(['[removed]', '[deleted]'])
universal_df = df[has_text & ~is_placeholder].drop_duplicates(subset=['text'])

## save the filtered posts as a list of records
universal_df.to_json('universal_dropped.json', orient='records', indent=True)
universal_df

Unnamed: 0,vote,title,text,date
2,2,"My First Trip to Halloween Horror Nights, Orla...",Any tips from seasoned pros for my first trip ...,2011-08-17
18,1,Why are the prices for tickets cheaper during ...,I was planning on going this year and it seems...,2013-10-23
24,2,Handicap Policy,My family has a trip planned to Universal Stud...,2014-03-15
25,7,Halloween Horror Nights 2014,Halloween Horror Nights 2014 information is st...,2014-03-24
27,1,Harry Potter soft opening? If I'm there when i...,My fiance and I are going to be at Universal f...,2014-05-13
...,...,...,...,...
3991,1,Im confused on how the neighbor pass actually ...,I live in texas and looking to go to Universal...,2023-06-19
3993,1,Anyone know when the Jurassic Park 30th annive...,I’m planning a at trip in the next two months ...,2023-06-21
3995,0,Express Passes needed around Chirstmas time?,Planning to spend 2 days at USJ. I recently re...,2023-10-23
3996,2,Upgrading to a season pass at the park,Anyone know how much they tend to charge at th...,2023-11-03


In [27]:
## read the saved file back to check what was written
pd.read_json('universal_dropped.json')

Unnamed: 0,vote,title,text,date
0,2,"My First Trip to Halloween Horror Nights, Orla...",Any tips from seasoned pros for my first trip ...,2011-08-17
1,1,Why are the prices for tickets cheaper during ...,I was planning on going this year and it seems...,2013-10-23
2,2,Handicap Policy,My family has a trip planned to Universal Stud...,2014-03-15
3,7,Halloween Horror Nights 2014,Halloween Horror Nights 2014 information is st...,2014-03-24
4,1,Harry Potter soft opening? If I'm there when i...,My fiance and I are going to be at Universal f...,2014-05-13
...,...,...,...,...
1496,1,Im confused on how the neighbor pass actually ...,I live in texas and looking to go to Universal...,2023-06-19
1497,1,Anyone know when the Jurassic Park 30th annive...,I’m planning a at trip in the next two months ...,2023-06-21
1498,0,Express Passes needed around Chirstmas time?,Planning to spend 2 days at USJ. I recently re...,2023-10-23
1499,2,Upgrading to a season pass at the park,Anyone know how much they tend to charge at th...,2023-11-03
