# Web Scraping Notebook

---

### Import Modules & Read in Data Frame

In [1]:
import json
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup

In [2]:
# Path to the SQR CSV (the variant without comments, going by the file name)
sqr_csv_path = '/Users/kevinmacmat/Documents/flatiron/module_projects/capstone/csv/sqr_no_comments.csv'
df = pd.read_csv(sqr_csv_path)

---

### District Borough Number (DBN) List

Create a list of all DBNs to append to the end of the https://insideschools.org/school/ URL.

In [3]:
# Collect every value of the 'dbn' column into a plain Python list
dbn_list = list(df['dbn'])

Adjust the `dbn_list` range in order to scrape and save in batches. I ran into issues when trying to scrape too much at one time.

In [4]:
# Select the batch of schools to scrape. The slice is taken from the full
# 'dbn' column rather than from dbn_list itself, so re-running this cell with
# a different range always selects from the complete list instead of from an
# already-truncated one.
BATCH_START, BATCH_STOP = 0, 3
dbn_list = list(df.dbn)[BATCH_START:BATCH_STOP]

---

### Scrape

Use Selenium's headless mode option so that a browser window does not open for every school's website. A path to the downloaded chromedriver executable must be set for Selenium to function properly.

In [5]:
# Absolute path to the locally downloaded chromedriver executable
chromedriver_path = '/Users/kevinmacmat/Documents/flatiron/module_projects/capstone/chromedriver'

# Build the Chrome options and request headless mode
options = webdriver.ChromeOptions()
options.add_argument("headless")

# Instantiate the Chrome driver with the chromedriver path and the options above
# NOTE(review): the driver path is passed positionally; presumably this depends
# on the installed Selenium version — confirm before upgrading Selenium.
driver = webdriver.Chrome(chromedriver_path, options=options)

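Collect comments created from 2014 through 2020 (the years the loop below filters on) into one list per comment feature; these lists are later gathered into `output_list`. The year cutoff was chosen to match the years for which SQRs are available.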

In [6]:
# Start with an empty container; the scraped feature lists are added to it later
output_list = list()

In [7]:
# Instantiate containers for comment features. These are parallel lists:
# position i in every list describes the same comment.
raw_message = []
depth = []
dislikes = []
likes = []
name = []
dbns = []

# A comment is kept only if one of these years appears in its 'createdAt' value
comment_years = ('2014', '2015', '2016', '2017', '2018', '2019', '2020')

for dbn in dbn_list:
    # Get website
    driver.get('https://insideschools.org/school/' + dbn)
    # Switch to iframe containing script tag (the frame at index 1)
    driver.switch_to.frame(1)
    # Grab the iframe's page source
    text = driver.page_source
    # Switch out of iframe
    driver.switch_to.default_content()
    # Parse the source with BeautifulSoup
    soup = BeautifulSoup(text, 'lxml')
    # Find the script element that carries the thread data
    thread = soup.find("script", {"id": "disqus-threadData"})
    # Without the script element there is nothing to decode for this school;
    # report it and move on instead of failing on an empty JSON document.
    if thread is None or not thread.string:
        print('No disqus-threadData script found for {}; skipping'.format(dbn))
        continue
    # Read the JSON text directly from the tag's contents rather than slicing
    # the serialized tag at fixed character offsets.
    site_json = json.loads(thread.string)
    # Loop over the posts, keep those created in the wanted years, and append
    # each feature to its list.
    for comment in site_json['response']['posts']:
        # NOTE(review): this keeps the original substring match on 'createdAt';
        # if the value always starts with the year, comparing the first four
        # characters would be stricter — confirm the format.
        if not any(year in comment['createdAt'] for year in comment_years):
            continue
        raw_message.append(comment['raw_message'])
        depth.append(comment['depth'])
        dislikes.append(comment['dislikes'])
        likes.append(comment['likes'])
        name.append(comment['author']['name'])
        dbns.append(dbn)

In [16]:
# Print the length of each feature list, one per line. The lists are filled
# together in the scraping loop, so all six numbers should match.
for feature_values in (dbns, name, raw_message, likes, dislikes, depth):
    print(len(feature_values))

26
26
26
26
26
26


In [None]:
# Add the six feature lists to output_list, in this order:
# dbns, name, raw_message, likes, dislikes, depth.
# Each list is added as a single element, so re-running this cell adds six more.
output_list.extend([dbns, name, raw_message, likes, dislikes, depth])

---

### Make Data Frame

In [17]:
# Assemble the per-comment data frame: one row per scraped comment, one column
# per feature list.
# NOTE(review): the 'replies' column is filled from `depth` — confirm that the
# depth value is what this column is meant to report.
batch_df = pd.DataFrame({
    'dbn': dbns,
    'username': name,
    'comment': raw_message,
    'likes': likes,
    'dislikes': dislikes,
    'replies': depth,
})

In [18]:
# Preview only the first rows instead of rendering the whole frame
batch_df.head(10)

Unnamed: 0,dbn,username,comment,likes,dislikes,replies
0,01M015,P.S. 15 Parent,P.S. 15 is an extraordinary small school that ...,0,0,0
1,01M015,Houleye Sy,A Hidden Gem!\nAmazing community school that f...,0,0,0
2,01M015,newslink,PS 15 is among the schools with the most impro...,0,0,0
3,01M015,newslink,PS 15 second graders won a city-wide ferry nam...,0,0,0
4,01M015,newslink,NYC Department of Education officials barred a...,0,0,0
5,01M015,Newslink,PS 15 is poised to benefit from District 1's p...,0,0,0
6,01M015,newslink,PS 15 Principal Irene Sanchez supports the Dis...,0,0,1
7,01M015,newslink,District 1 has proposed a plan to attract more...,0,0,0
8,01M015,newslink,As public officials urge more schools to addre...,0,0,0
9,01M015,newslink,“When you have a mix of students and everyone’...,0,0,0


In [None]:
# Create a dataframe with a single 'dbn' column, one row per entry in dbn_list
# NOTE(review): this rebinds batch_df, replacing the per-comment frame built
# earlier in the notebook — confirm that this is intended before running.
batch_df = pd.DataFrame(dbn_list, columns=['dbn'])

In [None]:
# Add the recently scraped comments to the comments column: each row receives
# the list of comment texts collected for that row's DBN (an empty list when
# none were collected).
# output_list cannot be assigned directly: it holds whole feature lists (six
# per run of the append cell) rather than one entry per row, so the assignment
# raises a length-mismatch ValueError unless the lengths happen to coincide.
comments_by_dbn = {}
for school_dbn, message in zip(dbns, raw_message):
    comments_by_dbn.setdefault(school_dbn, []).append(message)
batch_df['comments'] = [comments_by_dbn.get(school_dbn, []) for school_dbn in batch_df['dbn']]

---

### Convert data frame to CSV and export

In [None]:
# Write the batch to a CSV file in the working directory, omitting the index
output_csv = 'batch_1000-end_comments.csv'
batch_df.to_csv(output_csv, index=False)