# Web Scraping Notebook

---

### Import Modules & Read in Data Frame

In [1]:
import json
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup

In [2]:
df = pd.read_csv('/Users/kevinmacmat/Documents/flatiron/module_projects/capstone/csv/sqr_no_comments.csv')

---

### Data Base Number (DBN) & Features List

Create a list of all DBN's to loop over and pass into end of https://insideschools.org/school/ url. Also instantiate lists for all relevant SQR features to add to the database. 

In [3]:
dbn_list = list(df.dbn)
grade_level_list = list(df.school_type)
enrollment_list = list(df.enrollment)
fam_comm_ties_list = list(df.fam_comm_ties_rating)
pct_ell_list = list(df.pct_ell)
pct_disabilities_list = list(df.pct_disabilities)
pct_self_contained_list = list(df.pct_self_contained)
economic_need_index_list = list(df.economic_need_index)
pct_temp_housing_list = list(df.pct_temp_housing)
pct_hra_eligible_list = list(df.pct_hra_eligible)
pct_asian_list = list(df.pct_asian)
pct_black_list = list(df.pct_black)
pct_hispanic_list = list(df.pct_hispanic)
pct_white_list = list(df.pct_white)
pct_chronic_absent_list = list(df.pct_chronic_absent)
borough_list = list(df.borough)

Adjust dbn_list scraping range in order to scrape/pull info from sqr data and insideschools.org, and save in batches. Ran into issues when trying to scrape too much at one time. 

In [4]:
# Adjust range to scrape from 1076 dbn entries below
scrape_range_start = 925
# scrape_range_stop = 950
scrape_range_stop = 1076

dbn_list = dbn_list[scrape_range_start:scrape_range_stop]
grade_level_list = grade_level_list[scrape_range_start:scrape_range_stop]
enrollment_list = enrollment_list[scrape_range_start:scrape_range_stop]
fam_comm_ties_list = fam_comm_ties_list[scrape_range_start:scrape_range_stop]
pct_ell_list = pct_ell_list[scrape_range_start:scrape_range_stop]
pct_disabilities_list = pct_disabilities_list[scrape_range_start:scrape_range_stop]
pct_self_contained_list = pct_self_contained_list[scrape_range_start:scrape_range_stop]
ecomonic_need_index_list = economic_need_index_list[scrape_range_start:scrape_range_stop]
pct_temp_housing_list = pct_temp_housing_list[scrape_range_start:scrape_range_stop]
pct_hra_eligible_list = pct_hra_eligible_list[scrape_range_start:scrape_range_stop]
pct_asian_list = pct_asian_list[scrape_range_start:scrape_range_stop]
pct_black_list = pct_black_list[scrape_range_start:scrape_range_stop]
pct_hispanic_list = pct_hispanic_list[scrape_range_start:scrape_range_stop]
pct_white_list = pct_white_list[scrape_range_start:scrape_range_stop]
pct_chronic_absent_list = pct_chronic_absent_list[scrape_range_start:scrape_range_stop]
borough_list = borough_list[scrape_range_start:scrape_range_stop]

---

### Scrape

Use Selenium's headless mode option so browser does not continually open with every school's website. Must set a path to Selenium's downloaded chromedriver in order to function properly. 

In [5]:
# Create an instance of ChromeOptions
options = webdriver.ChromeOptions()
# Run headless mode 
options.add_argument("headless")
# Instatiate chrome driver and pass in the file path to chromedriver
driver = webdriver.Chrome('/Users/kevinmacmat/Documents/flatiron/module_projects/capstone/chromedriver', options=options) 

Get comments and other metadata for past 6 years and output them to list-containers to make dataframe. The 6 year cutoff was determined due to the SQR's availability for those years.

In [6]:
# Instantiate containers for comment features
raw_message = []
depth = []
dislikes = []
likes = []
name = []
dbns = []
post_date = []
borough = []
grade_level = []
enrollment = []
fam_comm_ties = []
pct_ell = []
pct_disabilities = []
pct_self_contained = []
economic_need_index = []
pct_temp_housing = []
pct_hra_eligible = []
pct_asian = []
pct_black = []
pct_hispanic = []
pct_white = []
pct_chronic_absent = []

for index, dbn in enumerate(dbn_list):
    # Get website 
    driver.get('https://insideschools.org/school/' + dbn)
    # Switch to iframe containing script tag
    driver.switch_to.frame(1)
    # Grab the text
    text = driver.page_source
    # Switch out of iframe
    driver.switch_to.default_content()
    # Parse and process the source with BeautifulSoup module by creating an BS object
    soup = BeautifulSoup(text, 'lxml')
    # Access the soup and find the script element's id
    thread = soup.find("script", {"id": "disqus-threadData"})
    # Turn the bs4 tag into a string, remove the script tag, and access the json
#     json_string = str(thread)[49:-9]
    json_string = str(thread)[48:-9]
    site_json = json.loads(json_string)
    # Navigate and loop json, filtering comments by date, to append comments to comments_list
    for comment in site_json['response']['posts']:
        if '2014' or '2015' or '2016' or '2017' or '2018' or '2019' or '2020' in comment['createdAt']:
            raw_message.append(comment['raw_message'])
            depth.append(comment['depth'])
            dislikes.append(comment['dislikes'])
            likes.append(comment['likes'])
            name.append(comment['author']['name'])
            post_date.append(comment['createdAt'])
            dbns.append(dbn)
            borough.append(borough_list[index])
            grade_level.append(grade_level_list[index])
            enrollment.append(enrollment_list[index])
            fam_comm_ties.append(fam_comm_ties_list[index])
            pct_ell.append(pct_ell_list[index])
            pct_disabilities.append(pct_disabilities_list[index])
            pct_self_contained.append(pct_self_contained_list[index])
            economic_need_index.append(economic_need_index_list[index])
            pct_temp_housing.append(pct_temp_housing_list[index])
            pct_hra_eligible.append(pct_hra_eligible_list[index])
            pct_asian.append(pct_asian_list[index])
            pct_black.append(pct_black_list[index])
            pct_hispanic.append(pct_hispanic_list[index])
            pct_white.append(pct_white_list[index])
            pct_chronic_absent.append(pct_chronic_absent_list[index])
        else:
            continue

In [11]:
# Check that all lists are of the same length before combining into dataframe
print(len(dbns))
print(len(name))
print(len(raw_message))
print(len(likes))
print(len(dislikes))
print(len(depth))
print(len(post_date))
print(len(borough))
print(len(grade_level))
print(len(enrollment))
print(len(fam_comm_ties))
print(len(pct_ell))
print(len(pct_disabilities))
print(len(pct_self_contained))
print(len(economic_need_index))
print(len(pct_temp_housing))
print(len(pct_hra_eligible))
print(len(pct_asian))
print(len(pct_black))
print(len(pct_hispanic))
print(len(pct_white))
print(len(pct_chronic_absent))

2257
2257
2257
2257
2257
2257
2257
2257
2257
2257
2257
2257
2257
2257
2257
2257
2257
2257
2257
2257
2257
2257


---

### Combine Lists To Make Data Frame

In [12]:
batch_df = pd.DataFrame()
batch_df['dbn'] = dbns
batch_df['username'] = name
batch_df['borough'] = borough
batch_df['grade_level'] = grade_level
batch_df['enrollment'] = enrollment
batch_df['comment'] = raw_message
batch_df['likes'] = likes
batch_df['dislikes'] = dislikes
batch_df['replies'] = depth
batch_df['post_date'] = post_date
batch_df['fam_comm_ties'] = fam_comm_ties
batch_df['pct_ell'] = pct_ell
batch_df['pct_disabilities'] = pct_disabilities
batch_df['pct_self_contained'] = pct_self_contained
batch_df['economic_need_index'] = economic_need_index
batch_df['pct_temp_housing'] = pct_temp_housing
batch_df['pct_hra_eligible'] = pct_hra_eligible
batch_df['pct_asian'] = pct_asian
batch_df['pct_black'] = pct_black
batch_df['pct_hispanic'] = pct_hispanic
batch_df['pct_white'] = pct_white
batch_df['pct_chronic_absent'] = pct_chronic_absent

In [13]:
batch_df.shape

(2257, 22)

In [14]:
batch_df.tail()

Unnamed: 0,dbn,username,borough,grade_level,enrollment,comment,likes,dislikes,replies,post_date,...,pct_disabilities,pct_self_contained,economic_need_index,pct_temp_housing,pct_hra_eligible,pct_asian,pct_black,pct_hispanic,pct_white,pct_chronic_absent
2252,32K554,Anonymous,brooklyn,Middle,180,"<P>""Students gain leadership experience and co...",0,1,0,2005-06-24T20:40:59,...,0.072,0.0,0.948,0.061,0.522,0.083,0.067,0.694,0.15,0.033
2253,32K554,X.G,brooklyn,Middle,180,When are the open houses? I am an eith grader ...,0,0,1,2016-10-19T00:59:06,...,0.072,0.0,0.948,0.061,0.522,0.083,0.067,0.694,0.15,0.033
2254,32K562,Bob,brooklyn,Middle,370,"Subbed at this school Twice, admins are rude, ...",0,0,0,2019-12-04T18:02:31,...,0.23,0.057,0.877,0.246,0.843,0.008,0.219,0.754,0.019,0.138
2255,32K562,Your mad,brooklyn,Middle,370,I’m his school is very good you are just mad b...,0,0,1,2020-03-09T22:35:37,...,0.23,0.057,0.877,0.246,0.843,0.008,0.219,0.754,0.019,0.138
2256,32K562,Guest,brooklyn,Middle,370,"The school is located in Bushwick, not East NY",1,0,0,2013-07-01T20:13:44,...,0.23,0.057,0.877,0.246,0.843,0.008,0.219,0.754,0.019,0.138


---

### Convert data frame to CSV and export

In [20]:
batch_df.to_csv('925-1076_comments.csv', index=False)

In [16]:
# batch_df.loc[(batch_df['dbn'] == '01M019') & (batch_df['replies'] > 0)].iloc[1][2]

In [17]:
# batch_df.comment.iloc[0]

In [18]:
# batch_df.comment.str.len().max

In [19]:
# # Number of comments per school
# batch_df.dbn.value_counts()