### Requirements
- An IMDb list of the shows you wish to scrape and build a model from
- A CSV of those shows with your personal ratings for them (use the export function on IMDb to get the exact titles)

In [32]:
## import relevant packages
import pandas as pd
import urllib.request as urllib
import requests
import numpy as np
from bs4 import BeautifulSoup
from sklearn.preprocessing import OneHotEncoder

In [33]:
## IMDb list page whose member shows will be scraped
tv_list_url = "https://www.imdb.com/list/ls088122474/"

In [34]:
## read imdb url of list of shows; the context manager closes the HTTP
## response as soon as the page body has been read
with urllib.urlopen(tv_list_url) as html:
    ## name the parser explicitly so the parse tree does not depend on which
    ## optional parsers happen to be installed on this machine
    soup = BeautifulSoup(html.read(), "html.parser")

## select the actual list of shows in url
lister = soup.find("div", {"class": "lister"})
if lister is None:
    ## fail with a readable message instead of an AttributeError on None
    raise ValueError("no 'lister' div found at " + tv_list_url)
items = lister.find_all("div", {"class": "lister-item-content"})

## add show page urls to a separate list: the href of the first link found
## inside each list item
a_tags = [item.find("a")['href'] for item in items]


In [35]:
## start every column of scraped data as its own empty list
## (each name is bound to a distinct list object)
ratings, rating_counts, titles = [], [], []
level, length = [], []
genre_1, genre_2, genre_3 = [], [], []
start_years, episodes = [], []
top_ep, sec_ep, thd_ep = [], [], []
popularity = []
awards_1, awards_2 = [], []

## Scrape one IMDb show page per url in a_tags and append exactly one entry
## per show to every accumulator list (ratings, titles, genres, ...).

## the ratings of the first, second and third top-rated episode are read at
## indices 0, 23 and 46 of the matching spans, i.e. one episode every 23 spans
## NOTE(review): presumably the extra spans belong to each episode's rating
## widget -- confirm against the page markup
EPISODE_RATING_STRIDE = 23

## value recorded when a show page has no review bar to read popularity from
DEFAULT_POPULARITY = 4000

## loop through each url in the show page url list
for url in a_tags:
    ## read the url; the context manager closes the HTTP response as soon as
    ## the page body has been read
    show = 'https://www.imdb.com' + url
    with urllib.urlopen(show) as html:
        ## name the parser explicitly so results do not depend on which
        ## optional parsers happen to be installed
        soup = BeautifulSoup(html.read(), "html.parser")

    ## locate specific parts of the page that will be scraped
    title_bar = soup.find("div", {"class": "title_bar_wrapper"})
    subtext = soup.find("div", {"class": "subtext"})
    article = soup.find("div", {"id": "titleDetails"})
    mini_article = soup.find("div", {"id": "top-rated-episodes-rhs"})
    review_bar = soup.find("div", {"class": "titleReviewBarSubItem"})
    awards_bar = soup.find("div", {"id": "titleAwardsRanks"})

    ## everything below reads from these two sections, so stop with a clear
    ## message (rather than an AttributeError on None) if either is missing
    if title_bar is None or subtext is None:
        raise ValueError("unexpected page layout (no title bar / subtext) for " + show)

    ## scrape title bar for ratings
    rating_value = title_bar.find("span", {"itemprop": "ratingValue"})
    if rating_value is not None:
        ratings.append(float(rating_value.text))
    else:
        ratings.append("N/A")

    ## scrape title bar for rating counts (thousands separators removed)
    rating_count = title_bar.find("span", {"itemprop": "ratingCount"})
    if rating_count is not None:
        rating_counts.append(int(rating_count.text.replace(",", "")))
    else:
        rating_counts.append("N/A")

    ## scrape show title from page
    titles.append(title_bar.h1.text.strip())

    ## scrape title bar for TV rating (second line of the subtext block)
    title_subtext = title_bar.find("div", {"class": "subtext"})
    if title_subtext is not None:
        tv_rating = title_subtext.text.split("\n")[1].strip()
        level.append(tv_rating if tv_rating != "" else "N/A")
    else:
        level.append("N/A")

    ## every subtext link except the last is a genre; the last link carries
    ## the start year (genre list is rebuilt for every show)
    subtext_links = subtext.find_all("a")
    genre_list = [link.text for link in subtext_links[:-1]]

    ## store up to three genres, padding the unused slots with "N/A"
    for slot, genre_column in enumerate((genre_1, genre_2, genre_3)):
        genre_column.append(genre_list[slot] if len(genre_list) > slot else "N/A")

    ## add the start year of show from page (cleaned): text between "(" and
    ## the en dash, with a trailing ")\n" removed
    raw_start = subtext_links[-1]
    start_years.append(int(raw_start.text.split("(")[1].split("–")[0].replace(")\n","")))

    ## scrape number of episodes from episodes heading
    section = soup.find("a", {"class": "bp_item"})
    if section is not None:
        episodes.append(int(section.find("span", {"class": "bp_sub_heading"}).text.split()[0]))
    else:
        episodes.append("N/A")

    ## scrape length of episode from bottom info section, falling back to the
    ## title bar; a page without the info section is treated as "no time tag"
    length_tag = article.find("time") if article is not None else None
    if length_tag is not None:
        length.append(int(length_tag.text.split()[0]))
    else:
        length_tag = title_bar.find("time")
        if length_tag is not None:
            length.append(int(length_tag.text.split("\n")[1].strip()[0:2]))
        else:
            length.append(None)

    ## scrape top three episode ratings from right mini article bar; any
    ## episode slot the page does not provide is recorded as None so all three
    ## lists always grow by one entry
    if mini_article is not None:
        top_eps = mini_article.find_all("span", {"class": "ipl-rating-star__rating"})
    else:
        top_eps = []
    for rank, episode_column in enumerate((top_ep, sec_ep, thd_ep)):
        position = rank * EPISODE_RATING_STRIDE
        episode_column.append(top_eps[position].text if position < len(top_eps) else None)

    ## scrape popularity from review bar
    if review_bar is not None:
        popularity.append(int(review_bar.find("span", {"class": "subText"}).text.split()[0].replace(",","")))
    else:
        popularity.append(DEFAULT_POPULARITY)

    ## scrape awards from awards bar; a lone blurb is stored in awards_2 and
    ## awards_1 is then "N/A"
    if awards_bar is not None:
        blurbs = awards_bar.find_all("span", {"class": "awards-blurb"})
    else:
        blurbs = []
    if len(blurbs) > 1:
        awards_1.append(" ".join(blurbs[0].text.split()))
        awards_2.append(" ".join(blurbs[1].text.split()))
    elif len(blurbs) > 0:
        awards_1.append("N/A")
        awards_2.append(" ".join(blurbs[0].text.split()))
    else:
        awards_1.append("N/A")
        awards_2.append("N/A")

In [36]:
## build the shows dataframe: every scraped list becomes one column, added in
## the order given here
scraped_columns = (
    ("title", titles),
    ("rating", ratings),
    ("rating_count", rating_counts),
    ("length", length),
    ("level", level),
    ("genre_1", genre_1),
    ("genre_2", genre_2),
    ("genre_3", genre_3),
    ("start_year", start_years),
    ("episodes", episodes),
    ("top_ep", top_ep),
    ("sec_ep", sec_ep),
    ("thd_ep", thd_ep),
    ("popularity", popularity),
    ("awards_1", awards_1),
    ("awards_2", awards_2),
)
shows = pd.DataFrame()
for column_name, column_values in scraped_columns:
    shows[column_name] = column_values

In [37]:
## make new dataframe with columns to be one hot encoded (categorical variables)
## the columns are picked by name rather than by position so the selection
## stays correct if columns are ever added or reordered
features = shows[["level", "genre_1", "genre_2", "genre_3"]]

In [38]:
## one hot encode the categorical data and put into a dataframe with column names
## the encoder is left on its default sparse output and densified with
## .toarray(): the `sparse` keyword was renamed in later scikit-learn releases,
## so passing it ties the cell to old versions
encoder = OneHotEncoder()
## fit on the raw values (not the DataFrame) so the generated column names keep
## the positional "x0_", "x1_", ... prefixes on every scikit-learn version
encoded_df = encoder.fit_transform(features.values).toarray()
## `get_feature_names` was superseded by `get_feature_names_out`; use whichever
## one the installed version provides
feature_namer = getattr(encoder, "get_feature_names_out", None) or encoder.get_feature_names
features_df = pd.DataFrame(encoded_df, columns = feature_namer())
features_df = features_df.astype(int)

In [39]:
## attach the one hot encoded columns to the original dataframe, matching rows
## on the index and keeping every row of shows
shows = shows.join(other=features_df, how='left')

In [40]:
## drop shows that are missing either top episode data or episode length
required_columns = ['top_ep', 'length']
shows = shows.dropna(subset=required_columns)

In [41]:
## this cell exists to aggregate the variants in genres created by one hot encoding
## into their parent genres. For instance, "x1_Action" and "x2_Action"
## are being combined into one Action variable.

## create list of genres
genre_list = ['Action','Adventure','Animation','Biography','Comedy','Crime','Drama',
              'Family','Fantasy','History','Horror','Mystery','Romance','Short','Sport','Music',
              'Sci-Fi','Thriller']

## create lists for each genre that will become columns in dataframe
Action = []
Adventure = []
Animation = []
Biography = []
Comedy = []
Crime = []
Drama = []
Family = []
Fantasy = []
History = []
Horror = []
Mystery = []
Romance = []
Short = []
Sport = []
Music = []
Sci_Fi = []
Thriller = []

## map every genre label to the list that collects its 0/1 flags
## ('Sci-Fi' is not a valid variable name, hence the Sci_Fi list)
genre_flag_lists = {
    'Action': Action, 'Adventure': Adventure, 'Animation': Animation,
    'Biography': Biography, 'Comedy': Comedy, 'Crime': Crime, 'Drama': Drama,
    'Family': Family, 'Fantasy': Fantasy, 'History': History, 'Horror': Horror,
    'Mystery': Mystery, 'Romance': Romance, 'Short': Short, 'Sport': Sport,
    'Music': Music, 'Sci-Fi': Sci_Fi, 'Thriller': Thriller,
}

## for each genre, find the one hot encoded columns that belong to it and give
## every show a 1 if any of those columns is 1, otherwise a 0.
for genre in genre_list:
    ## match on the "_<genre>" suffix of the encoded column name; a plain
    ## substring test would also count "x1_Musical" as "Music"
    suffix = "_" + genre
    column_indicies = [i for i, column in enumerate(shows.columns)
                       if str(column).endswith(suffix)]
    if column_indicies:
        ## positional (.iloc) selection; one vectorised check per genre
        ## instead of a Python loop over every row and column
        genre_flags = shows.iloc[:, column_indicies].eq(1).any(axis=1).astype(int).tolist()
    else:
        ## no show in the data has this genre
        genre_flags = [0] * len(shows)
    genre_flag_lists[genre].extend(genre_flags)

In [42]:
## manually bin the episode length data into five 0/1 indicator columns
episode_length = shows['length']
length_bin_masks = (
    ('quick', episode_length < 20),
    ('short', (episode_length > 19.9) & (episode_length < 35)),
    ('medium', (episode_length > 34.9) & (episode_length < 45)),
    ('long', (episode_length > 44.9) & (episode_length < 65)),
    ('series', episode_length > 64.9),
)
for bin_name, in_bin in length_bin_masks:
    shows[bin_name] = np.where(in_bin, 1, 0)

## add one column per true genre from the consolidated genre lists
genre_flag_columns = (
    ('Action', Action),
    ('Adventure', Adventure),
    ('Animation', Animation),
    ('Biography', Biography),
    ('Comedy', Comedy),
    ('Crime', Crime),
    ('Drama', Drama),
    ('Family', Family),
    ('Fantasy', Fantasy),
    ('History', History),
    ('Horror', Horror),
    ('Mystery', Mystery),
    ('Romance', Romance),
    ('Short', Short),
    ('Sport', Sport),
    ('Music', Music),
    ('Sci-Fi', Sci_Fi),
    ('Thriller', Thriller),
)
for genre_name, genre_flags in genre_flag_columns:
    shows[genre_name] = genre_flags

In [43]:
## SKIP STEP IF YOU'RE NOT INTERESTED IN MODEL MAKING
## load the csv holding the user-rated scores for each of the shows
target = pd.read_csv(filepath_or_buffer='curtis.csv', encoding='utf-8')

In [44]:
## SKIP STEP IF YOU'RE NOT INTERESTED IN MODEL MAKING
## merge shows dataframe with dataframe of user inputted ratings
## astype returns a new Series instead of converting in place, so the result
## is assigned back; both merge keys are strings before the merge
shows['title'] = shows['title'].astype(str)
target['title'] = target['title'].astype(str)
shows = shows.merge(target, on='title')

In [30]:
## write the final dataframe out as a csv file
shows.to_csv(path_or_buf='test.csv', encoding='utf-8-sig')

In [45]:
## correlate the numeric/boolean columns only: the text columns (title, level,
## genres, awards, ...) cannot be correlated, and newer pandas versions raise
## on them instead of silently leaving them out
shows.select_dtypes(include=['number', 'bool']).corr()

Unnamed: 0,rating,rating_count,length,start_year,episodes,popularity,x0_TV-14,x0_TV-MA,x0_TV-PG,x1_Action,...,History,Horror,Mystery,Romance,Short,Sport,Music,Sci-Fi,Thriller,score
rating,1.0,0.763352,-0.098745,-0.604506,0.088964,-0.513647,-0.216833,0.325898,-0.273517,-0.038988,...,0.0098,-0.057819,-0.393076,,,0.131342,,-0.121841,0.408317,0.562493
rating_count,0.763352,1.0,-0.110168,-0.514823,0.102143,-0.446598,-0.044102,0.09718,-0.129569,0.071075,...,-0.185978,0.070518,-0.199776,,,-0.029387,,-0.179604,0.306604,0.404255
length,-0.098745,-0.110168,1.0,0.4564,-0.364342,0.033471,-0.233771,0.249792,-0.051824,-0.156789,...,-0.075015,0.510664,0.202093,,,0.586041,,-0.154471,-0.106826,0.183285
start_year,-0.604506,-0.514823,0.4564,1.0,-0.645602,0.305265,-0.264584,0.177647,0.192795,0.193999,...,-0.008945,0.27907,0.119407,,,0.306495,,0.026885,-0.269408,-0.355908
episodes,0.088964,0.102143,-0.364342,-0.645602,1.0,-0.357622,0.452908,-0.399257,-0.102275,-0.166394,...,-0.151597,-0.208465,-0.001703,,,-0.215226,,-0.138714,-0.072791,0.164406
popularity,-0.513647,-0.446598,0.033471,0.305265,-0.357622,1.0,-0.24256,0.166293,0.168531,0.144862,...,0.184082,0.007254,-0.019245,,,-0.145488,,0.287872,-0.268002,-0.571281
x0_TV-14,-0.216833,-0.044102,-0.233771,-0.264584,0.452908,-0.24256,1.0,-0.91084,-0.1557,-0.225969,...,-0.225374,0.098601,0.508923,,,-0.1557,,-0.163583,-0.282843,-0.047856
x0_TV-MA,0.325898,0.09718,0.249792,0.177647,-0.399257,0.166293,-0.91084,1.0,-0.265908,0.273449,...,0.247436,-0.068732,-0.437733,,,0.170941,,0.206593,0.31053,0.043589
x0_TV-PG,-0.273517,-0.129569,-0.051824,0.192795,-0.102275,0.168531,-0.1557,-0.265908,1.0,-0.12666,...,-0.065795,-0.065795,-0.141019,,,-0.045455,,-0.112367,-0.082572,0.007451
x1_Action,-0.038988,0.071075,-0.156789,0.193999,-0.166394,0.144862,-0.225969,0.273449,-0.12666,1.0,...,0.168061,-0.18334,-0.177764,,,-0.12666,,0.166993,-0.230089,-0.361271
