In [1]:
## import relevant packages
import pandas as pd
import urllib.request as urllib
import requests
import numpy as np
from bs4 import BeautifulSoup
from sklearn.preprocessing import OneHotEncoder

In [2]:
## address of the IMDb list page whose entries are the shows to scrape
tv_list_url = "https://www.imdb.com/list/ls085347170/"

In [3]:
## read imdb url of list of shows; the context manager closes the HTTP
## response as soon as the markup has been parsed
## NOTE(review): no parser is passed to BeautifulSoup, so the parse may depend
## on which parser libraries are installed -- consider pinning one
with urllib.urlopen(tv_list_url) as html:
    soup = BeautifulSoup(html.read())

## select the actual list of shows in url
lister = soup.find("div", {"class": "lister"})
if lister is None:
    ## fail with a readable message instead of an AttributeError on None
    raise ValueError("no <div class='lister'> found at " + tv_list_url)
items = lister.find_all("div", {"class":"lister-item-content"})

## add show page urls to a separate list, skipping any entry that has no
## link (or a link without an href) rather than crashing on it
a_tags = []
for i in items:
    link = i.find("a")
    if link is not None and link.has_attr("href"):
        a_tags.append(link["href"])


In [4]:
## one accumulator list per scraped field; each show processed later appends
## exactly one entry to every list, so position i always refers to the same show
ratings, rating_counts, titles = [], [], []
level, length = [], []
genre_1, genre_2, genre_3 = [], [], []
start_years, episodes = [], []
top_ep, sec_ep, thd_ep = [], [], []
popularity = []
awards_1, awards_2 = [], []

## loop through each url in the show page url list; every iteration appends
## exactly one value to each of the accumulator lists declared above
for url in a_tags:
    ## read the url; the context manager closes the HTTP response after parsing
    show = 'https://www.imdb.com' + url
    with urllib.urlopen(show) as html:
        soup = BeautifulSoup(html.read())

    ## locate specific parts of the page that will be scraped
    title_bar = soup.find("div", {"class":"title_bar_wrapper"})
    subtext = soup.find("div", {"class":"subtext"})
    article = soup.find("div", {"id":"titleDetails"})
    mini_article = soup.find("div", {"id":"top-rated-episodes-rhs"})
    review_bar = soup.find("div", {"class":"titleReviewBarSubItem"})
    awards_bar = soup.find("div", {"id":"titleAwardsRanks"})

    ## scrape title bar for ratings (each lookup is done once and reused)
    rating_tag = title_bar.find("span",{"itemprop":"ratingValue"})
    if rating_tag is not None:
        ratings.append(float(rating_tag.text))
    else:
        ratings.append("N/A")

    ## scrape title bar for rating counts (thousands separators removed)
    count_tag = title_bar.find("span",{"itemprop":"ratingCount"})
    if count_tag is not None:
        rating_counts.append(int(count_tag.text.replace(",","")))
    else:
        rating_counts.append("N/A")

    ## scrape show title from page
    titles.append(title_bar.h1.text.strip())

    ## scrape title bar for TV rating: the second line of the subtext block,
    ## or "N/A" when the block, the line, or its content is missing
    bar_subtext = title_bar.find("div",{"class":"subtext"})
    level_text = ""
    if bar_subtext is not None:
        subtext_lines = bar_subtext.text.split("\n")
        if len(subtext_lines) > 1:
            level_text = subtext_lines[1].strip()
    level.append(level_text if level_text != "" else "N/A")

    ## all links in the subtext block: every link but the last is a genre,
    ## the last one carries the start year
    subtext_links = subtext.find_all("a")

    ## declare new genre list (reset for every show)
    genre_list = [link.text for link in subtext_links[:-1]]

    ## fill genre #1..#3 in order, padding with "N/A" when the show lists fewer
    for slot, genre_column in enumerate((genre_1, genre_2, genre_3)):
        genre_column.append(genre_list[slot] if len(genre_list) > slot else "N/A")

    ## add the start year of show from page (cleaned): the text between "("
    ## and the en dash / closing parenthesis of the last subtext link
    raw_start = subtext_links[-1]
    start_years.append(int(raw_start.text.split("(")[1].split("–")[0].replace(")\n","")))

    ## scrape number of episodes from episodes heading
    section = soup.find("a",{"class":"bp_item"})
    if section is not None:
        episodes.append(int(section.find("span",{"class":"bp_sub_heading"}).text.split()[0]))
    else:
        episodes.append("N/A")

    ## scrape length of episode from bottom info section, falling back to the
    ## title bar; a page without the details section no longer crashes here
    detail_time = article.find("time") if article is not None else None
    if detail_time is not None:
        length.append(int(detail_time.text.split()[0]))
    else:
        header_time = title_bar.find("time")
        if header_time is not None:
            length.append(int(header_time.text.split("\n")[1].strip()[0:2]))
        else:
            length.append("N/A")

    ## scrape top three episode ratings from right mini article bar
    ## the three ratings are read at positions 0, 23 and 46 of the matching spans
    ## NOTE(review): presumably each episode contributes 23 matching spans --
    ## confirm against the page markup
    if mini_article is not None:
        top_eps = mini_article.find_all("span",{"class":"ipl-rating-star__rating"})
    else:
        top_eps = []
    for slot, episode_column in enumerate((top_ep, sec_ep, thd_ep)):
        position = slot * 23
        ## a missing entry becomes None instead of raising IndexError
        episode_column.append(top_eps[position].text if position < len(top_eps) else None)

    ## scrape popularity from review bar
    if review_bar is not None:
        popularity.append(int(review_bar.find("span", {"class": "subText"}).text.split()[0].replace(",","")))
    else:
        ## fallback value used when the page has no review bar
        popularity.append(4000)

    ## scrape awards from awards bar, collapsing runs of whitespace; a single
    ## blurb is stored in awards_2 and awards_1 is left as "N/A"
    blurbs = awards_bar.find_all("span",{"class":"awards-blurb"}) if awards_bar is not None else []
    blurb_texts = [" ".join(blurb.text.split()) for blurb in blurbs]
    if len(blurb_texts) > 1:
        awards_1.append(blurb_texts[0])
        awards_2.append(blurb_texts[1])
    elif len(blurb_texts) > 0:
        awards_1.append("N/A")
        awards_2.append(blurb_texts[0])
    else:
        awards_1.append("N/A")
        awards_2.append("N/A")

In [5]:
## build the shows dataframe in one step: each key is a column name and each
## value is the list of scraped data for that column (column order = key order)
shows = pd.DataFrame({
    "title": titles,
    "rating": ratings,
    "rating_count": rating_counts,
    "length": length,
    "level": level,
    "genre_1": genre_1,
    "genre_2": genre_2,
    "genre_3": genre_3,
    "start_year": start_years,
    "episodes": episodes,
    "top_ep": top_ep,
    "sec_ep": sec_ep,
    "thd_ep": thd_ep,
    "popularity": popularity,
    "awards_1": awards_1,
    "awards_2": awards_2,
})

In [6]:
## make new dataframe with columns to be one hot encoded (categorical variables)
## the columns are picked by name, in the same order the positional slice 4:8
## produced, so reordering or adding columns to `shows` cannot silently change
## what gets encoded
features = shows[["level", "genre_1", "genre_2", "genre_3"]]

In [7]:
## one hot encode the categorical data and put into a dataframe with column names
try:
    ## newer scikit-learn releases name the dense-output switch `sparse_output`
    encoder = OneHotEncoder(sparse_output = False)
except TypeError:
    ## older releases only accept the original `sparse` keyword
    encoder = OneHotEncoder(sparse = False)

## fit on the bare values (no column labels) so the generated feature names
## keep the positional x0_/x1_/x2_/x3_ prefixes on every scikit-learn version
encoded_df = encoder.fit_transform(features.values)
if hasattr(encoder, "get_feature_names_out"):
    encoded_columns = encoder.get_feature_names_out()
else:
    encoded_columns = encoder.get_feature_names()

## carry over the index of `features` so the encoded rows stay aligned with
## the rows they were derived from when joined back
features_df = pd.DataFrame(encoded_df, columns = encoded_columns, index = features.index)
features_df = features_df.astype(int)

In [8]:
## attach the one hot encoded columns to the scraped data, matching rows on
## the index and keeping every row of shows (a left join, the default)
shows = shows.join(features_df, how="left")

In [9]:
## keep only the shows that have a top episode rating; rows whose top_ep is
## missing are removed
shows = shows.dropna(axis="index", how="any", subset=["top_ep"])

In [10]:
## manually bin the episode length data into five bins
## the length column can hold the placeholder string "N/A"; coercing it to a
## number (NaN for non-numeric entries) keeps the comparisons from raising,
## and a show with no usable length gets 0 in every bin
episode_minutes = pd.to_numeric(shows['length'], errors='coerce')
shows['quick'] = np.where(episode_minutes < 20, 1, 0)
shows['short'] = np.where((episode_minutes >= 20) & (episode_minutes < 35), 1, 0)
shows['medium'] = np.where((episode_minutes >= 35) & (episode_minutes < 45), 1, 0)
shows['long'] = np.where((episode_minutes >= 45) & (episode_minutes < 65), 1, 0)
shows['series'] = np.where(episode_minutes >= 65, 1, 0)

## group the one hot encoded genres into their true genre
## a genre can appear in any of the three genre slots (x1_, x2_, x3_); the flag
## is 1 when any slot column that exists for that genre is set, and a genre
## with no slot column at all gets a column of zeros instead of a KeyError
genre_names = ['Action', 'Adventure', 'Animation', 'Biography', 'Comedy',
               'Crime', 'Drama', 'Family', 'Fantasy', 'Horror', 'Mystery',
               'Romance', 'Short', 'Sport', 'Music', 'Sci-Fi', 'Thriller']
for genre_name in genre_names:
    slot_columns = ['x{}_{}'.format(slot, genre_name) for slot in (1, 2, 3)]
    present_columns = [column for column in slot_columns if column in shows.columns]
    if present_columns:
        shows[genre_name] = shows[present_columns].any(axis=1).astype(int)
    else:
        shows[genre_name] = 0

In [11]:
## load the user-rated scores for the shows from the local targets.csv file
target = pd.read_csv(filepath_or_buffer='targets.csv', encoding='utf-8')

In [12]:
## merge shows dataframe with dataframe of user inputted ratings
## Series.astype returns a new Series rather than converting in place, so the
## string-cast titles are written back (via assign, leaving `target` itself
## untouched) before they are used as the join key
shows = shows.assign(title=shows['title'].astype(str)).merge(
    target.assign(title=target['title'].astype(str)),
    on='title',
)

In [13]:
## write the final merged data to shows.csv (UTF-8 with a byte order mark);
## the dataframe index is written as the first column
shows.to_csv(path_or_buf='shows.csv', encoding='utf-8-sig')