### Import libraries

In [598]:
from time import time
import random
import requests
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
import pandas as pd
import numpy as np
from time import sleep
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler, Imputer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer 
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge, ElasticNet 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

from sklearn.pipeline import Pipeline

%matplotlib inline


### Get the list of TV-show IDs from 'imdb.com'

In [None]:
# Search-result page numbers whose tv-shows have the most complete info
# (some research has been done). Each page is scraped for tv-show ids below.
page_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
             29, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 49, 50, 54, 56, 57, 58, 59, 60,
             61, 62, 63, 64, 65, 66, 67, 68, 69, 74, 76, 77, 78, 79, 80, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 99,
             100, 101, 105, 107, 108, 110, 112, 114, 116, 117, 127, 128, 132, 135, 140, 156, 160, 161, 165, 166, 168,
             169, 170, 181, 191, 192, 198]

# Search listing of tv series sorted by number of votes (descending); `{}` is the page number.
search_url = ("http://www.imdb.com/search/title?title_type=tv_series&view=advanced"
              "&page={}&sort=num_votes,desc&ref_=adv_prv")

show_ids = []

for page in page_list:
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(search_url.format(page), timeout=30)
    print(page)
    print(response.status_code)
    if response.status_code != 200:
        # An error page carries no result list; skip it instead of parsing it.
        continue

    titles = Selector(text=response.text).xpath('//h3[@class="lister-item-header"]/a/@href').extract()
    # The id is the third '/'-separated piece of each href
    # (assumes hrefs look like "/title/<id>/..." -- TODO confirm).
    show_ids.extend(title.split('/')[2] for title in titles)
    print("show ids length is: {}".format(len(show_ids)))

### Get info about the tv-shows

In [None]:
# One list per scraped attribute; position i in every list describes show_ids[i].
show_rating = []
show_names = []
show_votes = []
show_episodes = []
show_genre = []
show_runtime2 = []
show_user_review = []
show_critic_review = []
show_runtime = []
show_start = []
show_cast = []

# Seconds to wait for a response; without it requests.get can block forever.
request_timeout = 30


def _fetch(url, progress=None):
    """Download `url` and return a Selector over the response body.

    Prints the HTTP status code, followed by `progress` when it is given.
    """
    response = requests.get(url, timeout=request_timeout)
    if progress is None:
        print(response.status_code)
    else:
        print("{} {}".format(response.status_code, progress))
    return Selector(text=response.text)


def _first(page, xpath, parse=None):
    """Return the first text matched by `xpath` on `page`, or None if nothing matches.

    `parse`, when given, post-processes the matched text; an IndexError raised
    by it (text not laid out as expected) is reported as None.
    """
    found = page.xpath(xpath).extract()
    if not found:
        return None
    if parse is None:
        return found[0]
    try:
        return parse(found[0])
    except IndexError:
        return None


def _stripped(text):
    """Return `text` without surrounding whitespace."""
    return text.strip()


def _first_word(text):
    """Return the first whitespace-separated word of `text`."""
    return text.split()[0]


def _second_line_first_word(text):
    """Return the first word on the second line of `text`."""
    return text.split('\n')[1].split()[0]


# iterate through the list of tv-show id's to get info
for position, title in enumerate(show_ids):
    # main title page: parsed once and reused for every field below
    main_page = _fetch("http://www.imdb.com/title/{}/".format(title), position)

    rating = _first(main_page, '//div[@class="ratingValue"]/strong/span/text()')
    name = _first(main_page, '//h1[@itemprop="name"]/text()', _stripped)
    votes = _first(main_page, '//div[@class="imdbRating"]/a/span[@class="small"]/text()')

    # number of episodes: when several sub-headings match, the second one is used
    headings = main_page.xpath('//span[@class="bp_sub_heading"]/text()').extract()
    if not headings:
        n_episodes = None
    else:
        heading = headings[1] if len(headings) > 1 else headings[0]
        n_episodes = heading.split(' ')[0]

    genre = _first(main_page, '//div[@class="subtext"]/a[1]/span[@class="itemprop"]/text()')
    # runtime as shown on the main page (first source)
    runtime2 = _first(main_page, '//time[@itemprop="duration"]/text()', _second_line_first_word)
    user_review = _first(main_page, '//div[2]/span[@class="subText"]/a/text()', _first_word)
    critic_review = _first(main_page, '//a[@href="externalreviews?ref_=tt_ov_rt"]/text()', _first_word)

    # runtime from the technical-specs page (second source)
    tech_page = _fetch("http://www.imdb.com/title/{}/technical?ref_=tt_dt_spec".format(title))
    runtime = _first(tech_page, '//tr[@class="odd"]/td[2]/text()', _second_line_first_word)

    # start date from the ratings page
    ratings_page = _fetch("http://www.imdb.com/title/{}/ratings?ref_=tt_ql_op_4".format(title))
    start = _first(ratings_page, '//div[@id="tn15title"]/h1/span/a/text()')

    # cast from the full-credits page: keep at most the first 10 names
    credits_page = _fetch("http://www.imdb.com/title/{}/fullcredits?ref_=tt_cl_sm#cast".format(title))
    cast = credits_page.xpath('//td[@class="name"]/a/text()').extract()
    top_cast = cast[:10] if cast else None

    # Append everything together so a failure part-way through a show
    # cannot leave the lists with different lengths.
    show_rating.append(rating)
    show_names.append(name)
    show_votes.append(votes)
    show_episodes.append(n_episodes)
    show_genre.append(genre)
    show_runtime2.append(runtime2)
    show_user_review.append(user_review)
    show_critic_review.append(critic_review)
    show_runtime.append(runtime)
    show_start.append(start)
    show_cast.append(top_cast)

    sleep(0.5)






In [None]:
# transform show_cast (list of lists, or None where no cast was found) into
# string_cast (list of comma-separated strings; None entries are kept as None)
string_cast = [
    None if cast is None else ', '.join(actor.strip() for actor in cast)
    for cast in show_cast
]

In [None]:
# additional source for tv-shows runtime
def get_entry(entry):
    """Look up a show's runtime on TVmaze by its IMDb id.

    Parameters
    ----------
    entry : str
        IMDb title id, appended to the TVmaze lookup URL as the `imdb` value.

    Returns
    -------
    The `runtime` field of the decoded JSON response; the string 'NA' when the
    decoded response has no usable `runtime` field; None when the HTTP status
    code is not 200.
    """
    # A timeout keeps one stalled lookup from blocking the whole scrape.
    res = requests.get('http://api.tvmaze.com/lookup/shows?imdb=' + entry, timeout=30)
    print(res.status_code)
    if res.status_code != 200:
        return None

    results = json.loads(res.text)
    try:
        return results['runtime']
    except (TypeError, KeyError):
        # TypeError: the payload is not a mapping (e.g. null or a list);
        # KeyError: it is a mapping without a 'runtime' entry.
        return 'NA'

import json

# TVmaze runtime for every scraped show, in the same order as show_ids
runtime_tvmaze = []
for position, element in enumerate(show_ids):
    runtime_tvmaze.append(get_entry(element))
    # progress indicator; enumerate avoids a linear show_ids.index() search
    # per show and stays correct if an id occurs more than once
    print(position)
    sleep(0.7)

### Create a DataFrame with all the information about tv-shows

In [None]:
from collections import OrderedDict

# Assemble one column per attribute. DataFrame.from_items is deprecated (and
# removed in pandas 1.0); an OrderedDict keeps the same column order.
# NOTE(review): the *_f lists (show_rating_f, show_votes_f, ...) are not
# defined in the cells shown above -- confirm they exist before running this.
data = pd.DataFrame.from_dict(OrderedDict([
    ('id', show_ids),
    ('name', show_names),
    ('genre', show_genre),
    ('start_date', show_start),
    ('rating', show_rating_f),
    ('votes', show_votes_f),
    ('n_episodes', show_episodes_f),
    ('runtime1', show_runtimeS_f),
    ('runtime2', show_runtime_f),
    ('user_reviews', show_user_review_f),
    ('critic_reviews', show_critic_review_f),
    ('cast', string_cast),
    ('runtime_tvmaze', runtime_tvmaze),
]))

### Save the data to file

In [3]:
import pickle

# save data; the with-block closes the file even if pickling fails
with open("data_df.p", "wb") as handle:
    pickle.dump(data, handle)

In [396]:
# load data; the with-block closes the file once it has been read
# NOTE: unpickling can execute arbitrary code -- only load files you created.
with open("data_df.p", "rb") as handle:
    data = pickle.load(handle)

In [397]:
# preview the first rows of the loaded DataFrame
data.head()

Unnamed: 0,id,name,genre,start_date,votes,n_episodes,runtime1,runtime2,runtime_tvmaze,user_reviews,critic_reviews,cast,rating
0,tt0944947,Game of Thrones,Adventure,2011,1212353,73,57min,57,60.0,919,241,Alan Taylor|David Nutter|Alex Graves|Mark Mylo...,9.5
1,tt0903747,Breaking Bad,Crime,2008,1010045,62,49min,49,60.0,956,174,Michelle MacLaren|Adam Bernstein|Vince Gilliga...,9.5
2,tt1520211,The Walking Dead,Drama,2010,705001,115,44min,44,60.0,934,258,Greg Nicotero|Ernest R. Dickerson|Michael E. S...,8.5
3,tt1475582,Sherlock,Crime,2010,595233,15,1h,1,90.0,455,109,Paul McGuigan|Nick Hurran|Coky Giedroyc|Euros ...,9.2
4,tt0898266,The Big Bang Theory,Comedy,2007,574644,280,22min,22,30.0,447,113,Mark Cendrowski|Anthony Rich|Peter Chakos|Nico...,8.3


In [398]:
data.info()  # print a summary of the DataFrame: index range, columns, non-null counts, dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5350 entries, 0 to 5349
Data columns (total 13 columns):
id                5350 non-null object
name              5350 non-null object
genre             5347 non-null object
start_date        5350 non-null object
votes             5350 non-null object
n_episodes        5350 non-null object
runtime1          4451 non-null object
runtime2          4454 non-null object
runtime_tvmaze    3972 non-null float64
user_reviews      5032 non-null object
critic_reviews    3760 non-null object
cast              5337 non-null object
rating            5350 non-null object
dtypes: float64(1), object(12)
memory usage: 543.4+ KB


### Data cleaning

In [399]:
# silence pandas' chained-assignment warning (the default setting is 'warn')
pd.set_option('mode.chained_assignment', None)

In [400]:
# impute the missing 'genre' values with information looked up on 'imdb.com'.
# The three values are assigned in row order, so exactly three rows must be
# missing a genre. A single .loc assignment writes into `data` itself rather
# than into a possibly temporary copy of the column.
data.loc[data.genre.isnull(), 'genre'] = ['Crime', 'Animation', 'Animation']

In [401]:
# show the rows that have no value in the 'cast' column
data[data['cast'].isnull()]

Unnamed: 0,id,name,genre,start_date,votes,n_episodes,runtime1,runtime2,runtime_tvmaze,user_reviews,critic_reviews,cast,rating
2700,tt2309302,Heathcliff,Animation,2012,1383,13,,,30.0,1.0,1.0,,7.5
3167,tt0816382,I Live with Models,Comedy,1995,989,0,,,,1.0,,,8.2
3206,tt2557496,Who Wants to Be a Superhero?,Drama,1989,955,0,,,,,,,7.1
3478,tt3062514,Charlie and Lola,Reality-TV,2005,745,31,25min,25.0,,,,,9.5
3796,tt2269368,Dudesons in America,Animation,2012,652,12,,,25.0,4.0,3.0,,7.4
3814,tt5764414,Grozovye vorota,Comedy,2016,648,16,1h,1.0,,1.0,,,7.5
4205,tt3563898,Gokukoku no Brynhildr,Animation,2014,505,13,,,25.0,2.0,1.0,,7.0
4526,tt0279557,Welcome to the Captain,Animation,1978,369,0,30min,30.0,,1.0,,,7.5
4693,tt1475525,Biz size asik olduk,Talk-Show,2009,327,0,,,,,,,8.0
4785,tt2752022,Twenty Good Years,Animation,2009,253,6,40min,40.0,,,,,7.1


In [402]:
# keep only the rows that have a value in the 'cast' column
data = data.dropna(subset=['cast'])

In [403]:
# fill missing values in column 'user_reviews' with 0, because no reviews had been left.
# Assign the result back: an in-place fillna on an attribute-accessed column may
# act on a temporary copy and leave `data` unchanged.
data['user_reviews'] = data['user_reviews'].fillna(0)

In [404]:
# fill missing values in column 'critic_reviews' with 0, because no reviews had been left.
# Assign the result back: an in-place fillna on an attribute-accessed column may
# act on a temporary copy and leave `data` unchanged.
data['critic_reviews'] = data['critic_reviews'].fillna(0)

In [405]:
# replace '1' values in column "runtime2" with '60', because '1' stands for 1 hour
# (60 min). Assign the result back instead of replacing in place on an
# attribute-accessed column, which may act on a temporary copy.
data['runtime2'] = data['runtime2'].replace('1', '60')

In [406]:
# convert every "start_date" value to an integer
data['start_date'] = data['start_date'].map(int)

In [407]:
# convert every "rating" value to a float
data['rating'] = data['rating'].map(float)

In [408]:
# remove the commas from the "votes" strings (presumably thousands separators)
data['votes'] = data['votes'].str.replace(',', '')

In [410]:
# convert every "votes" value to an integer
data['votes'] = data['votes'].map(int)

In [411]:
# convert every "n_episodes" value to an integer
data['n_episodes'] = data['n_episodes'].map(int)

In [412]:
# "user_reviews" holds the text 'metacritic.com' where a count was expected;
# replace it with '12' (presumably the count looked up by hand -- verify).
# A single .loc assignment writes into `data` itself rather than into a
# possibly temporary copy of the column.
data.loc[data.user_reviews == 'metacritic.com', 'user_reviews'] = '12'

In [413]:
# convert every "user_reviews" value to an integer
data['user_reviews'] = data['user_reviews'].map(int)

In [414]:
# convert every "critic_reviews" value to an integer
data['critic_reviews'] = data['critic_reviews'].map(int)

In [416]:
# replace NaN values in "runtime_tvmaze" with the values from "runtime2".
# The mask is computed once, and a single .loc assignment writes into `data`
# itself rather than into a possibly temporary copy of the column.
missing_tvmaze = data.runtime_tvmaze.isnull()
data.loc[missing_tvmaze, 'runtime_tvmaze'] = data.loc[missing_tvmaze, 'runtime2']

In [417]:
# count the values still missing in "runtime_tvmaze"
data['runtime_tvmaze'].isnull().sum()

359

In [418]:
# keep only the rows that have a value in "runtime_tvmaze"
data = data.dropna(subset=['runtime_tvmaze'])

In [419]:
# replace the '1.33' entries in "runtime_tvmaze" with the actual value '30'.
# A single .loc assignment writes into `data` itself rather than into a
# possibly temporary copy of the column.
data.loc[data.runtime_tvmaze == '1.33', 'runtime_tvmaze'] = '30'

In [420]:
# convert every "runtime_tvmaze" value to an integer
data['runtime_tvmaze'] = data['runtime_tvmaze'].map(int)

In [421]:
# remove the "runtime1" and "runtime2" columns
data = data.drop(labels=['runtime1', 'runtime2'], axis=1)

In [423]:
# check the dtype and non-null count of every column after cleaning
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4978 entries, 0 to 5349
Data columns (total 11 columns):
id                4978 non-null object
name              4978 non-null object
genre             4978 non-null object
start_date        4978 non-null int64
votes             4978 non-null int64
n_episodes        4978 non-null int64
runtime_tvmaze    4978 non-null int64
user_reviews      4978 non-null int64
critic_reviews    4978 non-null int64
cast              4978 non-null object
rating            4978 non-null float64
dtypes: float64(1), int64(6), object(4)
memory usage: 466.7+ KB


In [597]:
# save the cleaned data; the with-block closes the file even if pickling fails
with open("data_df_clean.p", "wb") as handle:
    pickle.dump(data, handle)