# WEB PROJECT - PART 2: WEBSCRAPING

In [1]:
import requests
from bs4 import BeautifulSoup
from lxml import html
from lxml.html import fromstring
import re,os
import pandas as pd
import numpy as np

#### Generating data storage folder

In [2]:
# Folder that receives every CSV file written by this notebook.
destfolder = './Results-Webscraping'
# exist_ok=True keeps the cell idempotent on re-runs and avoids the
# check-then-create race of calling os.path.exists() before os.mkdir().
os.makedirs(destfolder, exist_ok=True)

## RETRIEVING INFORMATION FROM FILMAFFINITY WEBPAGE
We are going to obtain the top 30 TV series <br>
This limitation comes from the webpage itself and its design <br>
Even when making multiple requests in a row, you won't obtain more than 30 TV series without any changes<br>

In [3]:
# Filmaffinity "top" listing restricted to the TV series genre (genre=TV_SE);
# the country and year filters are left empty.
url = (
    'https://www.filmaffinity.com/es/topgen.php'
    '?genre=TV_SE&country=&fromyear=&toyear=&nodoc'
)

Requesting the info and interpreting it with BeautifulSoup module from bs4

In [4]:
# Download the ranking page. The timeout keeps the cell from hanging forever on
# a stalled connection, and raise_for_status() stops the notebook on an HTTP
# error instead of silently parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
request = response.content
# Parse the raw response bytes with the lxml parser.
soup = BeautifulSoup(request, "lxml")

Loop to retrieve, for each series, the title, type, year, country, average user rating and total number of votes received

In [5]:
titlinfo = []  # one [title, type, year, country] list per series
rating = []    # one [average rating, number of votes] list per series
# <ul> elements are the ones with TV series info
for item in soup.find_all('ul'):
    # Title, type, year and country info
    content_items = item.find_all('li', {'class': 'content'})
    # Avg rating
    avg_divs = item.find_all('div', {'class': 'avg-rating'})
    # Total amount of votes
    count_divs = item.find_all('div', {'class': 'rat-count'})

    # Only <ul> blocks that carry all three pieces are treated as a series entry.
    if not (content_items and avg_divs and count_divs):
        continue

    # Look the title <div> up once; it provides both the text and the flag image.
    title_div = content_items[0].find('div', attrs={'class': 'mc-title'})
    # Expected text shape: "Title (Type) (Year)" -> ['Title ', 'Type ', 'Year']
    parts = title_div.get_text().replace(')', '').split('(')
    if len(parts) < 3:
        raise ValueError('Unexpected title format: %r' % title_div.get_text())
    # Country is read from the title attribute of the first <img> in the title <div>
    country = title_div.select('img')[0].attrs['title']
    # Negative indices handle both the plain three-part case and titles that
    # contain extra parenthesised parts before the type and the year.
    titlinfo.append([parts[0].strip(), parts[-2].strip(), parts[-1].strip(), country.strip()])

    # Decimal comma -> decimal point so the rating can be cast to float
    avg_rating = avg_divs[0].get_text().strip().replace(',', '.')
    # Remove the '.' thousands separators so the count can be cast to int
    num_rat = count_divs[0].get_text().strip().replace('.', '')
    rating.append([avg_rating, num_rat])

if not titlinfo:
    # Fail with a clear message instead of an obscure shape error further down.
    raise ValueError('No TV series entries were found in the Filmaffinity page')

filmaffinity_columns = ['Title', 'Type', 'Year', 'Country', 'Avg Rating', 'Number of votes']
filmaffinity_tv_series = (
    pd.DataFrame([info + score for info, score in zip(titlinfo, rating)],
                 columns=filmaffinity_columns)
    .drop_duplicates(subset='Title')
    .reset_index(drop=True)
    .astype({'Number of votes': 'int64', 'Avg Rating': 'float32', 'Year': 'int32'})
)
# chr(27) + "[1;31m" is the ANSI escape sequence for bold red text.
print(chr(27) + "[1;31m" + "Top %s TV Series for Filmaffinity Webpage: " % len(filmaffinity_tv_series))
display(filmaffinity_tv_series.head(10))

[1;31mTop 30 TV Series for Filmaffinity Webpage: 


Unnamed: 0,Title,Type,Year,Country,Avg Rating,Number of votes
0,The Wire,Serie de TV,2002,Estados Unidos,8.9,44478
1,Breaking Bad,Serie de TV,2008,Estados Unidos,8.8,95784
2,Juego de tronos,Serie de TV,2011,Estados Unidos,8.6,105965
3,Los Simpson,Serie de TV,1989,Estados Unidos,8.6,171157
4,Hermanos de sangre,Miniserie de TV,2001,Estados Unidos,8.5,54671
5,Los Soprano,Serie de TV,1999,Estados Unidos,8.5,54403
6,True Detective,Serie de TV,2014,Estados Unidos,8.5,56391
7,Rick y Morty,Serie de TV,2013,Estados Unidos,8.4,21131
8,Monty Python's Flying Circus,Serie de TV,1969,Reino Unido,8.4,11904
9,El decálogo,Miniserie de TV,1988,Polonia,8.4,1607


Grouping data by country and retrieving the mean of avg rating, the sum of votes per country, Min and Max Year and the count of TV series per country

In [6]:
# One row per country: mean rating, total votes, earliest and latest year,
# and the number of series from that country.
per_country = filmaffinity_tv_series.groupby('Country').agg(
    {'Avg Rating': 'mean', 'Number of votes': 'sum', 'Year': ['min', 'max'], 'Country': 'count'}
)
# agg() yields two-level column labels here; assigning flat names replaces them.
per_country.columns = ['Avg Rating', 'Total Num of Votes', 'Year Min', 'Year Max', 'Count']
summary = (
    per_country
    .round({'Avg Rating': 2})
    .sort_values('Total Num of Votes', ascending=False)
)
summary

Unnamed: 0_level_0,Avg Rating,Total Num of Votes,Year Min,Year Max,Count
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Estados Unidos,8.39,810701,1989,2015,15
Reino Unido,8.3,57960,1969,2013,4
Japón,8.17,53435,2006,2011,6
Italia,8.3,7824,2008,2008,1
Polonia,8.4,1607,1988,1988,1
Noruega,8.1,736,2015,2015,1
Suecia,8.2,231,1973,1983,2


Saving both dataframes as CSV files

In [7]:
# Write the per-series table and the per-country summary into destfolder.
for frame, filename in (
    (filmaffinity_tv_series, "/TOP_30_TVseries_Filmaffinity.csv"),
    (summary, "/TOP_30_Filmaffinty.csv"),
):
    frame.to_csv(destfolder + filename)

# IMDB

In [8]:
# IMDb list page that is scraped below for the comparison with Filmaffinity.
urlimdb = 'https://www.imdb.com/list/ls004729995/'

In [9]:
# Download the IMDb list page. The timeout keeps the cell from hanging forever
# on a stalled connection, and raise_for_status() stops the notebook on an HTTP
# error instead of silently parsing an error page.
responsei = requests.get(urlimdb, timeout=30)
responsei.raise_for_status()
requesti = responsei.content
# Parse the raw response bytes with the lxml parser.
soupi = BeautifulSoup(requesti, "lxml")

Loop to retrieve the concrete information of title, year, avg rating from users and total amount of votes received

In [10]:
# Every list entry is a <div class="lister-item-content">.
ap = soupi.find_all('div', {'class': 'lister-item-content'})
title = []   # one [title, year text] list per series
rating = []  # one [average rating, number of votes] list per series
counter = 0
for item in ap:
    links = item.select('a')
    year_tag = item.find('span', {'class': 'lister-item-year text-muted unbold'})
    rating_tag = item.find('span', {'class': 'ipl-rating-star__rating'})
    votes_tag = item.find('span', {'name': 'nv'})
    # Skip entries that lack any of the four fields rather than crashing on
    # a None result from find().
    if not links or year_tag is None or rating_tag is None or votes_tag is None:
        continue

    counter += 1
    # The first link of the entry holds the title text.
    name = links[0].get_text().strip()
    # Year text with the surrounding parentheses removed.
    year_text = year_tag.get_text().replace('(', '').replace(')', '').strip()
    avg_rating = rating_tag.get_text()
    # Remove the ',' thousands separators from the vote count.
    num_votes = votes_tag.get_text().replace(',', '')
    title.append([name, year_text])
    rating.append([avg_rating, num_votes])
    # Keep only the first 30 complete entries.
    if counter == 30:
        break

In [11]:
# Put the scraped [title, year] and [rating, votes] pairs side by side in one table.
imdb = pd.DataFrame(np.c_[title, rating], columns=['Title', 'Year', 'Avg Rating', 'Num of Votes'])
# Prefix every column except the join key 'Title' with the source site name.
imdb.columns = ['imdb.' + item if item != 'Title' else 'Title' for item in imdb.columns.tolist()]
# Strip parenthesised parts from each row's own title.
imdb['Title'] = [re.sub(r'\([^)]*\)', '', item).strip() if '(' in item and ')' in item else item for item in imdb['Title']]
# Take the first four-digit run as the year (e.g. "2008–2013" -> 2008). This
# also works when the text starts with a non-digit; a row with no year at all
# makes the integer cast fail loudly.
imdb['imdb.Year'] = imdb['imdb.Year'].str.extract(r'(\d{4})', expand=False).astype('int64')
display(imdb)

Unnamed: 0,Title,imdb.Year,imdb.Avg Rating,imdb.Num of Votes
0,Breaking Bad,2008,9.5,1128407
1,The Wire,2002,9.3,233170
2,"Te quiero, Lucy",1951,8.3,20146
3,Seinfeld,1989,8.9,212723
4,El ala oeste de la Casa Blanca,1999,8.7,53758
5,The Oprah Winfrey Show,1986,5.1,4270
6,La hora de Bill Cosby,1984,7.4,28212
7,Los Soprano,1999,9.2,236433
8,The Tonight Show Starring Johnny Carson,1962,8.4,2674
9,60 Minutes,1968,7.5,2589


To compare results I have to normalize names

In [12]:
# Work on a copy so the original Filmaffinity table stays untouched.
filmaffinity_tv_series_mod = filmaffinity_tv_series.copy()
# Strip parenthesised parts from each row's own title.
filmaffinity_tv_series_mod['Title'] = [re.sub(r'\([^)]*\)', '', item).strip() if '(' in item and ')' in item else item for item in filmaffinity_tv_series_mod['Title']]
# Keep only the columns that have an IMDb counterpart.
filmaffinity_tv_series_mod = filmaffinity_tv_series_mod[['Title', 'Year', 'Avg Rating', 'Number of votes']]
# Prefix every column except the join key 'Title' with the source site name.
filmaffinity_tv_series_mod.columns = ['filmaffinity.' + item if item != 'Title' else 'Title' for item in filmaffinity_tv_series_mod.columns.tolist()]

Comparison of the TOP 30 for both pages. Results are obtained by simply concatenating both dataframes by columns

In [13]:
# Place both rankings next to each other; rows are paired by index, not by title.
info_top_30 = pd.concat([filmaffinity_tv_series_mod, imdb], axis='columns')
display(info_top_30.head(10))

Unnamed: 0,Title,filmaffinity.Year,filmaffinity.Avg Rating,filmaffinity.Number of votes,Title.1,imdb.Year,imdb.Avg Rating,imdb.Num of Votes
0,The Wire,2002,8.9,44478,Breaking Bad,2008,9.5,1128407
1,Breaking Bad,2008,8.8,95784,The Wire,2002,9.3,233170
2,Juego de tronos,2011,8.6,105965,"Te quiero, Lucy",1951,8.3,20146
3,Los Simpson,1989,8.6,171157,Seinfeld,1989,8.9,212723
4,Hermanos de sangre,2001,8.5,54671,El ala oeste de la Casa Blanca,1999,8.7,53758
5,Los Soprano,1999,8.5,54403,The Oprah Winfrey Show,1986,5.1,4270
6,True Detective,2014,8.5,56391,La hora de Bill Cosby,1984,7.4,28212
7,Rick y Morty,2013,8.4,21131,Los Soprano,1999,9.2,236433
8,Monty Python's Flying Circus,1969,8.4,11904,The Tonight Show Starring Johnny Carson,1962,8.4,2674
9,El decálogo,1988,8.4,1607,60 Minutes,1968,7.5,2589


Merging Both data frames on Title by an outer join for the TOP 10 TV series

In [14]:
# Outer join of the first ten rows of each ranking on 'Title', keeping one row
# per title.
# NOTE(review): fillna('0') inserts the string '0' into every column with gaps,
# numeric ones included; the cast below turns three of them back into integers.
compare_webpages_top10 = (
    pd.merge(
        filmaffinity_tv_series_mod[:10],
        imdb[:10],
        on='Title',
        how='outer',
    )
    .drop_duplicates('Title')
    .fillna('0')
    .astype({
        'filmaffinity.Year': 'int64',
        'filmaffinity.Number of votes': 'int64',
        'imdb.Year': 'int64',
    })
)
compare_webpages_top10

Unnamed: 0,Title,filmaffinity.Year,filmaffinity.Avg Rating,filmaffinity.Number of votes,imdb.Year,imdb.Avg Rating,imdb.Num of Votes
0,The Wire,2002,8.9,44478,2002,9.3,233170
1,Breaking Bad,2008,8.8,95784,2008,9.5,1128407
2,Juego de tronos,2011,8.6,105965,0,0.0,0
3,Los Simpson,1989,8.6,171157,0,0.0,0
4,Hermanos de sangre,2001,8.5,54671,0,0.0,0
5,Los Soprano,1999,8.5,54403,1999,9.2,236433
6,True Detective,2014,8.5,56391,0,0.0,0
7,Rick y Morty,2013,8.4,21131,0,0.0,0
8,Monty Python's Flying Circus,1969,8.4,11904,0,0.0,0
9,El decálogo,1988,8.4,1607,0,0.0,0


Saving the 3 data frames, imdb TOP 30, TOP 30 concatenate for IMDB and FILMAFFINITY, and TOP 10 IMDB and FILMAFFINITY TV series

In [15]:
# Write the IMDb table, the side-by-side table and the top-10 join into destfolder.
for frame, filename in (
    (imdb, '/TOP_30_IMDB.csv'),
    (info_top_30, '/CONCAT_TOP_30_IMDB_FILMAFFINITY.csv'),
    (compare_webpages_top10, '/OUTER_JOIN_TOP_10_IMDB_FILMAFFINITY.csv'),
):
    frame.to_csv(destfolder + filename)

___