In [182]:
import requests 
from bs4 import BeautifulSoup
import re
import pandas as pd
from datetime import datetime
import time
from IPython.display import clear_output

# Cell purpose: collect the relative URL of every non-iOS title listed on the
# 'Electronic Arts' company pages of metacritic, then derive the platform and
# title-name arrays from those URLs.

# OPTIMIZATION : Calculating the start time
start_time = time.time()

#Dictionary for titles urls scraping
title_url_dict = {'title': []}

#Generating the title list from the 'Electronic Arts' page on metacritic
user_agent = {'User-agent': 'Mozilla/5.0'}
url = 'https://www.metacritic.com/company/electronic-arts'
response = requests.get(url, headers=user_agent)
# Fail loudly on a blocked/failed request instead of parsing an error page
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Page count is read from the "last page" pagination link (30 items per page).
# If the pagination element is not present, only a single page is scraped.
last_page_item = soup.find('li', class_='page last_page')
last_page_link = last_page_item.find('a') if last_page_item is not None else None
page_count = int(last_page_link.text) if last_page_link is not None else 1

# Listing pages are addressed with a zero-based ?page= parameter
for page in range(page_count):
    url = 'https://www.metacritic.com/company/electronic-arts?page=' + str(page)
    response = requests.get(url, headers=user_agent)
    soup = BeautifulSoup(response.text, 'html.parser')

    for title in soup.find_all('td', class_='title brief_metascore'):
        title_link = title.find('a')
        # A title cell without a link stops the scan of the current page
        if title_link is None:
            break
        # CLEANUP: Removing mobile titles
        if '/ios/' not in title_link['href']:
            title_url_dict['title'].append(title_link['href'])

    # PROGRESSLINE : % of pages processed - printed in the output bar
    # (page + 1 because the current page has just been processed)
    clear_output(wait=True)
    print("Progress: {:2.1%}".format((page + 1) / page_count), flush=True)

#converting the dictionary to a dataframe
title_url_df = pd.DataFrame(title_url_dict)

#generating the titles urls array
title_url_list = title_url_df['title'].to_numpy()

#generating the platforms and titles names arrays
# Splitting an href on "/" yields the platform at position 2 and the title
# name at position 3 (position 0 is the empty string before the leading "/").
title_url_df = title_url_df.title.str.split("/", expand=True)
platforms = title_url_df[2].to_numpy()
titles = title_url_df[3].to_numpy()

# PROGRESSLINE : Completed - printed in the output bar
clear_output()

# OPTIMIZATION : Calculating the time elapsed
print("--- %s seconds ---" % (time.time() - start_time))

--- 25.381119966506958 seconds ---


In [184]:
# Cell purpose: visit every title URL collected earlier and record one row of
# scores, review counts, genres and release date per title in ea_title_dict.

# OPTIMIZATION : Calculating the start time
start_time = time.time()

user_agent = {'User-agent': 'Mozilla/5.0'}

#Dictionary for titles info scraping (one list per output column)
ea_title_dict = {'date':[], 'title_url':[], 'title_name':[], 'platform':[],'title_genres':[], 'release_date':[],
                'metascore':[], 'critic_reviews_count':[],'pos_critic_reviews_count':[],'mix_critic_reviews_count':[],'neg_critic_reviews_count':[],
                'agv_user_score':[], 'user_ratings_count':[],'pos_user_reviews_count':[],'mix_user_reviews_count':[],'neg_user_reviews_count':[]}

# CLEANUP - Only genre names found in this list are kept for a title
title_genres_list = ['Action','Adventure','Action Adventure','Fighting','First-Person','Flight','Party','Platformer','Puzzle',
                    'Racing','Real-Time','Role-Playing','Simulation','Sports','Strategy','Third-Person','Turn-Based','Wargames','Wrestling','Miscellaneous']

# Release date formats tried in order: 'Oct 13, 2020', '2020', 'September 2020'
_release_date_formats = ('%b %d, %Y', '%Y', '%B %Y')


def _to_int(text):
    """Return the integer formed by the digits in ``text`` (0 if there are none).

    Ignores everything that is not a digit, so '1,234' and '56 Ratings' parse.
    """
    digits = re.sub(r'\D', '', text)
    return int(digits) if digits else 0


def _parse_release_date(text):
    """Return ``text`` as a datetime, or the unchanged text if no known format fits."""
    for date_format in _release_date_formats:
        try:
            return datetime.strptime(text, date_format)
        except ValueError:
            continue
    return text


def _collect_genres(soup):
    """Return the title's whitelisted genres as a comma-separated string.

    Returns '' when the page has no genre element and 'Miscellaneous' when the
    element exists but none of its genres is in title_genres_list.
    """
    genre_item = soup.find('li', class_='summary_detail product_genre')
    if genre_item is None:
        return ''
    kept_genres = []
    for genre in genre_item.find_all('span', class_='data'):
        genre_name = genre.text.strip()
        # CLEANUP - Removing duplicated genre names. Exact list membership is
        # used so that e.g. 'Action' is not mistaken for a duplicate of
        # 'Action Adventure'.
        if genre_name in title_genres_list and genre_name not in kept_genres:
            kept_genres.append(genre_name)
    return ','.join(kept_genres) if kept_genres else 'Miscellaneous'


def _distribution_count(soup, href):
    """Return the count shown in the link with this exact href (0 if absent)."""
    link = soup.find('a', href=href)
    count_span = link.find('span', class_="count") if link is not None else None
    return _to_int(count_span.text) if count_span is not None else 0


#Iterating through the list of titles to scrape the info
for i, url_base in enumerate(title_url_list):

    url = 'https://www.metacritic.com' + url_base
    response = requests.get(url, headers=user_agent)

    # Checking that the page exists
    if response.status_code == 200:

        soup = BeautifulSoup(response.text, 'html.parser')

        # CLEANUP - Removing titles with no metacritic score or Canceled
        critic_score = soup.find('a', class_='metascore_anchor', href=url_base+'/critic-reviews')
        release_item = soup.find('li', class_='summary_detail release_data')
        release_span = release_item.find('span', class_='data') if release_item is not None else None
        # None means the page shows no release date at all
        release_text = release_span.text if release_span is not None else None

        if critic_score is not None and release_text != 'Canceled':

            # The row is assembled completely before anything is stored, so
            # a parsing error can never leave the columns with unequal lengths.
            # Today's date, title url, title name, platform & metascore
            Today_Date = time.strftime("%Y%m%d")
            row = {
                'date': Today_Date,
                'title_url': url_base,
                'title_name': titles[i],
                'platform': platforms[i],
                'metascore': int(critic_score.find('span').text),
            }

            #average user score ('tbd' or a missing score is stored as 0.0)
            user_score_link = soup.find('a', class_='metascore_anchor', href=url_base+'/user-reviews')
            user_score_div = user_score_link.find('div') if user_score_link is not None else None
            if user_score_div is None or user_score_div.text.strip() == 'tbd':
                row['agv_user_score'] = float(0)
            else:
                row['agv_user_score'] = float(user_score_div.text)

            #release_date: datetime when the text fits a known format, the
            #original text otherwise, 0.0 when the page shows no date
            if release_text is None:
                row['release_date'] = float(0)
            else:
                row['release_date'] = _parse_release_date(release_text)

            #List of genres separated by comma
            row['title_genres'] = _collect_genres(soup)

            #number of critic reviews and number of user reviews
            # Only the first matching link is used so that each column
            # receives exactly one value per title.
            critic_reviews = None
            user_reviews = None
            for summary in soup.find_all('div', class_='summary'):
                if critic_reviews is None:
                    critic_link = summary.find('a', href=url_base+'/critic-reviews')
                    critic_span = critic_link.find('span') if critic_link is not None else None
                    if critic_span is not None:
                        critic_reviews = _to_int(critic_span.text)
                if user_reviews is None:
                    user_link = summary.find('a', href=url_base+'/user-reviews')
                    if user_link is not None:
                        user_reviews = _to_int(user_link.text)
            row['critic_reviews_count'] = critic_reviews if critic_reviews is not None else 0
            row['user_ratings_count'] = user_reviews if user_reviews is not None else 0

            #number of positive / neutral / negative critic reviews
            url_base_count = url_base + '/critic-reviews?dist='
            row['pos_critic_reviews_count'] = _distribution_count(soup, url_base_count+'positive')
            row['mix_critic_reviews_count'] = _distribution_count(soup, url_base_count+'neutral')
            row['neg_critic_reviews_count'] = _distribution_count(soup, url_base_count+'negative')

            #number of positive / neutral / negative user reviews
            url_base_count = url_base + '/user-reviews?dist='
            row['pos_user_reviews_count'] = _distribution_count(soup, url_base_count+'positive')
            row['mix_user_reviews_count'] = _distribution_count(soup, url_base_count+'neutral')
            row['neg_user_reviews_count'] = _distribution_count(soup, url_base_count+'negative')

            # Store the finished row: exactly one value per column
            for column, values in ea_title_dict.items():
                values.append(row[column])

    # Progress - % of titles processed - printed in the output bar
    # (i + 1 because the current title has just been handled)
    clear_output(wait=True)
    print("Progress: {:2.1%}".format((i + 1) / len(title_url_list))+" ", flush=True)

# Progress - Completed - printed in the output bar
clear_output()

# OPTIMIZATION : Calculating the time elapsed
print("--- %s seconds ---" % (time.time() - start_time))

--- 1667.2636439800262 seconds ---


In [185]:
# Build the results dataframe from the scraped dictionary, with the columns
# laid out in a fixed, explicit order.
ea_title_columns = [
    'date',
    'title_url',
    'title_name',
    'platform',
    'title_genres',
    'release_date',
    'metascore',
    'critic_reviews_count',
    'pos_critic_reviews_count',
    'mix_critic_reviews_count',
    'neg_critic_reviews_count',
    'agv_user_score',
    'user_ratings_count',
    'pos_user_reviews_count',
    'mix_user_reviews_count',
    'neg_user_reviews_count',
]
ea_title_df = pd.DataFrame(ea_title_dict, columns=ea_title_columns)

In [177]:
# Clean-up: remove, in place, every row whose release date is the text 'Canceled'.
# NOTE(review): the scraping loop already skips titles whose release date is
# 'Canceled', so this is expected to find nothing to drop - confirm it is still needed.
is_canceled = ea_title_df['release_date'] == 'Canceled'
canceled_index = ea_title_df.loc[is_canceled].index
ea_title_df.drop(index=canceled_index, inplace=True)

In [186]:
# Write the dataframe to an Excel workbook whose file name carries the
# current date and time, e.g. ea_metacritic_<YYYYMMDD-HHMMSS>.xlsx
export_dir = r'/Users/liuba/Desktop/files/'
Date_Time = time.strftime("%Y%m%d-%H%M%S")
excelfilename = 'ea_metacritic_' + Date_Time + ".xlsx"
ea_title_df.to_excel(export_dir + excelfilename, header=True, index=False)