In [1]:
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd
import csv

In [2]:
URL = "https://www.metacritic.com"
HEADERS = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
HEADERS_MOBILE = { 'User-Agent' : 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B137 Safari/601.1'}

In [3]:
def games_attributes_init():
    
    data = {
        'game_name': [],
        'platform': [],
        'publisher': [],
        'release_date': [],
        'meta_scroe': [],
        'user_score': [],
        'develeoper': [],
        'genres': [],
        'num_of_players': [],
        'rating': [],
        'user_positive_review': [],
        'user_negative_review': [],
        'user_mixed_review': [],
        'critic_positive_review': [],
        'critic_negative_review': [],
        'critic_mixed_review': [],
    }
    
    return data

In [4]:
# ***** if ther is index_list append to csv with index_list, else: make new csv **** #
def save_data_to_csv_file(data, name, index_list=None):
    if(index_list):
        df = pd.DataFrame(data=data, index=index_list)
        df.to_csv(f'games_data/{name}.csv', mode='a', header=False, index=index_list)
    else:
        df = pd.DataFrame(data=data)
        df.to_csv(f'games_data/{name}.csv')

In [5]:
def check_status_code(result, mobile_result):
    if((result.status_code != 200) or (mobile_result.status_code != 200)):
        print(f"Status code: {result.status_code}")
        print(f"Status code mobile: {mobile_result.status_code}")
        return False
    return True
    

In [41]:
def get_games_data(href_csv_file, save_name):

    games_attributes = games_attributes_init()
    starting_index = 1

    try:
        with open(f'games_data/{save_name}.csv', 'r') as game_file:
            games_list = list(csv.reader(game_file))
            last_row = games_list[-1][0]
            if(last_row):
                starting_index = int(last_row) + 2
    except:
        save_data_to_csv_file(games_attributes, save_name)
        
    with open(href_csv_file, 'r') as csv_file:
        hrefs = list(csv.reader(csv_file))
    
    
    indexes = []
    for href in hrefs[starting_index:]:
        print(f"Processing: {href[0]}")
        
        url = URL + href[1]
        result = requests.get(url, headers=HEADERS)
        mobile_result = requests.get(url, headers=HEADERS_MOBILE)
        
        if(check_status_code(result, mobile_result) is False):
            continue
        
        indexes.append(int(href[0]))
        
        mobile_soup = BeautifulSoup(mobile_result.content, "html.parser")
        soup = BeautifulSoup(result.content, "html.parser")
        
        game_name_soup = soup.find("div", class_="product_title")
        games_attributes['game_name'].append(game_name_soup.h1.text.strip())
        
        platform_soup = soup.find("div", class_="product_title").span
        games_attributes['platform'].append(platform_soup.text.strip())
        
        release_date_soup = soup.find("li", class_="summary_detail release_data").find("span", class_="data")
        games_attributes['release_date'].append(release_date_soup.text.strip())
        
        user_score_soup = soup.find("div", class_="userscore_wrap")
        if(user_score_soup):
            games_attributes['user_score'].append(user_score_soup.find(class_="metascore_w").text.strip())
        else:
            games_attributes['user_score'].append("NaN")
        
        meta_scroe_soup = soup.find("span", itemprop="ratingValue")
        if(meta_scroe_soup):
            games_attributes['meta_scroe'].append(meta_scroe_soup.text.strip())
        else:
            games_attributes['meta_scroe'].append("NaN")
        
        publisher_soup = soup.find("li", class_="summary_detail publisher")
        if(publisher_soup):
            games_attributes['publisher'].append(publisher_soup.a.text.strip())
        else:
            games_attributes['publisher'].append("NaN")
        
        develeoper_soup = soup.find("li", class_="summary_detail developer")
        if(develeoper_soup):
            games_attributes['develeoper'].append(develeoper_soup.a.text.strip())
        else:
            games_attributes['develeoper'].append("NaN")

        num_of_players_soup = soup.find("li", class_="summary_detail product_players")
        if(num_of_players_soup):
            games_attributes['num_of_players'].append(num_of_players_soup.find(class_="data").text.strip())
        else:
            games_attributes['num_of_players'].append("NaN")
        
        rating_soup = soup.find("li", class_="summary_detail product_rating")
        if(rating_soup):
            games_attributes['rating'].append(rating_soup.find(class_= "data").text.strip())
        else:
            games_attributes['rating'].append("NaN")

        genres_soup = soup.find("li", class_="summary_detail product_genre").find_all("span")
        game_genres = []
        for genre in genres_soup[1:]:
            game_genres.append(genre.text.strip())
        geners_string = ", ".join(game_genres)
        games_attributes['genres'].append(geners_string)

        critic_user_distributations = mobile_soup.find_all("div", class_= "distributions")
        # ***** critic review Distributations **** #
        if(len(critic_user_distributations) > 0):
            critic_distributions = critic_user_distributations[0].find_all(class_="number")
            games_attributes['critic_positive_review'].append(critic_distributions[0].text.split()[0])
            games_attributes['critic_negative_review'].append(critic_distributions[2].text.split()[0])
            games_attributes['critic_mixed_review'].append(critic_distributions[1].text.split()[0])
        else:
            games_attributes['critic_positive_review'].append("NaN")
            games_attributes['critic_negative_review'].append("NaN")
            games_attributes['critic_mixed_review'].append("NaN")
        
        # ***** User review Distributations **** #
        if(len(critic_user_distributations) > 1):
            user_distributions = critic_user_distributations[1].find_all(class_="number")
            games_attributes['user_positive_review'].append(user_distributions[0].text.split()[0])
            games_attributes['user_negative_review'].append(user_distributions[2].text.split()[0])
            games_attributes['user_mixed_review'].append(user_distributions[1].text.split()[0])
        else:
            games_attributes['user_positive_review'].append("NaN")
            games_attributes['user_negative_review'].append("NaN")
            games_attributes['user_mixed_review'].append("NaN")
        
        # ***** saving changes evry 50 loops **** #
        num_of_games = int(href[0])
        if((num_of_games % 50 == 0) and (num_of_games != 0)):
            save_data_to_csv_file(games_attributes, save_name, index_list=indexes)
            print("saved")
            #  reseting games_attributes and indexes
            games_attributes = games_attributes_init()
            indexes = []


    if(len(indexes) > 0):
        save_data_to_csv_file(games_attributes, save_name, index_list=indexes)
    print("fin")


In [None]:
## saving the: Top Rated Games, by runing the function on the 'good_games.csv'

In [7]:
get_games_data('hrefs_csv/good_games.csv', 'top_rated_games')

Processing: 3501
Processing: 3502
Processing: 3503
Processing: 3504
Processing: 3505
Processing: 3506
Processing: 3507
Processing: 3508
Processing: 3509
Processing: 3510
Processing: 3511
Processing: 3512
Processing: 3513
Processing: 3514
Processing: 3515
Processing: 3516
Processing: 3517
Processing: 3518
Processing: 3519
Processing: 3520
Processing: 3521
Processing: 3522
Processing: 3523
Processing: 3524
Processing: 3525
Processing: 3526
Processing: 3527
Processing: 3528
Processing: 3529
Processing: 3530
Processing: 3531
Processing: 3532
Processing: 3533
Processing: 3534
Processing: 3535
Processing: 3536
Processing: 3537
Processing: 3538
Processing: 3539
Processing: 3540
Processing: 3541
Processing: 3542
Processing: 3543
Processing: 3544
Processing: 3545
Processing: 3546
Processing: 3547
Processing: 3548
Processing: 3549
Processing: 3550
saved
Processing: 3551
Processing: 3552
Processing: 3553
Processing: 3554
Processing: 3555
Processing: 3556
Processing: 3557
Processing: 3558
Processi

In [9]:
get_games_data('hrefs_csv/good_games.csv', 'top_rated_games')

Processing: 3751
Processing: 3752
Processing: 3753
Processing: 3754
Processing: 3755
Processing: 3756
Processing: 3757
Processing: 3758
Processing: 3759
Processing: 3760
Processing: 3761
Processing: 3762
Processing: 3763
Processing: 3764
Processing: 3765
Processing: 3766
Processing: 3767
Processing: 3768
Processing: 3769
Processing: 3770
Processing: 3771
Processing: 3772
Processing: 3773
Processing: 3774
Processing: 3775
Processing: 3776
Processing: 3777
Processing: 3778
Processing: 3779
Processing: 3780
Processing: 3781
Processing: 3782
Processing: 3783
Processing: 3784
Processing: 3785
Processing: 3786
Processing: 3787
Processing: 3788
Processing: 3789
Processing: 3790
Processing: 3791
Processing: 3792
Processing: 3793
Processing: 3794
Processing: 3795
Status code: 404
Status code mobile: 404
Processing: 3796
Processing: 3797
Processing: 3798
Processing: 3799
Processing: 3800
saved
Processing: 3801
Processing: 3802
Processing: 3803
Processing: 3804
Processing: 3805
Processing: 3806
P

In [None]:
## saving the: Lowest Rated Games, by runing the function on the 'bad_games.csv'

In [36]:
get_games_data('hrefs_csv/bad_games.csv', 'lowest_rated_games')

Processing: 1551
Processing: 1552
Processing: 1553
Processing: 1554
Processing: 1555
Processing: 1556
Processing: 1557
Processing: 1558
Processing: 1559
Processing: 1560
Processing: 1561
Processing: 1562
Processing: 1563
Processing: 1564
Processing: 1565
Processing: 1566
Processing: 1567
Processing: 1568
Processing: 1569
Processing: 1570
Processing: 1571
Processing: 1572
Processing: 1573
Processing: 1574
Processing: 1575
Processing: 1576
Processing: 1577
Processing: 1578
Processing: 1579
Processing: 1580
Processing: 1581
Processing: 1582
Processing: 1583
Processing: 1584
Processing: 1585
Processing: 1586
Processing: 1587
Processing: 1588
Processing: 1589
Processing: 1590
Processing: 1591
Processing: 1592
Processing: 1593
Processing: 1594
Processing: 1595
Processing: 1596
Processing: 1597
Processing: 1598
Processing: 1599
Processing: 1600
saved
Processing: 1601
Processing: 1602
Processing: 1603
Processing: 1604
Processing: 1605
Processing: 1606
Processing: 1607
Processing: 1608
Processi

AttributeError: 'NoneType' object has no attribute 'find'

In [42]:
get_games_data('hrefs_csv/bad_games.csv', 'lowest_rated_games')

Processing: 1851
Processing: 1852
Processing: 1853
Processing: 1854
Processing: 1855
Processing: 1856
Processing: 1857
Processing: 1858
Processing: 1859
Processing: 1860
Processing: 1861
Processing: 1862
Processing: 1863
Processing: 1864
Processing: 1865
Processing: 1866
Processing: 1867
Processing: 1868
Processing: 1869
Processing: 1870
Processing: 1871
Processing: 1872
Processing: 1873
Processing: 1874
Processing: 1875
Processing: 1876
Processing: 1877
Processing: 1878
Processing: 1879
Processing: 1880
Processing: 1881
Processing: 1882
Processing: 1883
Processing: 1884
Processing: 1885
Processing: 1886
Processing: 1887
Processing: 1888
Processing: 1889
Processing: 1890
Processing: 1891
Processing: 1892
Processing: 1893
Processing: 1894
Processing: 1895
Processing: 1896
Processing: 1897
Processing: 1898
Processing: 1899
Processing: 1900
saved
Processing: 1901
Processing: 1902
Processing: 1903
Processing: 1904
Processing: 1905
Processing: 1906
Processing: 1907
Processing: 1908
Processi

Processing: 2330
Processing: 2331
Processing: 2332
Processing: 2333
Processing: 2334
Processing: 2335
Processing: 2336
Processing: 2337
Processing: 2338
Processing: 2339
Processing: 2340
Processing: 2341
Processing: 2342
Processing: 2343
Processing: 2344
Processing: 2345
Processing: 2346
Processing: 2347
Processing: 2348
Processing: 2349
Processing: 2350
saved
Processing: 2351
Processing: 2352
Processing: 2353
Processing: 2354
Processing: 2355
Processing: 2356
Processing: 2357
Processing: 2358
Processing: 2359
Processing: 2360
Processing: 2361
Processing: 2362
Processing: 2363
Processing: 2364
Processing: 2365
Processing: 2366
Processing: 2367
Processing: 2368
Processing: 2369
Processing: 2370
Processing: 2371
Processing: 2372
Processing: 2373
Processing: 2374
Processing: 2375
Processing: 2376
Processing: 2377
Processing: 2378
Processing: 2379
Processing: 2380
Processing: 2381
Processing: 2382
Processing: 2383
Processing: 2384
Processing: 2385
Processing: 2386
Processing: 2387
Processi

Processing: 2809
Processing: 2810
Processing: 2811
Processing: 2812
Processing: 2813
Processing: 2814
Processing: 2815
Processing: 2816
Processing: 2817
Processing: 2818
Processing: 2819
Processing: 2820
Processing: 2821
Processing: 2822
Processing: 2823
Processing: 2824
Processing: 2825
Processing: 2826
Processing: 2827
Processing: 2828
Processing: 2829
Processing: 2830
Processing: 2831
Processing: 2832
Processing: 2833
Processing: 2834
Processing: 2835
Processing: 2836
Processing: 2837
Processing: 2838
Processing: 2839
Processing: 2840
Processing: 2841
Processing: 2842
Processing: 2843
Processing: 2844
Processing: 2845
Processing: 2846
Processing: 2847
Processing: 2848
Processing: 2849
Processing: 2850
saved
Processing: 2851
Processing: 2852
Processing: 2853
Processing: 2854
Processing: 2855
Processing: 2856
Processing: 2857
Processing: 2858
Processing: 2859
Processing: 2860
Processing: 2861
Processing: 2862
Processing: 2863
Processing: 2864
Processing: 2865
Processing: 2866
Processi

Processing: 3286
Processing: 3287
Processing: 3288
Processing: 3289
Processing: 3290
Processing: 3291
Processing: 3292
Processing: 3293
Processing: 3294
Processing: 3295
Processing: 3296
Processing: 3297
Processing: 3298
Processing: 3299
Processing: 3300
saved
Processing: 3301
Processing: 3302
Processing: 3303
Processing: 3304
Processing: 3305
Processing: 3306
Processing: 3307
Processing: 3308
Processing: 3309
Processing: 3310
Processing: 3311
Processing: 3312
Processing: 3313
Processing: 3314
Processing: 3315
Processing: 3316
Processing: 3317
Processing: 3318
Processing: 3319
Processing: 3320
Processing: 3321
Processing: 3322
Processing: 3323
Processing: 3324
Processing: 3325
Processing: 3326
Processing: 3327
Processing: 3328
Processing: 3329
Processing: 3330
Processing: 3331
Processing: 3332
Processing: 3333
Processing: 3334
Processing: 3335
Processing: 3336
Processing: 3337
Processing: 3338
Processing: 3339
Processing: 3340
Processing: 3341
Processing: 3342
Processing: 3343
Processi

Processing: 3765
Processing: 3766
Processing: 3767
Processing: 3768
Processing: 3769
Processing: 3770
Processing: 3771
Processing: 3772
Processing: 3773
Processing: 3774
Processing: 3775
Processing: 3776
Processing: 3777
Processing: 3778
Processing: 3779
Processing: 3780
Processing: 3781
Processing: 3782
Processing: 3783
Processing: 3784
Processing: 3785
Processing: 3786
Processing: 3787
Processing: 3788
Processing: 3789
Processing: 3790
Processing: 3791
Processing: 3792
Processing: 3793
Processing: 3794
Processing: 3795
Processing: 3796
Processing: 3797
Processing: 3798
Processing: 3799
Processing: 3800
saved
Processing: 3801
Processing: 3802
Processing: 3803
Processing: 3804
Processing: 3805
Processing: 3806
Processing: 3807
Processing: 3808
Processing: 3809
Processing: 3810
Processing: 3811
Processing: 3812
Processing: 3813
Processing: 3814
Processing: 3815
Processing: 3816
Processing: 3817
Processing: 3818
Processing: 3819
Processing: 3820
Processing: 3821
Processing: 3822
Processi

In [43]:
get_games_data('hrefs_csv/bad_games.csv', 'lowest_rated_games')

fin
