In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from tqdm import tqdm_notebook
import time

Only want seller info but getting it through game iterations. For every game find seller and get their rating history. If seller already in database skip game. Keep going untill all games on market have seller information collected.

In [2]:
def get_soup(url):
    '''
    gets a soup from the www.g2a.com website
    '''
    header={'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
           'referer':'https://www.g2a.com/en/category/games-c189?sort=newest-first&drm%5B5%5D=1',
           'authority':'www.g2a.com',
           'method':'GET',
           'path':'/en/',
           'scheme':'https',
           'accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
           'accept-encoding':'gzip, deflate, br',
           'accept-language':'en-GB,en-US;q=0.9,en;q=0.8'}
    r = requests.get(url, headers=header)
    soup = BeautifulSoup(r.text)
    return soup

def find_games(soup):
    '''
    finds all games on page
    returns list
    '''
    games=soup.find('ul', class_='products-grid').findAll('li', class_='products-grid__item')
    return games

def get_game_url(game):
    '''
    from the game_card soup
    gets the url of the game
    returns full url
    '''
    url_start='https://www.g2a.com'
    url_end=game.find('h3', class_='Card__title').find('a')['href']
    return url_start+url_end

def get_seller_history(game_url, length, sellers):
    '''
    with game_url opens remote chrome
    bypasses cookie prompt
    clicks on seller history
    collects seller history soup for bs4 data collection
    returns soup
    '''#initilise remote chrome
    options = Options()
    options.add_argument('user=agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15')
    driver = webdriver.Chrome(options = options)
    driver.implicitly_wait(10)
    driver.get(game_url)

    #ignore cookies prompt
    maybe_later1='/html/body/div[31]/div/div/div/div/div[3]/button[2]'
    maybe_later2='/html/body/div[30]/div/div/div/div/div[3]/button[2]'
    maybe_later3='/html/body/div[34]/div/div/div/div/div[3]/button[2]'
    maybe_later4='/html/body/div[32]/div/div/div/div/div[3]/button[2]'
    try:
        maybe_later=driver.find_element_by_xpath(maybe_later1)
    except:
        try:
            maybe_later=driver.find_element_by_xpath(maybe_later2)
        except:
            try:
                maybe_later=driver.find_element_by_xpath(maybe_later3)
            except:
                maybe_later=driver.find_element_by_xpath(maybe_later4)
    maybe_later.click()
    
    #check if seller is already in database
    time.sleep(1)
    game_soup=BeautifulSoup(driver.page_source)
    seller = game_soup.find('span', class_='seller-info__user').text
    if seller in sellers:
        skip = True
        return np.nan, skip
    else:
        skip = False

    #get seller history page soup
    entry_path='//*[@id="app"]/div/div[2]/div/article/header/div/div[3]/div[1]/div[1]/button/span[1]'
    seller_button=driver.find_element_by_xpath(entry_path)
    seller_button.click()
    
    
    load_more(driver, times=int(length/10))
    seller_r=driver.page_source
    driver.quit()
    return BeautifulSoup(seller_r), skip

def load_more(driver, times):
    button_location1='/html/body/div[5]/div/div/div/div/div[3]/div/div/div/div/button'
    button_location2='/html/body/div[6]/div/div/div/div/div[3]/div/div/div/div/button'
    for t in range(times):
        try:
            button=driver.find_element_by_xpath(button_location1)
        except:
            try:
                button=driver.find_element_by_xpath(button_location2)
            except:
                #button not there because on last page
                pass
        
        try:
            button.click()
        except:
            try:
                time.sleep(1)
                button.click()
            except:
                time.sleep(1)
                button.click()
            
        
        
def data_collection_from_game(seller_soup):
    '''
    once the html is generated it can then be scraped
    this collection iterates over comments collecting
    the rating, date, desc and comment per buyer aswell as
    seller name and overall rating per seller. creating a single
    dictionary with keys of equal length.
    '''
    seller=seller_soup.find('div', class_='user-name').find('strong').text
    ovrall_rating=seller_soup.find('div', class_='rating-data').text
    
    comments=seller_soup.find('div', class_='comments').findAll('li')
    
    game_data={
    'seller':[],
    'ovrall_rating':[],
    'rating':[],
    'date':[],
    'desc':[],
    'comment':[]
    }
    
    for comment in comments:
        game_data['seller'].append(seller)
        game_data['ovrall_rating'].append(ovrall_rating)
        
        try:
            game_data['rating'].append(comment.find('use')['xlink:href'])
        except:
            game_data['rating'].append(np.nan)
        try:
            game_data['date'].append(comment.find('span', class_='date').text)
        except:
            game_data['date'].append(np.nan)
        try:
            game_data['desc'].append(comment.find('p', class_='comments__item-type').text)
        except:
            game_data['desc'].append(np.nan)
        try:
            game_data['comment'].append(comment.find('div', class_='comments__item-content').text)
        except:
            game_data['comment'].append(np.nan)
    
    return game_data

def data_collection_from_site(games, length):
    '''
    iterates over games. If seller already in dictionary
    then the game is missed as it will have less up to date
    comments than the previous one. 
    '''
    
    site_data={
    'seller':[],
    'ovrall_rating':[],
    'rating':[],
    'date':[],
    'desc':[],
    'comment':[]
    }
    
    for game in tqdm_notebook(games):
        try:
            game_url=get_game_url(game)
        except:
            done=False
            while done==False:
                try:
                    time.sleep(5)
                    game_url=get_game_url(game)
                    done=True
                except:
                    continue
                    
                    
        try:
            seller_soup, skip =get_seller_history(game_url, length=length, sellers=site_data['seller'])
        except:
            done=False
            while done==False:
                try:
                    time.sleep(5)
                    seller_soup, skip =get_seller_history(game_url, length=length, sellers=site_data['seller'])
                    done=True
                except:
                    continue
            
        
        if skip == False:
            try:
                game_data=data_collection_from_game(seller_soup)
                for key in site_data.keys():
                    site_data[key].extend(game_data[key])
            except:
                done=False
                while done==False:
                    try:
                        time.sleep(5)
                        game_data=data_collection_from_game(seller_soup)
                        for key in site_data.keys():
                            site_data[key].extend(game_data[key])
                        done=True
                    except:
                        continue
            
        else:
            continue
    
    return site_data

def g2a_spider(revs_per_seller, pages):
    site_data={
        'seller':[],
        'ovrall_rating':[],
        'rating':[],
        'date':[],
        'desc':[],
        'comment':[]
        }
    for page in range(pages):
        url=f'https://www.g2a.com/category/games-c189?drm%5B5%5D={page+1}'
        try:
            site_soup=get_soup(url)
        except:
            done=False
            while done==False:
                try:
                    time.sleep(5)
                    site_soup=get_soup(url)
                    done=True
                except:
                    continue
                    
        try:
            games=find_games(site_soup)
        except:
            done=False
            while done==False:
                try:
                    time.sleep(5)
                    games=find_games(site_soup)
                    done=True
                except:
                    continue
                    
        try:
            page_data=data_collection_from_site(games, length=revs_per_seller)
            for key in site_data.keys():
                site_data[key].extend(page_data[key])
        except:
            done=False
            while done==False:
                try:
                    time.sleep(5)
                    game_data=data_collection_from_site(games, length=revs_per_seller)
                    for key in site_data.keys():
                        site_data[key].extend(game_data[key])
                    done=True
                except:
                    continue
                    
        data=pd.DataFrame(site_data)
        data.to_csv('save_for_crash.csv')
            
        
    return site_data

In [None]:
data=g2a_spider(url=revs_per_seller=10000, pages=100)

HBox(children=(IntProgress(value=0, max=18), HTML(value='')))

In [None]:
data=pd.DataFrame(data)

In [None]:
data['seller'].value_counts()

In [2]:
saved = pd.read_csv('save_for_crash.csv')

In [4]:
saved['seller'].value_counts()

World_of_games    30
Gamingimperium    30
Kgamestrade       30
Bellakey          30
Andariel          30
Gamescode         30
Atonce            30
Name: seller, dtype: int64

In [11]:
common=saved['desc'].value_counts().index[0]

In [13]:
saved[saved['desc']==common]['seller']

61     Kgamestrade
69     Kgamestrade
70     Kgamestrade
71     Kgamestrade
72     Kgamestrade
75     Kgamestrade
79     Kgamestrade
81     Kgamestrade
82     Kgamestrade
88     Kgamestrade
130         Atonce
137         Atonce
Name: seller, dtype: object

In [14]:
saved[saved['seller']=='Kgamestrade'][ ]

Unnamed: 0.1,Unnamed: 0,seller,ovrall_rating,rating,date,desc,comment
60,60,Kgamestrade,95%Positive feedback|7001,#icon-positive,"Jul 09, 2019",User bought DiRT Rally 2.0 + Preorder Bonus St...,
61,61,Kgamestrade,95%Positive feedback|7001,#icon-positive,"Jul 09, 2019",User bought F1 2019 Anniversary Edition Steam ...,
62,62,Kgamestrade,95%Positive feedback|7001,#icon-positive,"Jul 09, 2019",User bought The Sims 4: Island Living Origin K...,
63,63,Kgamestrade,95%Positive feedback|7001,#icon-positive,"Jul 09, 2019",User bought Sid Meier's Civilization VI: Gathe...,
64,64,Kgamestrade,95%Positive feedback|7001,#icon-positive,"Jul 09, 2019",User bought Battlefield 1 Origin Key GLOBAL,Excellent seller!
65,65,Kgamestrade,95%Positive feedback|7001,#icon-positive,"Jul 09, 2019",User bought Battlefield 1 Origin Key GLOBAL,
66,66,Kgamestrade,95%Positive feedback|7001,#icon-positive,"Jul 08, 2019",User bought Battlefield 1 Origin Key GLOBAL,work perfect :)
67,67,Kgamestrade,95%Positive feedback|7001,#icon-positive,"Jul 08, 2019",User bought RESIDENT EVIL 2 / BIOHAZARD RE:2 S...,
68,68,Kgamestrade,95%Positive feedback|7001,#icon-positive,"Jul 08, 2019",User bought Battlefield 1 Origin Key GLOBAL,
69,69,Kgamestrade,95%Positive feedback|7001,#icon-positive,"Jul 08, 2019",User bought F1 2019 Anniversary Edition Steam ...,
