In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
from selenium import webdriver

import random
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
def getHtml(url):
    driver = webdriver.Edge(executable_path = 'C:\\Users\\Home\\Documents\\msedgedriver.exe') # используем Selenium
    driver.get(url)

    SCROLL_PAUSE_TIME = 2
    try:
        
        lenOfPage = driver.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
        match=False
        while(match==False):
            lastCount = lenOfPage
            delay = random.randint(10, 30) / 10
            time.sleep(delay)
            lenOfPage = driver.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
            if lastCount==lenOfPage:
                match=True
        
        
    except Exception as e:
        print(e)

    time.sleep(1)
    html = driver.page_source
    soup = BeautifulSoup(html,'html.parser')
    driver.close()
    rows = soup.find('div', id="search_resultsRows")
    return rows

In [32]:
def findLabels(href):
    labels = []
    temp = requests.post(href)
    soup = BeautifulSoup(temp.text, 'html.parser')
    page = soup.find('div', class_="glance_tags_ctn popular_tags_ctn")
    i = 0
    for label in page.find_all('a'):
        i += 1
        if (i > 3):
            return labels
        else:
            labels.append(label.text.split('\t\t\t\t\t\t\t\t\t\t\t\t')[1])

In [46]:
def getData(game_type, page):
    result = []
    i = 0

    for game in page.find_all('a'):

        try:
            title = game.find('span', class_="title").text
            price = -1
            release_date = ''
            rating = ''
            review_type = ''
            labels = ['', '', '']
            
            href = game.attrs.get('href')

            labels = findLabels(href)
            price = int(game.find('div', class_="col search_price_discount_combined responsive_secondrow").attrs.get('data-price-final'))
            release_date = game.find('div', class_="col search_released responsive_secondrow").text

            review_data = game.find('span', class_="search_review_summary positive").attrs.get('data-tooltip-html').split('<br>')

            review_type = review_data[0]
            rating = review_data[1].split(' ')[0]

        except Exception as e:
            pass
        finally:
            result.append([title, price, game_type, release_date, review_type, rating, labels[0], labels[1], labels[2]])
                    
    return result

## Сделаем запрос по тегу "Инди":

In [45]:
#indie_page = getHtml('https://store.steampowered.com/search/?ignore_preferences=1&tags=492&filter=topsellers')
indie_page = getHtml('https://store.steampowered.com/search/?sort_by=_ASC&tags=492&filter=topsellers')

In [47]:
indie_games = getData("indie", indie_page)

TypeError: 'NoneType' object is not subscriptable

## Затем запрос, исключающий этот тег:

In [48]:
AAA_page = getHtml('https://store.steampowered.com/search/?ignore_preferences=1&untags=492&filter=topsellers')

In [49]:
AAA_games = getData("AAA", AAA_page)

TypeError: 'NoneType' object is not subscriptable

# Сформируем таблицу в pandas

In [None]:
cols = ['Название', 'Цена', 'Тип', 'Дата выхода', 'Отзывы', 'Рейтинг', 'Метки']

all_games = indie_games + AAA_games
df = pd.DataFrame(all_games, columns=cols)
df

## Устраним пустоты в полученных данных 

In [None]:
df = df.convert_dtypes()
df['Цена'] = df['Цена'].astype(float)
df['Цена'] = df['Цена'] / 100

In [None]:
d1 = pd.to_datetime(df['Дата выхода'], format="%d %b, %Y", errors='coerce')
d2 = pd.to_datetime(df['Дата выхода'], format="%b %Y", errors='coerce')

df['Дата выхода'] = d1.combine_first(d2)

In [None]:
df[df['Рейтинг'] == ''] 
df

## Анализируем данные

### 1. Распределение по типам

In [None]:
plt.pie(df['Тип'].value_counts(), labels = df['Тип'].unique(), autopct='%.0f%%')

## 1. Цены

### 1.1 Средняя цена игры

In [None]:
df[['Цена']].apply(np.median)

### 1.2. Зависимость между типом и ценой

In [None]:
sns.boxplot(x ='Тип', y ='Цена', data = df.loc[df['Цена']<df['Цена'].quantile(0.95)])

### 1.3 Самые дорогие игры 

In [None]:
df.sort_values(by = 'Цена', ascending = False).head(20)

## 2. Отзывы

In [None]:
df['Отзывы'].value_counts()

In [None]:
df[df['Отзывы'] == '']

In [None]:
sns.set(rc={'figure.figsize':(10,6)})
sns.countplot(df['Отзывы'])

In [None]:
sns.countplot(x='Отзывы', hue='Тип',data = df)

### Самый высокий рейтинг

In [None]:
df.sort_values(by = 'Рейтинг', ascending = False).head(20)

In [None]:
df['Рейтинг'] = (df['Рейтинг'].str.split('%').str[0]).astype(int)

sns.distplot(df['Рейтинг'])

## 3. Жанры