In [35]:
import requests, re, os
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from sqlalchemy import create_engine

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

import os
from dotenv import load_dotenv
load_dotenv()

True

In [36]:
# Build the SQLAlchemy engine for the MySQL database from credentials held in
# environment variables (the import cell calls load_dotenv() before this runs).
from urllib.parse import quote_plus

user = os.getenv('DB_USER')
password = os.getenv('DB_PASS')
host = os.getenv('DB_HOST')
db_name = os.getenv('DATABASE')

# os.getenv returns None for unset variables; interpolating that into the URL
# would silently produce the literal text "None", so fail early and name the
# variables that are missing instead.
missing_env_vars = [
    env_name
    for env_name, env_value in (
        ('DB_USER', user),
        ('DB_PASS', password),
        ('DB_HOST', host),
        ('DATABASE', db_name),
    )
    if env_value is None
]
if missing_env_vars:
    raise RuntimeError(
        'Missing required environment variable(s): ' + ', '.join(missing_env_vars)
    )

# User name and password are URL-encoded so characters such as '@', ':', '/'
# or '%' cannot break the structure of the connection URL. Values without
# special characters are unchanged by quote_plus.
engine = create_engine(
    f'mysql+pymysql://{quote_plus(user)}:{quote_plus(password)}@{host}/{db_name}?charset=utf8mb4'
)

## Popularity Based

In [None]:
# Scrape the Steam stats page for the "Top games by current player count"
# table and store one row per game (app_id, current_player, peak_today) in the
# `popularity_recommendation` table, replacing any previous contents.
url = 'https://store.steampowered.com/stats'

# Without a timeout requests can block forever on a stalled connection, and
# without raise_for_status an HTTP error page would be parsed as if it were data.
r = requests.get(url, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, 'lxml')

dict_popularity_rec = {}

# located in div "Top games by current player count"
stats_container = soup.find('div', {'id': 'detailStats'})
if stats_container is None:
    raise RuntimeError(
        f"Could not find the 'detailStats' container on {url}; the page layout may have changed."
    )

for stats_row in stats_container.find_all('tr', {'class': 'player_count_row'}):
    data = stats_row.find_all('td')
    # number of current players (thousands separators removed)
    curr_player = int(data[0].span.string.replace(',', ''))
    # peak number of players today
    peak_today = int(data[1].span.string.replace(',', ''))
    # last td holds the link to the game; the first run of digits in it is taken as the app id
    app_id = re.findall(r'(\d+)', data[-1].a.get('href'))[0]
    dict_popularity_rec[app_id] = {'current_player': curr_player, 'peak_today': peak_today}

# The table is written with if_exists='replace', so refuse to continue when
# nothing was scraped rather than replace existing data with an empty frame.
if not dict_popularity_rec:
    raise RuntimeError(f'No player-count rows were found on {url}; nothing was written.')

# Build the frame without in-place mutation so re-running the cell is idempotent.
df_popularity_rec = (
    pd.DataFrame.from_dict(dict_popularity_rec, 'index')
    .rename_axis('app_id')
    .reset_index()
)
df_popularity_rec.to_sql('popularity_recommendation', engine, if_exists='replace', index=False)

## Content Based

In [None]:
# Content-based recommendations: for every game, find the games whose short
# descriptions are most similar under TF-IDF.
# Approach based on:
# https://stackoverflow.com/questions/12118720/python-tf-idf-cosine-to-find-document-similarity
df_game_descr = pd.read_sql_query(
    '''
        SELECT 
            app_id, 
            short_description 
        FROM steam_app_details 
        WHERE short_description IS NOT NULL
        AND type = "game" 
        AND name IS NOT NULL
        AND release_date <= CURDATE() 
        AND initial_price IS NOT NULL
    ''', engine)

# TfidfVectorizer L2-normalises each row by default, so the dot product
# computed by linear_kernel below equals the cosine similarity.
tfidf = TfidfVectorizer(strip_accents='unicode', stop_words='english').fit_transform(df_game_descr['short_description'].tolist())
app_ids = df_game_descr['app_id'].tolist()
dict_content_rec = {}

for row_index in range(tfidf.shape[0]):
    cosine_similarities = linear_kernel(tfidf[row_index:row_index+1], tfidf).flatten()
    # Rank all games from most to least similar. The stable sort makes the
    # order of tied scores deterministic (lower row index first).
    ranked_indices = np.argsort(-cosine_similarities, kind='stable')
    # Drop the game itself explicitly: it is not guaranteed to rank first when
    # another game has an identical description (tie at 1.0) or when its own
    # description contains only stop words (all similarities are 0).
    # Then keep the indices of the top 100 most similar games.
    related_games = ranked_indices[ranked_indices != row_index][:100]
    dict_content_rec[app_ids[row_index]] = [app_ids[i] for i in related_games]

# NOTE: persisting the content-based result is currently disabled, so
# dict_content_rec only exists in memory after this cell runs.
# df_content_rec = pd.DataFrame.from_dict(dict_content_rec, 'index')
# df_content_rec.index.name = 'app_id'
# df_content_rec.reset_index(inplace=True)
# df_content_rec.to_sql('recommended_games_content_based',engine,if_exists='replace', chunksize = 1000, index = False)

## Item Based

In [None]:
# Item-based recommendations: two games are similar when they are owned (and
# played for more than 15 units of playtime_forever) by the same users.
# df_purchase is an app_id x user_id matrix of ownership counts, 0 where the
# user does not own the game.
df_purchase = pd.read_sql_query(
    '''
    SELECT app_id, user_id         
    FROM steam_owned_games
    WHERE playtime_forever > 15
    ''', engine).pivot_table(values = 'user_id', index = ['app_id'], columns = ['user_id'], aggfunc = len, fill_value = 0)

purchase_matrix = df_purchase.values
app_ids = df_purchase.index
dict_item_rec = {}

# linear_kernel is a plain dot product; it only equals cosine similarity when
# the rows have unit length. The raw counts are not normalised, so without this
# step the score would be the number of shared owners and widely owned games
# would dominate every list. Normalising once also avoids re-validating the
# full integer matrix on every loop iteration.
row_norms = np.linalg.norm(purchase_matrix, axis=1, keepdims=True)
row_norms[row_norms == 0] = 1.0  # guard against division by zero for an all-zero row
purchase_unit_rows = purchase_matrix / row_norms

for index in range(purchase_unit_rows.shape[0]):
    cosine_similarities = linear_kernel(purchase_unit_rows[index:index+1], purchase_unit_rows).flatten()
    # Most similar first; the stable sort keeps tied scores in a deterministic order.
    ranked_indices = np.argsort(-cosine_similarities, kind='stable')
    # Exclude the game itself explicitly instead of assuming it ranks first:
    # games with an identical owner set tie at 1.0 and could take its place.
    related_purchases = ranked_indices[ranked_indices != index][:100]
    dict_item_rec[app_ids[index]] = [app_ids[i] for i in related_purchases]

# One row per game: app_id followed by its recommended app ids, best match first.
df_item_based_result = (
    pd.DataFrame.from_dict(dict_item_rec, 'index')
    .rename_axis('app_id')
    .reset_index()
)
df_item_based_result.to_sql('recommended_games_item_based', engine, if_exists='replace', chunksize = 1000, index = False)