In [1]:
import os
import re
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import requests
import yaml
from bs4 import BeautifulSoup
from sqlalchemy import create_engine

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel,cosine_similarity
from sklearn.cluster import KMeans

from pyspark.ml.recommendation import ALS
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [2]:
# Database connection settings are read from the environment so that no
# credential is stored in the notebook. STEAM_DB_PASSWORD is required; the
# other settings fall back to the local development defaults.
user = os.environ.get('STEAM_DB_USER', 'root')
password = os.environ.get('STEAM_DB_PASSWORD')
if password is None:
    raise RuntimeError('Set the STEAM_DB_PASSWORD environment variable before running this notebook.')
host = os.environ.get('STEAM_DB_HOST', '127.0.0.1')
db_name = os.environ.get('STEAM_DB_NAME', 'steam')

# quote_plus escapes characters such as '@', ':' and '/' that would otherwise
# be parsed as URL delimiters and corrupt the connection string.
engine = create_engine(
    f'mysql+pymysql://{quote_plus(user)}:{quote_plus(password)}@{host}/{db_name}?charset=utf8mb4'
)

In [7]:
# Content-based recommendations: for each seed game, find the games whose
# short descriptions are most similar under TF-IDF + cosine similarity.
# https://stackoverflow.com/questions/12118720/python-tf-idf-cosine-to-find-document-similarity

# Number of games (taken from the top of the query result) to build
# recommendations for, and number of similar games kept per seed game.
N_SEED_GAMES = 10
N_RECOMMENDATIONS = 100

df_game_descr = pd.read_sql_query(
    '''
        SELECT 
            app_id, 
            short_description 
        FROM steam_app_details 
        WHERE short_description IS NOT NULL
        AND type = "game" 
        AND name IS NOT NULL
        AND release_date <= CURDATE() 
        AND initial_price IS NOT NULL
    ''', engine)

app_ids = df_game_descr['app_id'].tolist()
# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# computed by linear_kernel below is the cosine similarity.
tfidf = TfidfVectorizer(strip_accents='unicode', stop_words='english').fit_transform(df_game_descr['short_description'].tolist())

dict_content_rec = {}
# Never iterate past the number of rows actually returned by the query.
for row_index in range(min(N_SEED_GAMES, len(app_ids))):
    cosine_similarities = linear_kernel(tfidf[row_index:row_index+1], tfidf).flatten()
    # Row positions ordered from most to least similar.
    ranked = cosine_similarities.argsort()[::-1]
    # Drop the seed game explicitly instead of assuming it sorts first: a
    # description with no usable terms (all-zero vector) or an exact duplicate
    # description can place the seed game anywhere in the ranking.
    related_games = ranked[ranked != row_index][:N_RECOMMENDATIONS]
    dict_content_rec[app_ids[row_index]] = [app_ids[i] for i in related_games]

# One row per seed game: 'app_id' followed by columns 0..N-1 holding the
# recommended app_ids in descending order of similarity.
df_content_rec = (
    pd.DataFrame.from_dict(dict_content_rec, orient='index')
    .rename_axis('app_id')
    .reset_index()
)

In [8]:
# Display the recommendation table: one row per seed app_id, with the
# remaining columns holding the app_ids recommended for it.
df_content_rec

Unnamed: 0,app_id,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,1494550,351290,1196350,1342480,893450,269530,1274490,242680,1055460,870160,...,943950,557780,690350,346920,372750,1212110,841140,397020,671600,495420
1,1493310,798380,232050,853760,1151650,1300190,1126450,660900,448810,1152330,...,527760,409070,1431690,1206900,466770,511610,1101260,647050,325090,504110
2,1493400,1388150,889450,1210830,1371110,585690,867730,586340,873530,453650,...,390620,1188000,1051990,991230,1229730,1206870,809470,898250,1074610,897360
3,1493410,705070,982180,381000,450120,644570,1177070,1355140,1225920,777580,...,993650,1380470,912820,1383730,1038120,1221430,807910,1241370,1353670,947270
4,1492930,1487360,1164050,1175970,1477870,1087060,1477900,388800,1229200,874370,...,803280,553000,1107580,1299710,849740,751980,781950,848100,1011240,466100
5,1491680,926070,552630,302690,538100,863380,816920,945490,970560,697730,...,730310,712530,1073900,328430,903680,1033860,851820,664750,671260,1074690
6,1491880,364720,952020,952200,952140,952130,952120,952090,952050,952040,...,950210,950160,950140,950130,950050,949970,949960,949930,949910,949890
7,1490790,1192500,1480010,1020660,920770,765400,1199780,561680,1202710,590600,...,1201880,709340,850970,988640,1246140,854680,692830,1249410,1352480,685970
8,1488960,963940,884240,766000,1310050,1355240,360380,967410,358070,878140,...,1263410,720640,822770,325110,538040,1224020,400630,489080,830710,582290
9,1489060,981480,1164020,282590,1211790,723350,1443360,1081340,661820,688780,...,376730,544820,942520,461940,903560,867760,427970,900490,350480,342980
