# In this notebook we are going to generate the two machine learning models requested:

- Recommend 5 similar games given the game id.
- Recomment 5 games to a user based on the games liked by similar users.

## Similar game recommendation

### Import required libraries

In [1]:
import pandas as pd
import numpy as np
import sklearn as skl

### Prepare Dataset

In [2]:
df_steamGames = pd.read_csv('APIData/df_steamGames.csv')
df_steamGames

Unnamed: 0,genres,app_name,release_date,price,id,developer
0,"['Action', 'Casual', 'Indie', 'Simulation', 'S...",Lost Summoner Kitty,2018-01-04,4.99,761140.0,Kotoshiro
1,"['Free to Play', 'Indie', 'RPG', 'Strategy']",Ironbound,2018-01-04,0.00,643980.0,Secret Level SRL
2,"['Casual', 'Free to Play', 'Indie', 'Simulatio...",Real Pool 3D - Poolians,2017-07-24,0.00,670290.0,Poolians.com
3,"['Action', 'Adventure', 'Casual']",弹炸人2222,2017-12-07,0.99,767400.0,彼岸领域
4,"['Action', 'Indie', 'Casual', 'Sports']",Log Challenge,2016-04-21,2.99,773570.0,
...,...,...,...,...,...,...
32126,"['Casual', 'Indie', 'Simulation', 'Strategy']",Colony On Mars,2018-01-04,1.99,773640.0,"Nikita ""Ghost_RUS"""
32127,"['Casual', 'Indie', 'Strategy']",LOGistICAL: South Africa,2018-01-04,4.99,733530.0,Sacada
32128,"['Indie', 'Racing', 'Simulation']",Russian Roads,2018-01-04,1.99,610660.0,Laush Dmitriy Sergeevich
32129,"['Casual', 'Indie']",EXIT 2 - Directions,2017-09-02,4.99,658870.0,"xropi,stev3ns"


#### Create a new column that gives the average forever time played by user.

In [3]:
df_userItems = pd.read_csv('APIData/df_userItems.csv')
SIDs = df_userItems['steam_id'].values
df_userItems_r = pd.DataFrame()
for sid in SIDs:
    fn_userItems_i = 'APIData/ItemsData/itemsData_'+str(sid)+'.csv'
    try:
        df_userItems_i = pd.read_csv(fn_userItems_i)
        df_userItems_i.drop(columns=['item_name','playtime_2weeks'],inplace=True)
        df_userItems_r = pd.concat([df_userItems_r,df_userItems_i],ignore_index=True)
    except:
        pass
ids = df_steamGames['id'].values
TPU = []
for id in ids:
    df_userItems_r_id = df_userItems_r[df_userItems_r['item_id'] == id]
    n = len(df_userItems_r_id)
    if n>0:
        s = df_userItems_r_id['playtime_forever'].sum()
        TPU.append(s/n)
    else:
        TPU.append(0)
df_steamGames['tpu'] = TPU

#### Replace null values and remove and save `id` column

In genres column we will replace nan with [] and in developer we will replace with "unknown"

In [4]:
df_steamGames['genres'].fillna("[]",inplace=True)
df_steamGames['developer'].fillna("unknown",inplace=True)
id_games = df_steamGames['id'].values
df_steamGames.drop(columns='id',inplace=True)
df_steamGames.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32131 entries, 0 to 32130
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   genres        32131 non-null  object 
 1   app_name      32131 non-null  object 
 2   release_date  32131 non-null  object 
 3   price         32131 non-null  float64
 4   developer     32131 non-null  object 
 5   tpu           32131 non-null  float64
dtypes: float64(2), object(4)
memory usage: 1.5+ MB


#### Convert `release_date` column to integer taking days since the release date of the first game in this data base

In [5]:
dateTimeSeries = pd.to_datetime(df_steamGames['release_date'])
df_steamGames['days'] = [i.days for i in dateTimeSeries - dateTimeSeries.min()]
df_steamGames.drop(columns='release_date',inplace=True)
df_steamGames

Unnamed: 0,genres,app_name,price,developer,tpu,days
0,"['Action', 'Casual', 'Indie', 'Simulation', 'S...",Lost Summoner Kitty,4.99,Kotoshiro,0.0,17340
1,"['Free to Play', 'Indie', 'RPG', 'Strategy']",Ironbound,0.00,Secret Level SRL,0.0,17340
2,"['Casual', 'Free to Play', 'Indie', 'Simulatio...",Real Pool 3D - Poolians,0.00,Poolians.com,0.0,17176
3,"['Action', 'Adventure', 'Casual']",弹炸人2222,0.99,彼岸领域,0.0,17312
4,"['Action', 'Indie', 'Casual', 'Sports']",Log Challenge,2.99,unknown,0.0,16717
...,...,...,...,...,...,...
32126,"['Casual', 'Indie', 'Simulation', 'Strategy']",Colony On Mars,1.99,"Nikita ""Ghost_RUS""",0.0,17340
32127,"['Casual', 'Indie', 'Strategy']",LOGistICAL: South Africa,4.99,Sacada,0.0,17340
32128,"['Indie', 'Racing', 'Simulation']",Russian Roads,1.99,Laush Dmitriy Sergeevich,0.0,17340
32129,"['Casual', 'Indie']",EXIT 2 - Directions,4.99,"xropi,stev3ns",0.0,17216


#### Preprocess text in `app_name` and `developer` columns

In [6]:
import nltk
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

porter_stemmer = PorterStemmer()

def textpp(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = word_tokenize(text)
    filtered_words = [word for word in words if word not in stop_words]
    stemmed_words = [porter_stemmer.stem(word) for word in filtered_words]
    return " ".join(stemmed_words)

app_names = df_steamGames['app_name'].values
app_names_pp = []
for text in app_names:
    app_names_pp.append(textpp(text))

developer = df_steamGames['developer'].values
developer_pp = []
for text in developer:
    developer_pp.append(textpp(text))

df_steamGames['app_name'] = app_names_pp
df_steamGames['developer'] = developer_pp
    


In [7]:
df_steamGames

Unnamed: 0,genres,app_name,price,developer,tpu,days
0,"['Action', 'Casual', 'Indie', 'Simulation', 'S...",lost summon kitti,4.99,kotoshiro,0.0,17340
1,"['Free to Play', 'Indie', 'RPG', 'Strategy']",ironbound,0.00,secret level srl,0.0,17340
2,"['Casual', 'Free to Play', 'Indie', 'Simulatio...",real pool 3d poolian,0.00,poolianscom,0.0,17176
3,"['Action', 'Adventure', 'Casual']",弹炸人2222,0.99,彼岸领域,0.0,17312
4,"['Action', 'Indie', 'Casual', 'Sports']",log challeng,2.99,unknown,0.0,16717
...,...,...,...,...,...,...
32126,"['Casual', 'Indie', 'Simulation', 'Strategy']",coloni mar,1.99,nikita ghostru,0.0,17340
32127,"['Casual', 'Indie', 'Strategy']",logist south africa,4.99,sacada,0.0,17340
32128,"['Indie', 'Racing', 'Simulation']",russian road,1.99,laush dmitriy sergeevich,0.0,17340
32129,"['Casual', 'Indie']",exit 2 direct,4.99,xropistev3n,0.0,17216


#### Convert `genres` column to dummy variables

In [8]:
import ast
import numpy as np

def Union(lst1, lst2):
    final_list = list(set(lst1) | set(lst2))
    return final_list

Genres = []
genresSeries = df_steamGames['genres'].values
for g in genresSeries:
    try:
        gl = ast.literal_eval(g)
        Genres = Union(Genres,gl)
    except:
        pass

def genreRow(genresListText,GenresList):
    row = np.zeros((1,len(GenresList)))
    df_row = pd.DataFrame(row,columns=GenresList)
    genresList = ast.literal_eval(genresListText)
    for genre in genresList:
        df_row.loc[0,genre] = 1
    return(df_row)

df_dummyGenres = pd.DataFrame()
for g in genresSeries:
    df_dummyGenres = pd.concat([df_dummyGenres,genreRow(g,Genres)],ignore_index=True)

df_dummyGenres


Unnamed: 0,Artificial Intelligence,Dark Fantasy,Education,Stylized,Mars,Gore,Sandbox,Mini Golf,Colorful,Golf,...,Robots,Post-apocalyptic,Shoot 'Em Up,Local Co-Op,Trains,Martial Arts,Lemmings,Space Sim,Building,Indie
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
32127,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
32128,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
32129,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [9]:
df_steamGames = pd.concat([df_steamGames,df_dummyGenres],axis=1)
df_steamGames.drop(columns='genres',inplace=True)
df_steamGames

Unnamed: 0,app_name,price,developer,tpu,days,Artificial Intelligence,Dark Fantasy,Education,Stylized,Mars,...,Robots,Post-apocalyptic,Shoot 'Em Up,Local Co-Op,Trains,Martial Arts,Lemmings,Space Sim,Building,Indie
0,lost summon kitti,4.99,kotoshiro,0.0,17340,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,ironbound,0.00,secret level srl,0.0,17340,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,real pool 3d poolian,0.00,poolianscom,0.0,17176,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,弹炸人2222,0.99,彼岸领域,0.0,17312,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,log challeng,2.99,unknown,0.0,16717,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32126,coloni mar,1.99,nikita ghostru,0.0,17340,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
32127,logist south africa,4.99,sacada,0.0,17340,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
32128,russian road,1.99,laush dmitriy sergeevich,0.0,17340,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
32129,exit 2 direct,4.99,xropistev3n,0.0,17216,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### Model Pipeline

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler # For columns 'price', 'tpu' and 'days'
from sklearn.feature_extraction.text import TfidfVectorizer # For columns 'app_name' and 'developer'
from sklearn.metrics.pairwise import cosine_similarity # Model

# initialise scaler and vectorizers
scalerPrice = MinMaxScaler()
scalerTPU = MinMaxScaler()
scalerDays = MinMaxScaler()
vectAppName = TfidfVectorizer()
vectDev = TfidfVectorizer()


# construct the column transfomer
column_transformer = ColumnTransformer(
    [('scalerPrice', scalerPrice, ['price']),
    ('scalerTPU', scalerTPU, ['tpu']),
    ('scalerDays', scalerDays, ['days']),
    ('vectAppName', vectAppName, 'app_name'),
    ('vectDev', vectDev, 'developer')],
    remainder='passthrough')

# transform dataframe
df_steamGames_trans = column_transformer.fit_transform(df_steamGames)

# calculate cosine similarity
cosine_sim = cosine_similarity(df_steamGames_trans)
df_cosineSim = pd.DataFrame(cosine_sim)
df_cosineSim



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32121,32122,32123,32124,32125,32126,32127,32128,32129,32130
0,1.000000,0.388737,0.489984,0.420543,0.523219,0.420649,0.388741,0.388739,0.388732,0.525091,...,0.421643,0.461974,0.299939,0.461976,0.525090,0.661441,0.568194,0.420648,0.461568,0.459867
1,0.388737,1.000000,0.388036,0.134224,0.266887,0.134409,0.416139,0.416131,0.416115,0.270177,...,0.293778,0.321078,0.494531,0.321078,0.270178,0.416142,0.450301,0.292356,0.320423,0.234571
2,0.489984,0.388036,1.000000,0.272056,0.522689,0.272193,0.661014,0.661001,0.660975,0.524526,...,0.273508,0.461141,0.298928,0.461140,0.388036,0.524527,0.419888,0.419888,0.460738,0.338906
3,0.420543,0.134224,0.272056,1.000000,0.447333,0.487145,0.134225,0.134223,0.134220,0.292208,...,0.488293,0.347258,0.159511,0.347258,0.608174,0.292207,0.316194,0.145242,0.346551,0.393168
4,0.523219,0.266887,0.522689,0.447333,1.000000,0.288796,0.413497,0.413493,0.413481,0.413497,...,0.448443,0.491397,0.317168,0.491398,0.560104,0.413496,0.447439,0.288796,0.490990,0.490642
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32126,0.661441,0.416142,0.524527,0.292207,0.413496,0.292357,0.416142,0.416137,0.416123,0.562107,...,0.293780,0.494542,0.321076,0.494542,0.416143,1.000000,0.608248,0.450302,0.494105,0.363428
32127,0.568194,0.450301,0.419888,0.316194,0.447439,0.145446,0.292361,0.292362,0.292360,0.450304,...,0.317897,0.535137,0.347440,0.535140,0.450302,0.608248,1.000000,0.316355,0.534666,0.253829
32128,0.420648,0.292356,0.419888,0.145242,0.288796,0.316355,0.450302,0.450296,0.450281,0.608248,...,0.317895,0.347435,0.347432,0.347436,0.292356,0.450302,0.316355,1.000000,0.346727,0.393261
32129,0.461568,0.320423,0.460738,0.346551,0.490990,0.158791,0.320429,0.320431,0.320428,0.494107,...,0.348407,0.587192,0.380796,0.587195,0.494105,0.494105,0.534666,0.346727,1.000000,0.278216


In [11]:
# Save matrix
#df_cosineSim.to_csv('APIData/MLData/df_cosineSim_gR.csv')

In [12]:
df_cosineSim_triu = df_cosineSim.where(np.triu(np.ones((32131,32131)),1)==1)
df_cosineSim_triu

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32121,32122,32123,32124,32125,32126,32127,32128,32129,32130
0,,0.388737,0.489984,0.420543,0.523219,0.420649,0.388741,0.388739,0.388732,0.525091,...,0.421643,0.461974,0.299939,0.461976,0.525090,0.661441,0.568194,0.420648,0.461568,0.459867
1,,,0.388036,0.134224,0.266887,0.134409,0.416139,0.416131,0.416115,0.270177,...,0.293778,0.321078,0.494531,0.321078,0.270178,0.416142,0.450301,0.292356,0.320423,0.234571
2,,,,0.272056,0.522689,0.272193,0.661014,0.661001,0.660975,0.524526,...,0.273508,0.461141,0.298928,0.461140,0.388036,0.524527,0.419888,0.419888,0.460738,0.338906
3,,,,,0.447333,0.487145,0.134225,0.134223,0.134220,0.292208,...,0.488293,0.347258,0.159511,0.347258,0.608174,0.292207,0.316194,0.145242,0.346551,0.393168
4,,,,,,0.288796,0.413497,0.413493,0.413481,0.413497,...,0.448443,0.491397,0.317168,0.491398,0.560104,0.413496,0.447439,0.288796,0.490990,0.490642
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32126,,,,,,,,,,,...,,,,,,,0.608248,0.450302,0.494105,0.363428
32127,,,,,,,,,,,...,,,,,,,,0.316355,0.534666,0.253829
32128,,,,,,,,,,,...,,,,,,,,,0.346727,0.393261
32129,,,,,,,,,,,...,,,,,,,,,,0.278216


In [13]:
df_cosineSim_triu.to_csv('APIData/MLData/df_cosineSim_gR.csv')

In [14]:
df_cosineSim_triu

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32121,32122,32123,32124,32125,32126,32127,32128,32129,32130
0,,0.388737,0.489984,0.420543,0.523219,0.420649,0.388741,0.388739,0.388732,0.525091,...,0.421643,0.461974,0.299939,0.461976,0.525090,0.661441,0.568194,0.420648,0.461568,0.459867
1,,,0.388036,0.134224,0.266887,0.134409,0.416139,0.416131,0.416115,0.270177,...,0.293778,0.321078,0.494531,0.321078,0.270178,0.416142,0.450301,0.292356,0.320423,0.234571
2,,,,0.272056,0.522689,0.272193,0.661014,0.661001,0.660975,0.524526,...,0.273508,0.461141,0.298928,0.461140,0.388036,0.524527,0.419888,0.419888,0.460738,0.338906
3,,,,,0.447333,0.487145,0.134225,0.134223,0.134220,0.292208,...,0.488293,0.347258,0.159511,0.347258,0.608174,0.292207,0.316194,0.145242,0.346551,0.393168
4,,,,,,0.288796,0.413497,0.413493,0.413481,0.413497,...,0.448443,0.491397,0.317168,0.491398,0.560104,0.413496,0.447439,0.288796,0.490990,0.490642
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32126,,,,,,,,,,,...,,,,,,,0.608248,0.450302,0.494105,0.363428
32127,,,,,,,,,,,...,,,,,,,,0.316355,0.534666,0.253829
32128,,,,,,,,,,,...,,,,,,,,,0.346727,0.393261
32129,,,,,,,,,,,...,,,,,,,,,,0.278216


In [79]:
def similars(i):
    n = len(df_cosineSim_triu)
    v = range(i)
    h = range(i+1,n)
    vr = df_cosineSim_triu.loc[v,i]
    hr = df_cosineSim_triu.loc[i,h]
    sa = pd.concat([vr,hr])
    top5 = sa.sort_values(ascending=False).iloc[:5]
    top5_i = list(top5.index)

    return(top5_i)

In [83]:
df_top5 = {}
for i in range(len(df_cosineSim_triu)):
    df_top5[i] = similars(i)
df_top5 = pd.DataFrame(df_top5)
df_top5

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32121,32122,32123,32124,32125,32126,32127,32128,32129,32130
0,12937,31973,14557,52,11486,15422,8,8,7,13660,...,15206,24553,26989,16383,13831,20875,19078,12781,8198,17253
1,15128,11059,7260,13643,19028,2942,7,6,6,21946,...,24116,22832,11228,16310,15049,12786,11828,18943,12406,5997
2,15198,10848,24722,15529,8974,2773,16,16,16,27966,...,22664,31167,31978,11535,15402,13053,11102,8821,11056,21640
3,15516,6038,12808,16043,21490,2313,18,18,18,13418,...,22663,11535,12993,19261,11448,13029,14228,8393,11535,7395
4,15527,6037,12823,16149,14438,11947,1647,1647,1647,15029,...,22503,19261,13199,13,11589,13075,15472,21469,19261,8835


In [82]:
df_top5.to_csv('APIData/MLData/df_top5.csv',index=False)