# ---- RECOMMENDATION MODEL: ITEM TO ITEM ----

In [1]:
import pandas as pd                                             # ---> Libraries to be used
from sklearn.feature_extraction.text import CountVectorizer     
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df_games = pd.read_csv('./Datasets/processing/games_raw.csv')       # ---> Reading of csv files previously processed with ETL techniques

## 1. Handling of relevant information

In [3]:
df_games = df_games.rename(columns={'id':'item_id'})        # ---> In order to keep consistency 'id' column is renamed to 'item_id'
df_games.head()

Unnamed: 0,app_name,specs,item_id
0,Lost Summoner Kitty,['Single-player'],761140
1,Ironbound,"['Single-player', 'Multi-player', 'Online Mult...",643980
2,Real Pool 3D - Poolians,"['Single-player', 'Multi-player', 'Online Mult...",670290
3,弹炸人2222,['Single-player'],767400
4,Log Challenge,"['Single-player', 'Full controller support', '...",773570


In [4]:
df_games = df_games.iloc[:4000]                             # ---> A sample of 4000 records is picked for running the ML model

In [5]:
df_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   app_name  4000 non-null   object
 1   specs     4000 non-null   object
 2   item_id   4000 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 93.9+ KB


In [6]:
df_games1 = df_games.loc[:, ['app_name','specs', 'item_id']]   # ---> filter DataFrame with the columns used for development
df_games1.head()

Unnamed: 0,app_name,specs,item_id
0,Lost Summoner Kitty,['Single-player'],761140
1,Ironbound,"['Single-player', 'Multi-player', 'Online Mult...",643980
2,Real Pool 3D - Poolians,"['Single-player', 'Multi-player', 'Online Mult...",670290
3,弹炸人2222,['Single-player'],767400
4,Log Challenge,"['Single-player', 'Full controller support', '...",773570


In [7]:
# ---> Certain special characters are removed from the 'specs' column
df_games1['specs'] = df_games1['specs'].apply(lambda x: str(x).replace('[', '').replace(']', '').replace("'", ''))

In [8]:
df_games1.head()

Unnamed: 0,app_name,specs,item_id
0,Lost Summoner Kitty,Single-player,761140
1,Ironbound,"Single-player, Multi-player, Online Multi-Play...",643980
2,Real Pool 3D - Poolians,"Single-player, Multi-player, Online Multi-Play...",670290
3,弹炸人2222,Single-player,767400
4,Log Challenge,"Single-player, Full controller support, HTC Vi...",773570


In [9]:
df_games1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   app_name  4000 non-null   object
 1   specs     4000 non-null   object
 2   item_id   4000 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 93.9+ KB


## 2. Model Development

---> Precepts: The model recommendation to be developed is ITEM by ITEM. The measure to be used to develop the model is the "cosine similarity".

### 2.1 CountVectorizer

---> Objective: Creation of a text vectorizer (CountVectorizer) to convert the text in the 'specs' column into numeric elements (vectors).

---> Process: The vectorizer assigns a number to each unique word present in the text and counts its frequency. Each document (in this case, each set) is represented as a vector where each position corresponds to the frequency of a word.

In [10]:
cv = CountVectorizer()                              # ---> Creation of the text vectorizer
vectors = cv.fit_transform(df_games1['specs']).toarray()

### 2.2 Cosine Similarity

---> Objective: To evaluate the similarity between numerical vectors of different sets.

---> Process: The cosine similarity metric, which measures the cosine of the angle between two vectors, is used. The closer to 1, the more similar the vectors are.

In [11]:
similar = cosine_similarity(vectors)

In [12]:
print(len(similar))

4000


### 2.3 Recommendation Function

Creation of the 'recommendation' function that takes as input the ID of a game, and returns a list of the five (5) most recommended games based on the cosine similarity between the vectors.The function returns the list of recommended titles.

In [13]:
def recommendation(game):
    # ---> Location of the index of the set in the DataFrame. This index is used to access the corresponding row in the similarity matrix.
    game_index = df_games1[df_games1['item_id'] == game].index[0]
    
    # ---> Similarity Calculation. The similarity distances between the input set and the other sets in the data set are calculated.
    distances = similar[game_index]
    
   # ---> The distances are ordered in descending order, and the five (5) games with the highest similarity (excluding the entry game) are selected.
    game_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]
    
    # ---> Recommendation: returns the recommended games list.
    recommended_titles = [df_games1.iloc[i[0]]['app_name'] for i in game_list]
    
    return recommended_titles

In [14]:
# ---> Apply the function to the column 'item_id' and create a new column named 'Recommendations'
df_games1['recommendations'] = df_games1['item_id'].apply(recommendation)

In [15]:
# ---> Remove unnecessary columns
df_games1.drop(columns=['app_name', 'specs'], inplace=True)
df_games1.head()

Unnamed: 0,item_id,recommendations
0,761140,"[弹炸人2222, Uncanny Islands, Beach Rules, Planet..."
1,643980,"[SNOW, Space Hulk, BattleLore: Command, Interp..."
2,670290,"[Assassin’s Creed® IV Black Flag™, DEFCON, RAC..."
3,767400,"[弹炸人2222, Uncanny Islands, Beach Rules, Planet..."
4,773570,"[Icarus Six Sixty Six, InMind VR, Panoptic, Af..."


In [16]:
df_games1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   item_id          4000 non-null   int64 
 1   recommendations  4000 non-null   object
dtypes: int64(1), object(1)
memory usage: 62.6+ KB


In [17]:
df_games1.to_csv('./Datasets/ml/item_item_recommendation.csv', encoding='utf-8', index=False)