# Content-based filtering with euclidean distance and count vectorizer

In [1]:
import pandas as pd 
import numpy as np
#cosine similarity
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
df_details=pd.read_csv('games_details_dataset.csv')

In [3]:
user_games = [2620, 2630] #example game ids

In [4]:
for i in range(df_details.shape[0]):
    if df_details.iloc[i]["appid"] in user_games:
        print(df_details.iloc[i]["name"])

Call of Duty
Call of Duty 2


In [5]:
def specific_genre(chosen_genre):
    remove_rows = []
    for i in range(df_details.shape[0]):
        if df_details.iloc[i]["genre_new"] != chosen_genre:
            remove_rows.append(i)
    df = df_details.drop(remove_rows, axis=0)
    user_likes = []
    for i in range(df.shape[0]):
        if user_games.count(df.iloc[i]["appid"])>0:
            user_likes.append(i)
    return df, user_likes

In [6]:
def specific_category(chosen_category):
    remove_rows = []
    for i in range(df_details.shape[0]):
        if df_details.iloc[i]["cat_new"] != chosen_category:
            remove_rows.append(i)
    df = df_details.drop(remove_rows, axis=0)
    user_likes = []
    for i in range(df.shape[0]):
        if user_games.count(df.iloc[i]["appid"])>0:
            user_likes.append(i)
    return df, user_likes

In [7]:
def positive_votes():
    remove_rows = []
    for i in range(df_details.shape[0]):
        if df_details.iloc[i]["rating_sign"] != "positive":
            remove_rows.append(i)
    df = df_details.drop(remove_rows, axis=0)
    user_likes = []
    for i in range(df.shape[0]):
        if user_games.count(df.iloc[i]["appid"])>0:
            user_likes.append(i)
    return df, user_likes

In [8]:
def more_playtime():
    remove_rows = []
    for i in range(df_details.shape[0]):
        if df_details.iloc[i]["playtime"] != "more":
            remove_rows.append(i)
    df = df_details.drop(remove_rows, axis=0)
    user_likes = []
    for i in range(df.shape[0]):
        if user_games.count(df.iloc[i]["appid"])>0:
            user_likes.append(i)
    return df, user_likes

In [9]:
df, user_likes = specific_genre("action")
#df, user_likes = specific_category("multiplayer")
#df, user_likes = positive_votes()
#df, user_likes = more_playtime()

In [10]:
for i in range(df.shape[0]):
    if i in user_likes:
        print(df.iloc[i]["name"])

Call of Duty
Call of Duty 2


## Euclidean distance

### 1st approach

We are using only the attributes: genre, categories and steam spy tags for cosine similarity.

In [11]:
def combined_features_one(row):
    return row['genres'].replace(";"," ").lower()+" "+row['categories'].replace(";"," ").lower()+" "+row['steamspy_tags'].replace(";"," ").lower()
df['cossim'] = df.apply(combined_features_one, axis =1)

In [12]:
cv = CountVectorizer()
count_matrix = cv.fit_transform(df['cossim'])
print("Count Matrix:", count_matrix.toarray())

Count Matrix: [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [13]:
eucl_dist = euclidean_distances(count_matrix)
print(eucl_dist)

[[0.         0.         2.82842712 ... 4.58257569 5.29150262 4.47213595]
 [0.         0.         2.82842712 ... 4.58257569 5.29150262 4.47213595]
 [2.82842712 2.82842712 0.         ... 4.12310563 4.47213595 4.        ]
 ...
 [4.58257569 4.58257569 4.12310563 ... 0.         3.         3.        ]
 [5.29150262 5.29150262 4.47213595 ... 3.         0.         2.82842712]
 [4.47213595 4.47213595 4.         ... 3.         2.82842712 0.        ]]


In [14]:
def find_similar_games(eucl_dist, user_likes, top_k):
    recommended_games = {}
    max_limit_similarity = 0
    max_limit= 0.05
    # repeat until we find enough recommendations
    while len(recommended_games)<20:
        max_limit_similarity+=max_limit
        recommended_games = {}
        for game in user_likes:
            for i in range(len(eucl_dist)):
                if game != i:
                    if eucl_dist[game][i] <= max_limit_similarity:
                        if not (i in user_likes):
                            if i in recommended_games.keys():
                                recommended_games[i][0] +=1
                                if eucl_dist[game][i] < recommended_games[i][1]:
                                    recommended_games[i][1] = eucl_dist[game][i]
                            else:
                                recommended_games[i] = [1, eucl_dist[game][i]]
    recommended_games = {k: v for k, v in sorted(recommended_games.items(), key=lambda item: item[1][0], reverse=True)}
    sort_dict = {}
    lists = []
    list_counter = 0
    frequency = len(eucl_dist) #infinity
    for game, sim_list in recommended_games.items():
        #first list of frequencies
        if frequency == len(eucl_dist):
            frequency = sim_list[0]
            for i in range(frequency):
                lists.append([])
        #sort the list
        if frequency > sim_list[0]:
            frequency = sim_list[0]
            lists[list_counter] = sorted(sort_dict.items(), key=lambda item: item[1][1])
            list_counter += 1
            sort_dict.clear()
        sort_dict[game] = sim_list
    lists[list_counter] = sorted(sort_dict.items(), key=lambda item: item[1][1])
    list_counter += 1
    final_sorted_list = []
    for i in range(list_counter):
        final_sorted_list+=lists[i]
    recommended_games = {k: v for k, v in final_sorted_list}
    counter = 0
    result_list = []
    for game, info in recommended_games.items():
        if counter < top_k:
            counter+=1
            result_list.append(df.iloc[game]["appid"])
    return result_list

In [15]:
recommended_games = find_similar_games(eucl_dist, user_likes, 10)
for game in recommended_games:
    for i in range(df.shape[0]):
        if df.iloc[i]["appid"] == game:
            print(df.iloc[i]["name"])
            break

Call of Duty United Offensive
Medal of Honor Airborne
Delta Force Xtreme 2
Delta Force Task Force Dagger
Delta Force Xtreme
Painkiller Resurrection
Painkiller Overdose
Aliens versus Predator Classic 2000
Project Snowblind
Call of Duty 4 Modern Warfare


### 2nd approach

We will add the attributes publisher and developer and will do the same process .

In [16]:
def combined_features_two(row):
    return row['genres'].replace(";"," ").lower()+" "+row['categories'].replace(";"," ").lower()+" "+row['steamspy_tags'].replace(";"," ").lower()+" "+row['developer'].replace(";"," ").lower()+" "+str(row['publisher']).replace(";"," ").lower()
df['cossim'] = df.apply(combined_features_two, axis =1)

In [17]:
cv = CountVectorizer()
count_matrix = cv.fit_transform(df['cossim'])
print("Count Matrix:", count_matrix.toarray())

Count Matrix: [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [18]:
eucl_dist = euclidean_distances(count_matrix)
print(eucl_dist)

[[0.         0.         2.82842712 ... 5.38516481 6.         5.09901951]
 [0.         0.         2.82842712 ... 5.38516481 6.         5.09901951]
 [2.82842712 2.82842712 0.         ... 5.         5.29150262 4.69041576]
 ...
 [5.38516481 5.38516481 5.         ... 0.         4.12310563 3.87298335]
 [6.         6.         5.29150262 ... 4.12310563 0.         3.74165739]
 [5.09901951 5.09901951 4.69041576 ... 3.87298335 3.74165739 0.        ]]


In [19]:
recommended_games = find_similar_games(eucl_dist, user_likes, 10)
for game in recommended_games:
    for i in range(df.shape[0]):
        if df.iloc[i]["appid"] == game:
            print(df.iloc[i]["name"])
            break

Call of Duty United Offensive
Call of Duty 4 Modern Warfare
TimeShift
Medal of Honor Airborne
Painkiller Resurrection
Singularity
Painkiller Overdose
Project Snowblind
Call of Duty World at War
Tom Clancy's Rainbow Six Vegas


### 3rd approach

We will add the keywords from the descriptions of the games

In [20]:
def combined_features_three(row):
    return row['genres'].replace(";"," ").lower()+" "+row['categories'].replace(";"," ").lower()+" "+row['steamspy_tags'].replace(";"," ").lower()+" "+row['developer'].replace(";"," ").lower()+" "+str(row['publisher']).replace(";"," ").lower()+" "+' '.join(map(str, row['desc_key_new'])).lower()
df['cossim'] = df.apply(combined_features_three, axis =1)

In [21]:
cv = CountVectorizer()
count_matrix = cv.fit_transform(df['cossim'])
print("Count Matrix:", count_matrix.toarray())

Count Matrix: [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [22]:
eucl_dist = euclidean_distances(count_matrix)
print(eucl_dist)

[[0.         0.         2.82842712 ... 5.38516481 6.         5.09901951]
 [0.         0.         2.82842712 ... 5.38516481 6.         5.09901951]
 [2.82842712 2.82842712 0.         ... 5.         5.29150262 4.69041576]
 ...
 [5.38516481 5.38516481 5.         ... 0.         4.12310563 3.87298335]
 [6.         6.         5.29150262 ... 4.12310563 0.         3.74165739]
 [5.09901951 5.09901951 4.69041576 ... 3.87298335 3.74165739 0.        ]]


In [23]:
recommended_games = find_similar_games(eucl_dist, user_likes, 10)
for game in recommended_games:
    for i in range(df.shape[0]):
        if df.iloc[i]["appid"] == game:
            print(df.iloc[i]["name"])
            break

Call of Duty United Offensive
Call of Duty 4 Modern Warfare
TimeShift
Medal of Honor Airborne
Painkiller Resurrection
Singularity
Painkiller Overdose
Project Snowblind
Call of Duty World at War
Tom Clancy's Rainbow Six Vegas


### 4th approach

We will use the descriptions of every game (after cleaning) instead of their keywords.

In [24]:
def combined_features_four(row):
    return row['genres'].replace(";"," ").lower()+" "+row['categories'].replace(";"," ").lower()+" "+row['steamspy_tags'].replace(";"," ").lower()+" "+row['developer'].replace(";"," ").lower()+" "+str(row['publisher']).replace(";"," ").lower()+" "+row['new_description'].lower()
df['cossim'] = df.apply(combined_features_four, axis =1)

In [25]:
cv = CountVectorizer()
count_matrix = cv.fit_transform(df['cossim'])
print("Count Matrix:", count_matrix.toarray())

Count Matrix: [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [26]:
eucl_dist = euclidean_distances(count_matrix)
print(eucl_dist)

[[ 0.          8.71779789  9.16515139 ... 10.34408043  9.53939201
   9.        ]
 [ 8.71779789  0.          9.38083152 ... 10.44030651  9.74679434
   9.11043358]
 [ 9.16515139  9.38083152  0.         ...  9.53939201  8.30662386
   7.93725393]
 ...
 [10.34408043 10.44030651  9.53939201 ...  0.          7.48331477
   7.34846923]
 [ 9.53939201  9.74679434  8.30662386 ...  7.48331477  0.
   5.29150262]
 [ 9.          9.11043358  7.93725393 ...  7.34846923  5.29150262
   0.        ]]


In [27]:
recommended_games = find_similar_games(eucl_dist, user_likes, 10)
for game in recommended_games:
    for i in range(df.shape[0]):
        if df.iloc[i]["appid"] == game:
            print(df.iloc[i]["name"])
            break

Ubersoldier II
Monster Challenge Circus
Alpha Zylon
Eradicator
Dead Space 2
Quake IV
Rogue Shooter The FPS Roguelike
Brawl
Battle Of Europe
Bet On Soldier


The last two approaches has the best results. We put as input 2 call of duty games and we took as a result a list containing call of duty games at the first places of the list.

## Top rated

We will use the ratings, which we calculated before with the Bayesian rating technique.

In [48]:
def TopRated(df):    
    all_games = {}
    top_rated = []
    for i in range(df.shape[0]):
        all_games[i] = df.iloc[i]["rating"]
    top_rated = sorted(all_games.items(), key=lambda item: item[1], reverse=True)
    return top_rated

In [49]:
top_limit = 10
counter = 0
top_rated = TopRated(df_details)
for game, rate in top_rated:
    if counter < top_limit:
        if user_games.count(df_details.iloc[game]["name"])>0:
            continue
        counter+=1
        print(df_details.iloc[game]["name"],"===>",rate)

Portal 2 ===> 9.840933043545515
Factorio ===> 9.783406771972498
The Witcher 3 Wild Hunt ===> 9.753293880115061
Portal ===> 9.73441810716475
Counter-Strike ===> 9.713712651015893
The Binding of Isaac Rebirth ===> 9.710533564390223
RimWorld ===> 9.701293015664707
Terraria ===> 9.691867457316876
Mount & Blade Warband ===> 9.675976490175762
Hotline Miami ===> 9.675027796478476
