# Content-based filtering with cosine similarity and TFIDF

In [1]:
import pandas as pd 
import numpy as np
#cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
df_details=pd.read_csv('games_details_dataset.csv')

In [3]:
user_games = [2620, 2630] #example game ids

In [4]:
for i in range(df_details.shape[0]):
    if df_details.iloc[i]["appid"] in user_games:
        print(df_details.iloc[i]["name"])

Call of Duty
Call of Duty 2


In [5]:
def specific_genre(chosen_genre):
    remove_rows = []
    for i in range(df_details.shape[0]):
        if df_details.iloc[i]["genre_new"] != chosen_genre:
            remove_rows.append(i)
    df = df_details.drop(remove_rows, axis=0)
    user_likes = []
    for i in range(df.shape[0]):
        if user_games.count(df.iloc[i]["appid"])>0:
            user_likes.append(i)
    return df, user_likes

In [6]:
def specific_category(chosen_category):
    remove_rows = []
    for i in range(df_details.shape[0]):
        if df_details.iloc[i]["cat_new"] != chosen_category:
            remove_rows.append(i)
    df = df_details.drop(remove_rows, axis=0)
    user_likes = []
    for i in range(df.shape[0]):
        if user_games.count(df.iloc[i]["appid"])>0:
            user_likes.append(i)
    return df, user_likes

In [7]:
def positive_votes():
    remove_rows = []
    for i in range(df_details.shape[0]):
        if df_details.iloc[i]["rating_sign"] != "positive":
            remove_rows.append(i)
    df = df_details.drop(remove_rows, axis=0)
    user_likes = []
    for i in range(df.shape[0]):
        if user_games.count(df.iloc[i]["appid"])>0:
            user_likes.append(i)
    return df, user_likes

In [8]:
def more_playtime():
    remove_rows = []
    for i in range(df_details.shape[0]):
        if df_details.iloc[i]["playtime"] != "more":
            remove_rows.append(i)
    df = df_details.drop(remove_rows, axis=0)
    user_likes = []
    for i in range(df.shape[0]):
        if user_games.count(df.iloc[i]["appid"])>0:
            user_likes.append(i)
    return df, user_likes

In [9]:
df, user_likes = specific_genre("action")
#df, user_likes = specific_category("multiplayer")
#df, user_likes = positive_votes()
#df, user_likes = more_playtime()

In [10]:
for i in range(df.shape[0]):
    if i in user_likes:
        print(df.iloc[i]["name"])

Call of Duty
Call of Duty 2


## Cosine similarity

### 1st approach

We are using only the attributes: genre, categories and steam spy tags for cosine similarity.

In [11]:
def combined_features_one(row):
    return row['genres'].replace(";"," ").lower()+" "+row['categories'].replace(";"," ").lower()+" "+row['steamspy_tags'].replace(";"," ").lower()
df['cossim'] = df.apply(combined_features_one, axis =1)

In [12]:
cv = CountVectorizer()
count_matrix = cv.fit_transform(df['cossim'])
print("Count Matrix:", count_matrix.toarray())

Count Matrix: [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [13]:
cosine_sim = cosine_similarity(count_matrix)
print(cosine_sim)

[[1.         1.         0.79056942 ... 0.40414519 0.17928429 0.41833001]
 [1.         1.         0.79056942 ... 0.40414519 0.17928429 0.41833001]
 [0.79056942 0.79056942 1.         ... 0.27386128 0.09449112 0.28347335]
 ...
 [0.40414519 0.40414519 0.27386128 ... 1.         0.69006556 0.69006556]
 [0.17928429 0.17928429 0.09449112 ... 0.69006556 1.         0.71428571]
 [0.41833001 0.41833001 0.28347335 ... 0.69006556 0.71428571 1.        ]]


In [15]:
def find_similar_games(cosine_sim, user_likes, top_k):
    recommended_games = {}
    lower_limit_similarity = 0.95
    min_limit= 0.05
    # repeat until we find enough recommendations
    while len(recommended_games)<20:
        lower_limit_similarity-=min_limit
        recommended_games = {}
        for game in user_likes:
            for i in range(len(cosine_sim)):
                if game != i:
                    if cosine_sim[game][i] > lower_limit_similarity:
                        if not (i in user_likes):
                            if i in recommended_games.keys():
                                recommended_games[i][0] +=1
                                if cosine_sim[game][i] > recommended_games[i][1]:
                                    recommended_games[i][1] = cosine_sim[game][i]
                            else:
                                recommended_games[i] = [1, cosine_sim[game][i]]
    recommended_games = {k: v for k, v in sorted(recommended_games.items(), key=lambda item: item[1][0], reverse=True)}
    sort_dict = {}
    lists = []
    list_counter = 0
    frequency = len(cosine_sim) #infinity
    for game, sim_list in recommended_games.items():
        #first list of frequencies
        if frequency == len(cosine_sim):
            frequency = sim_list[0]
            for i in range(frequency):
                lists.append([])
        #sort the list
        if frequency > sim_list[0]:
            frequency = sim_list[0]
            lists[list_counter] = sorted(sort_dict.items(), key=lambda item: item[1][1], reverse=True)
            list_counter += 1
            sort_dict.clear()
        sort_dict[game] = sim_list
    lists[list_counter] = sorted(sort_dict.items(), key=lambda item: item[1][1], reverse=True)
    list_counter += 1
    final_sorted_list = []
    for i in range(list_counter):
        final_sorted_list+=lists[i]
    recommended_games = {k: v for k, v in final_sorted_list}
    counter = 0
    result_list = []
    for game, info in recommended_games.items():
        if counter < top_k:
            counter+=1
            result_list.append(df.iloc[game]["appid"])
    return result_list

In [16]:
recommended_games = find_similar_games(cosine_sim, user_likes, 10)
for game in recommended_games:
    for i in range(df.shape[0]):
        if df.iloc[i]["appid"] == game:
            print(df.iloc[i]["name"])
            break

Call of Duty United Offensive
Medal of Honor Airborne
Delta Force Xtreme 2
Delta Force Task Force Dagger
Delta Force Xtreme
Painkiller Resurrection
Enemy Front
Brothers in Arms Road to Hill 30
Brothers in Arms Hell's Highway
Brothers in Arms Earned in Blood


### 2nd approach

We will add the attributes publisher and developer and will do the same process .

In [17]:
def combined_features_two(row):
    return row['genres'].replace(";"," ").lower()+" "+row['categories'].replace(";"," ").lower()+" "+row['steamspy_tags'].replace(";"," ").lower()+" "+row['developer'].replace(";"," ").lower()+" "+str(row['publisher']).replace(";"," ").lower()
df['cossim'] = df.apply(combined_features_two, axis =1)

In [18]:
cv = CountVectorizer()
count_matrix = cv.fit_transform(df['cossim'])
print("Count Matrix:", count_matrix.toarray())

Count Matrix: [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [19]:
cosine_sim = cosine_similarity(count_matrix)
print(cosine_sim)

[[1.         1.         0.82495791 ... 0.32780503 0.14433757 0.35721725]
 [1.         1.         0.82495791 ... 0.32780503 0.14433757 0.35721725]
 [0.82495791 0.82495791 1.         ... 0.19867985 0.06804138 0.21650635]
 ...
 [0.32780503 0.32780503 0.19867985 ... 1.         0.54073807 0.57353933]
 [0.14433757 0.14433757 0.06804138 ... 0.54073807 1.         0.58925565]
 [0.35721725 0.35721725 0.21650635 ... 0.57353933 0.58925565 1.        ]]


In [20]:
recommended_games = find_similar_games(cosine_sim, user_likes, 10)
for game in recommended_games:
    for i in range(df.shape[0]):
        if df.iloc[i]["appid"] == game:
            print(df.iloc[i]["name"])
            break

Call of Duty 4 Modern Warfare
Call of Duty United Offensive
TimeShift
Medal of Honor Airborne
Call of Duty Modern Warfare 2
Singularity
Painkiller Resurrection
Painkiller Overdose
Project Snowblind
Call of Duty World at War


### 3rd approach

We will add the keywords from the descriptions of the games

In [21]:
def combined_features_three(row):
    return row['genres'].replace(";"," ").lower()+" "+row['categories'].replace(";"," ").lower()+" "+row['steamspy_tags'].replace(";"," ").lower()+" "+row['developer'].replace(";"," ").lower()+" "+str(row['publisher']).replace(";"," ").lower()+" "+' '.join(map(str, row['desc_key_new'])).lower()
df['cossim'] = df.apply(combined_features_three, axis =1)

In [22]:
cv = CountVectorizer()
count_matrix = cv.fit_transform(df['cossim'])
print("Count Matrix:", count_matrix.toarray())

Count Matrix: [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [23]:
cosine_sim = cosine_similarity(count_matrix)
print(cosine_sim)

[[1.         1.         0.82495791 ... 0.32780503 0.14433757 0.35721725]
 [1.         1.         0.82495791 ... 0.32780503 0.14433757 0.35721725]
 [0.82495791 0.82495791 1.         ... 0.19867985 0.06804138 0.21650635]
 ...
 [0.32780503 0.32780503 0.19867985 ... 1.         0.54073807 0.57353933]
 [0.14433757 0.14433757 0.06804138 ... 0.54073807 1.         0.58925565]
 [0.35721725 0.35721725 0.21650635 ... 0.57353933 0.58925565 1.        ]]


In [24]:
recommended_games = find_similar_games(cosine_sim, user_likes, 10)
for game in recommended_games:
    for i in range(df.shape[0]):
        if df.iloc[i]["appid"] == game:
            print(df.iloc[i]["name"])
            break

Call of Duty 4 Modern Warfare
Call of Duty United Offensive
TimeShift
Medal of Honor Airborne
Call of Duty Modern Warfare 2
Singularity
Painkiller Resurrection
Painkiller Overdose
Project Snowblind
Call of Duty World at War


### 4th approach

We will use the descriptions of every game (after cleaning) instead of their keywords.

In [25]:
def combined_features_four(row):
    return row['genres'].replace(";"," ").lower()+" "+row['categories'].replace(";"," ").lower()+" "+row['steamspy_tags'].replace(";"," ").lower()+" "+row['developer'].replace(";"," ").lower()+" "+str(row['publisher']).replace(";"," ").lower()+" "+row['new_description'].lower()
df['cossim'] = df.apply(combined_features_four, axis =1)

In [26]:
cv = CountVectorizer()
count_matrix = cv.fit_transform(df['cossim'])
print("Count Matrix:", count_matrix.toarray())

Count Matrix: [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [27]:
cosine_sim = cosine_similarity(count_matrix)
print(cosine_sim)

[[1.         0.50653623 0.32134229 ... 0.16062314 0.11470787 0.21526419]
 [0.50653623 1.         0.30050125 ... 0.1585505  0.09058216 0.21248647]
 [0.32134229 0.30050125 1.         ... 0.06193801 0.02948839 0.09223132]
 ...
 [0.16062314 0.1585505  0.06193801 ... 1.         0.28005602 0.29197858]
 [0.11470787 0.09058216 0.02948839 ... 0.28005602 1.         0.41702883]
 [0.21526419 0.21248647 0.09223132 ... 0.29197858 0.41702883 1.        ]]


In [28]:
recommended_games = find_similar_games(cosine_sim, user_likes, 10)
for game in recommended_games:
    for i in range(df.shape[0]):
        if df.iloc[i]["appid"] == game:
            print(df.iloc[i]["name"])
            break

Call of Duty 4 Modern Warfare
Call of Duty United Offensive
Call of Duty World at War
Call of Duty Black Ops
Call of Duty Modern Warfare 2
Call of Duty WWII
Call of Duty Advanced Warfare - Gold Edition
Project Snowblind
Call of Duty Black Ops - Mac Edition
Call of Duty Black Ops III


The last two approaches has the best results. We put as input 2 call of duty games and we took as a result a list containing call of duty games at the first places of the list.

## Top rated

We will use the ratings, which we calculated before with the Bayesian rating technique.

In [48]:
def TopRated(df):    
    all_games = {}
    top_rated = []
    for i in range(df.shape[0]):
        all_games[i] = df.iloc[i]["rating"]
    top_rated = sorted(all_games.items(), key=lambda item: item[1], reverse=True)
    return top_rated

In [49]:
top_limit = 10
counter = 0
top_rated = TopRated(df_details)
for game, rate in top_rated:
    if counter < top_limit:
        if user_games.count(df_details.iloc[game]["name"])>0:
            continue
        counter+=1
        print(df_details.iloc[game]["name"],"===>",rate)

Portal 2 ===> 9.840933043545515
Factorio ===> 9.783406771972498
The Witcher 3 Wild Hunt ===> 9.753293880115061
Portal ===> 9.73441810716475
Counter-Strike ===> 9.713712651015893
The Binding of Isaac Rebirth ===> 9.710533564390223
RimWorld ===> 9.701293015664707
Terraria ===> 9.691867457316876
Mount & Blade Warband ===> 9.675976490175762
Hotline Miami ===> 9.675027796478476
