# Content-based filtering with cosine similarity and TFIDF

In [1]:
import pandas as pd 
import numpy as np
#cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df_details=pd.read_csv('games_details_dataset.csv')

In [3]:
user_games = [2620, 2630] #example game ids

In [4]:
for i in range(df_details.shape[0]):
    if df_details.iloc[i]["appid"] in user_games:
        print(df_details.iloc[i]["name"])

Call of Duty
Call of Duty 2


In [5]:
def specific_genre(chosen_genre):
    remove_rows = []
    for i in range(df_details.shape[0]):
        if df_details.iloc[i]["genre_new"] != chosen_genre:
            remove_rows.append(i)
    df = df_details.drop(remove_rows, axis=0)
    user_likes = []
    for i in range(df.shape[0]):
        if user_games.count(df.iloc[i]["appid"])>0:
            user_likes.append(i)
    return df, user_likes

In [6]:
def specific_category(chosen_category):
    remove_rows = []
    for i in range(df_details.shape[0]):
        if df_details.iloc[i]["cat_new"] != chosen_category:
            remove_rows.append(i)
    df = df_details.drop(remove_rows, axis=0)
    user_likes = []
    for i in range(df.shape[0]):
        if user_games.count(df.iloc[i]["appid"])>0:
            user_likes.append(i)
    return df, user_likes

In [7]:
def positive_votes():
    remove_rows = []
    for i in range(df_details.shape[0]):
        if df_details.iloc[i]["rating_sign"] != "positive":
            remove_rows.append(i)
    df = df_details.drop(remove_rows, axis=0)
    user_likes = []
    for i in range(df.shape[0]):
        if user_games.count(df.iloc[i]["appid"])>0:
            user_likes.append(i)
    return df, user_likes

In [8]:
def more_playtime():
    remove_rows = []
    for i in range(df_details.shape[0]):
        if df_details.iloc[i]["playtime"] != "more":
            remove_rows.append(i)
    df = df_details.drop(remove_rows, axis=0)
    user_likes = []
    for i in range(df.shape[0]):
        if user_games.count(df.iloc[i]["appid"])>0:
            user_likes.append(i)
    return df, user_likes

In [9]:
df, user_likes = specific_genre("action")
#df, user_likes = specific_category("multiplayer")
#df, user_likes = positive_votes()
#df, user_likes = more_playtime()

In [10]:
for i in range(df.shape[0]):
    if i in user_likes:
        print(df.iloc[i]["name"])

Call of Duty
Call of Duty 2


## Cosine similarity

### 1st approach

We are using only the attributes: genre, categories and steam spy tags for cosine similarity.

In [11]:
def combined_features_one(row):
    return row['genres'].replace(";"," ").lower()+" "+row['categories'].replace(";"," ").lower()+" "+row['steamspy_tags'].replace(";"," ").lower()
df['cossim'] = df.apply(combined_features_one, axis =1)

In [12]:
cv = TfidfVectorizer()
count_matrix = cv.fit_transform(df['cossim'])
print("Count Matrix:", count_matrix.toarray())

Count Matrix: [[0.        0.        0.        ... 0.        0.        0.       ]
 [0.        0.        0.        ... 0.        0.        0.       ]
 [0.        0.        0.        ... 0.        0.4372291 0.       ]
 ...
 [0.        0.        0.        ... 0.        0.        0.       ]
 [0.        0.        0.        ... 0.        0.        0.       ]
 [0.        0.        0.        ... 0.        0.        0.       ]]


In [13]:
cosine_sim = cosine_similarity(count_matrix)
print(cosine_sim)

[[1.         1.         0.80746968 ... 0.11857947 0.02934339 0.16650666]
 [1.         1.         0.80746968 ... 0.11857947 0.02934339 0.16650666]
 [0.80746968 0.80746968 1.         ... 0.05647515 0.0101408  0.07930116]
 ...
 [0.11857947 0.11857947 0.05647515 ... 1.         0.76354708 0.40571767]
 [0.02934339 0.02934339 0.0101408  ... 0.76354708 1.         0.52076361]
 [0.16650666 0.16650666 0.07930116 ... 0.40571767 0.52076361 1.        ]]


In [14]:
len(cosine_sim[0]) #width

10140

In [15]:
len(cosine_sim) #height

10140

In [16]:
def find_similar_games(cosine_sim, user_likes, top_k):
    recommended_games = {}
    lower_limit_similarity = 0.95
    min_limit= 0.05
    # repeat until we find enough recommendations
    while len(recommended_games)<20:
        lower_limit_similarity-=min_limit
        recommended_games = {}
        for game in user_likes:
            for i in range(len(cosine_sim)):
                if game != i:
                    if cosine_sim[game][i] > lower_limit_similarity:
                        if not (i in user_likes):
                            if i in recommended_games.keys():
                                recommended_games[i][0] +=1
                                if cosine_sim[game][i] > recommended_games[i][1]:
                                    recommended_games[i][1] = cosine_sim[game][i]
                            else:
                                recommended_games[i] = [1, cosine_sim[game][i]]
    recommended_games = {k: v for k, v in sorted(recommended_games.items(), key=lambda item: item[1][0], reverse=True)}
    sort_dict = {}
    lists = []
    list_counter = 0
    frequency = len(cosine_sim) #infinity
    for game, sim_list in recommended_games.items():
        #first list of frequencies
        if frequency == len(cosine_sim):
            frequency = sim_list[0]
            for i in range(frequency):
                lists.append([])
        #sort the list
        if frequency > sim_list[0]:
            frequency = sim_list[0]
            lists[list_counter] = sorted(sort_dict.items(), key=lambda item: item[1][1], reverse=True)
            list_counter += 1
            sort_dict.clear()
        sort_dict[game] = sim_list
    lists[list_counter] = sorted(sort_dict.items(), key=lambda item: item[1][1], reverse=True)
    list_counter += 1
    final_sorted_list = []
    for i in range(list_counter):
        final_sorted_list+=lists[i]
    recommended_games = {k: v for k, v in final_sorted_list}
    counter = 0
    result_list = []
    for game, info in recommended_games.items():
        if counter < top_k:
            counter+=1
            result_list.append(df.iloc[game]["appid"])
    return result_list

In [19]:
recommended_games = find_similar_games(cosine_sim, user_likes, 10)
for game in recommended_games:
    for i in range(df.shape[0]):
        if df.iloc[i]["appid"] == game:
            print(df.iloc[i]["name"])
            break

Call of Duty United Offensive
Medal of Honor Airborne
Brothers in Arms Road to Hill 30
Brothers in Arms Hell's Highway
Brothers in Arms Earned in Blood
Ubersoldier II
World War II GI
Enemy Front
Wolfenstein The Old Blood
Delta Force Xtreme 2


### 2nd approach

We will add the attributes publisher and developer and will do the same process .

In [20]:
def combined_features_two(row):
    return row['genres'].replace(";"," ").lower()+" "+row['categories'].replace(";"," ").lower()+" "+row['steamspy_tags'].replace(";"," ").lower()+" "+row['developer'].replace(";"," ").lower()+" "+str(row['publisher']).replace(";"," ").lower()
df['cossim'] = df.apply(combined_features_two, axis =1)

In [21]:
cv = TfidfVectorizer()
count_matrix = cv.fit_transform(df['cossim'])
print("Count Matrix:", count_matrix.toarray())

Count Matrix: [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [22]:
cosine_sim = cosine_similarity(count_matrix)
print(cosine_sim)

[[1.         1.         0.91155011 ... 0.03386993 0.00805951 0.047133  ]
 [1.         1.         0.91155011 ... 0.03386993 0.00805951 0.047133  ]
 [0.91155011 0.91155011 1.         ... 0.01581875 0.00273137 0.02201318]
 ...
 [0.03386993 0.03386993 0.01581875 ... 1.         0.12809907 0.07015027]
 [0.00805951 0.00805951 0.00273137 ... 0.12809907 1.         0.08658434]
 [0.047133   0.047133   0.02201318 ... 0.07015027 0.08658434 1.        ]]


In [23]:
recommended_games = find_similar_games(cosine_sim, user_likes, 10)
for game in recommended_games:
    for i in range(df.shape[0]):
        if df.iloc[i]["appid"] == game:
            print(df.iloc[i]["name"])
            break

Call of Duty 4 Modern Warfare
Call of Duty Modern Warfare 2
Call of Duty Black Ops III
Homeworld Remastered Collection
Sid Meier's Civilization Beyond Earth
Sid Meier's Civilization V
BioShock Infinite
Call of Duty Infinite Warfare
STAR WARS Jedi Knight II - Jedi Outcast
Call of Duty Ghosts


### 3rd approach

We will add the keywords from the descriptions of the games

In [24]:
def combined_features_three(row):
    return row['genres'].replace(";"," ").lower()+" "+row['categories'].replace(";"," ").lower()+" "+row['steamspy_tags'].replace(";"," ").lower()+" "+row['developer'].replace(";"," ").lower()+" "+str(row['publisher']).replace(";"," ").lower()+" "+' '.join(map(str, row['desc_key_new'])).lower()
df['cossim'] = df.apply(combined_features_three, axis =1)

In [25]:
cv = TfidfVectorizer()
count_matrix = cv.fit_transform(df['cossim'])
print("Count Matrix:", count_matrix.toarray())

Count Matrix: [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [26]:
cosine_sim = cosine_similarity(count_matrix)
print(cosine_sim)

[[1.         1.         0.91155011 ... 0.03386993 0.00805951 0.047133  ]
 [1.         1.         0.91155011 ... 0.03386993 0.00805951 0.047133  ]
 [0.91155011 0.91155011 1.         ... 0.01581875 0.00273137 0.02201318]
 ...
 [0.03386993 0.03386993 0.01581875 ... 1.         0.12809907 0.07015027]
 [0.00805951 0.00805951 0.00273137 ... 0.12809907 1.         0.08658434]
 [0.047133   0.047133   0.02201318 ... 0.07015027 0.08658434 1.        ]]


In [27]:
recommended_games = find_similar_games(cosine_sim, user_likes, 10)
for game in recommended_games:
    for i in range(df.shape[0]):
        if df.iloc[i]["appid"] == game:
            print(df.iloc[i]["name"])
            break

Call of Duty 4 Modern Warfare
Call of Duty Modern Warfare 2
Call of Duty Black Ops III
Homeworld Remastered Collection
Sid Meier's Civilization Beyond Earth
Sid Meier's Civilization V
BioShock Infinite
Call of Duty Infinite Warfare
STAR WARS Jedi Knight II - Jedi Outcast
Call of Duty Ghosts


### 4th approach

We will use the descriptions of every game (after cleaning) instead of their keywords.

In [28]:
def combined_features_four(row):
    return row['genres'].replace(";"," ").lower()+" "+row['categories'].replace(";"," ").lower()+" "+row['steamspy_tags'].replace(";"," ").lower()+" "+row['developer'].replace(";"," ").lower()+" "+str(row['publisher']).replace(";"," ").lower()+" "+row['new_description'].lower()
df['cossim'] = df.apply(combined_features_four, axis =1)

In [29]:
cv = TfidfVectorizer()
count_matrix = cv.fit_transform(df['cossim'])
print("Count Matrix:", count_matrix.toarray())

Count Matrix: [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [30]:
cosine_sim = cosine_similarity(count_matrix)
print(cosine_sim)

[[1.00000000e+00 3.01774499e-01 2.12337643e-01 ... 2.64396496e-02
  9.44829028e-03 1.87010001e-02]
 [3.01774499e-01 1.00000000e+00 2.12126697e-01 ... 1.95972828e-02
  6.44721905e-03 2.02526286e-02]
 [2.12337643e-01 2.12126697e-01 1.00000000e+00 ... 3.51463346e-03
  8.90117760e-04 6.77150100e-03]
 ...
 [2.64396496e-02 1.95972828e-02 3.51463346e-03 ... 1.00000000e+00
  5.00852952e-02 2.59888151e-02]
 [9.44829028e-03 6.44721905e-03 8.90117760e-04 ... 5.00852952e-02
  1.00000000e+00 4.60722771e-02]
 [1.87010001e-02 2.02526286e-02 6.77150100e-03 ... 2.59888151e-02
  4.60722771e-02 1.00000000e+00]]


In [31]:
recommended_games = find_similar_games(cosine_sim, user_likes, 10)
for game in recommended_games:
    for i in range(df.shape[0]):
        if df.iloc[i]["appid"] == game:
            print(df.iloc[i]["name"])
            break

Call of Duty 4 Modern Warfare
Call of Duty World at War
Call of Duty WWII
Call of Duty United Offensive
Call of Duty Black Ops
Call of Duty Ghosts
Call of Duty Black Ops - Mac Edition
Call of Duty Black Ops III
Call of Duty Modern Warfare 2
Sid Meier's Civilization V


The last two approaches has the best results. We put as input 2 call of duty games and we took as a result a list containing call of duty games at the first places of the list.

## Top rated

We will use the ratings, which we calculated before with the Bayesian rating technique.

In [48]:
def TopRated(df):    
    all_games = {}
    top_rated = []
    for i in range(df.shape[0]):
        all_games[i] = df.iloc[i]["rating"]
    top_rated = sorted(all_games.items(), key=lambda item: item[1], reverse=True)
    return top_rated

In [49]:
top_limit = 10
counter = 0
top_rated = TopRated(df_details)
for game, rate in top_rated:
    if counter < top_limit:
        if user_games.count(df_details.iloc[game]["name"])>0:
            continue
        counter+=1
        print(df_details.iloc[game]["name"],"===>",rate)

Portal 2 ===> 9.840933043545515
Factorio ===> 9.783406771972498
The Witcher 3 Wild Hunt ===> 9.753293880115061
Portal ===> 9.73441810716475
Counter-Strike ===> 9.713712651015893
The Binding of Isaac Rebirth ===> 9.710533564390223
RimWorld ===> 9.701293015664707
Terraria ===> 9.691867457316876
Mount & Blade Warband ===> 9.675976490175762
Hotline Miami ===> 9.675027796478476
