In [1]:
#https://medium.com/@prateekgaurav/step-by-step-content-based-recommendation-system-823bbfd0541c
import pandas as pd
import numpy as np
from gensim.models import Word2Vec, KeyedVectors
from gensim.utils import simple_preprocess
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader as api

In [2]:
# Import the games dataset
df = pd.read_csv('games.csv')

# Combine movie name and tags into a single string
df['tags'] = df['Developers'] + ' ' + df['Publishers'] + ' ' + df['Categories'] + ' ' + df['Genres'] + ' ' + df['Tags'] + ' ' + str(df['Price'])
df['tags'] = df['tags'].fillna('')

# Tokenize content for Word2Vec
df['tokens'] = df['tags'].apply(simple_preprocess)

In [3]:
# Load the pre-trained Word2Vec model (Google News)
model = api.load('word2vec-google-news-300')

In [4]:
# Function to average word vectors for a text
def average_word_vectors(words, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,), dtype="float64")
    nwords = 0.
    
    for word in words:
        if word in vocabulary: 
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model[word])
    
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
        
    return feature_vector

In [5]:
# Function to compute average word vectors for all movies
def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.index_to_key)
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features) for tokenized_sentence in corpus]
    return np.array(features)



In [7]:
'''
# Compute average word vectors for all movies
w2v_feature_array = averaged_word_vectorizer(corpus=df['tokens'], model=model, num_features=300)

# Get the user input
user_game = input("Enter a game title: ")

# Find the index of the user movie
game_index = df[df['Name'] == user_game].index[0]

# Compute the cosine similarities between the user game and all other games
user_game_vector = w2v_feature_array[game_index].reshape(1, -1)
similarity_scores = cosine_similarity(user_game_vector, w2v_feature_array)

# Get the top 10 most similar games
similar_games = list(enumerate(similarity_scores[0]))
sorted_similar_games = sorted(similar_games, key=lambda x: x[1], reverse=True)[1:20]

# Print the top 10 similar movies
for i, score in sorted_similar_games:
    print("{}: {}".format(i, df.loc[i, 'Name']))
'''

Enter a game title:  QUAKE


54: DOOM II
52: Ultimate Doom
884: Serious Sam HD: The Second Encounter
883: Serious Sam HD: The First Encounter
887: Serious Sam 3: BFE
16838: Jump Gunners
1385: Serious Sam 2
2705: Serious Sam's Bogus Detour
37229: MIBT
40301: Samurai Gunn 2
3058: 8-Bit Commando
13796: Blazing Chrome
14621: Disputed Space
10878: HYPERCHARGE: Unboxed
20930: Ninjin: Clash of Carrots
4286: Phantom Breaker: Battle Grounds
10941: Guns, Gore and Cannoli 2
7356: Modbox
35689: COSMOS


In [24]:
def get_recommendations(df, wfa, games):
    input_games = [game.strip() for game in games.split('@@@')]

    recs = set()
    g_set = set()
    
    for g in input_games:
        # Find the index of the user movie
        try:
            game_index = df[df['Name'] == g].index[0]

            g_set.add(g)
            
            user_game_vector = wfa[game_index].reshape(1, -1)
            similarity_scores = cosine_similarity(user_game_vector, wfa)
            
            # Get the top 6 most similar games
            similar_games = list(enumerate(similarity_scores[0]))
            sorted_similar_games = sorted(similar_games, key=lambda x: x[1], reverse=True)[1:12]
            # Print the top 10 similar movies
            for i, score in sorted_similar_games:
                recs.add(df['Name'].iloc[g])
        except:
            pass
    return recs, g_set

In [25]:
def evaluate(recs, user_games, num_games):
    tp = 0
    fp = 0
    for rec in recs:
        if rec in user_games:
            tp += 1
        else:
            fp += 1
    fn = 0
    for ug in user_games:
        if ug not in recs:
            fn += 1
    tn = num_games - tp - fp - fn
    p = tp/(tp+fp)
    r = tp/(tp+fn)
    try:
        f = (2 * p * r)/(p + r)
    except:
        f = 0
    a = (tp + tn) / (tp + fn + tn + fp)
    return p, r, f, a

In [22]:
# Compute average word vectors for all movies
w2v_feature_array = averaged_word_vectorizer(corpus=df['tokens'], model=model, num_features=300)
udf = pd.read_csv("user_games.csv")
udf.columns = ["Id", "Games"]

e_lst = []

for i, row in udf.iterrows():
    r = get_recommendations(df, w2v_feature_array, row['Games'])
    if len(r[0]) != 0:
        e = evaluate(r[0], r[1], len(df))
        e_lst.append(e)

print(e_lst)

[(0.035398230088495575, 0.3076923076923077, 0.06349206349206349, 0.998220586904726), (0.0, 0.0, 0, 0.9980999487287752), (0.02465166130760986, 0.23711340206185566, 0.04466019417475728, 0.985161504358054), (0.0, 0.0, 0, 0.9998190427360738), (0.016129032258064516, 0.16666666666666666, 0.02941176470588235, 0.999004735048406), (0.007936507936507936, 0.07692307692307693, 0.014388489208633093, 0.9979340712368429), (0.0, 0.0, 0, 0.9992761709442953), (0.07407407407407407, 0.6666666666666666, 0.13333333333333333, 0.99960792592816), (0.0, 0.0, 0, 0.9990650541363815), (0.02040816326530612, 0.2, 0.03703703703703703, 0.9929426667068794), (0.030623020063357972, 0.27884615384615385, 0.05518553758325404, 0.9850257864101095), (0.05405405405405406, 0.4444444444444444, 0.0963855421686747, 0.9988690171004614), (0.0, 0.0, 0, 0.9990952136803691), (0.046153846153846156, 0.42857142857142855, 0.08333333333333333, 0.999004735048406), (0.0, 0.0, 0, 0.9994722079802153), (0.0, 0.0, 0, 0.9998190427360738), (0.0, 0.0

In [23]:
p = 0
r = 0
f = 0
a = 0
for prfa in e_lst:
    p += prfa[0]
    r += prfa[1]
    f += prfa[2]
    a += prfa[3]
p = p/len(e_lst)
r = r/len(e_lst)
f = f/len(e_lst)
a = a/len(e_lst)

print("Avg precision: {}".format(p))
print("Avg recall: {}".format(r))
print("Avg f1: {}".format(f))
print("Avg accuracy: {}".format(a))

Avg precision: 0.024895063697270563
Avg recall: 0.22961900960193307
Avg f1: 0.044882614691594894
Avg accuracy: 0.9983197919022057
