In [25]:
import pandas as pd 
import numpy as np 
df=pd.read_csv('Video_Games_Sales_as_at_22_Dec_2016.csv')


In [26]:
df.head(5)

Unnamed: 0,Game_id,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,Developer,Rating_Score,Rating_Count
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,28.96,3.77,8.45,82.53,76.0,51.0,Nintendo,8.0,322.0
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,,,,3.4,
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,82.0,73.0,Nintendo,8.3,709.0
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77,80.0,73.0,Nintendo,8.0,192.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37,,,,3.4,


In [27]:
## **Demographic Filtering** -
   #Đầu tiên ta cần xếp hạng các game
 #tính score mỗi game
 #đưa ra bảng các game tốt nhất

# Công thức:
  #(v/(v+m) * R) + (m/(m+v) * C)
# v số lượt rate
# m số lượt rate tối thiểu để có trên bảng xếp hạng
# R xếp hạng trung bình của game
# C rating score

#chúng ta có v và R cần tính c
C= df['Rating_Score'].mean()
C

6.136104915006595

In [28]:
#để có thể có trên bảng xếp hạng game đó phải có lượt rate >90%
m= df['Rating_Count'].quantile(0.9)
m

315.10000000000036

In [29]:
#lọc các game đủ diều kiện cho rank
game_rank = df.copy().loc[df['Rating_Count'] >= m]
game_rank.shape

(759, 16)

In [30]:
def weighted_rating(x, m=m, C=C):
    v = x['Rating_Count']
    R = x['Rating_Score']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [31]:
#tạo cột score
game_rank['score'] = game_rank.apply(weighted_rating, axis=1)

In [32]:
#Sort
game_rank = game_rank.sort_values('score', ascending=False)

#Print the top 15 rank
game_rank[['Name', 'Rating_Count', 'Rating_Score', 'score']].head(10)

Unnamed: 0,Name,Rating_Count,Rating_Score,score
2851,The Witcher 3: Wild Hunt,10665.0,9.3,9.209205
303,The Witcher 3: Wild Hunt,10179.0,9.2,9.108002
665,Half-Life 2,8665.0,9.1,8.996001
149,The Last of Us,8003.0,9.1,8.987724
1499,The Witcher 3: Wild Hunt,3963.0,9.2,8.974331
15293,Left 4 Dead,3717.0,9.2,8.960563
10106,Cory in the House,1273.0,9.5,8.832559
284,Half-Life,3161.0,9.1,8.83133
11288,Counter-Strike: Source,9851.0,8.9,8.814333
9143,The Orange Box,1495.0,9.3,8.749233


In [33]:
#Construct a reverse map of indices and game name
indices = pd.Series(df.index, index=df['Name']).drop_duplicates()


In [34]:
indices

Name
Wii Sports                           0
Super Mario Bros.                    1
Mario Kart Wii                       2
Wii Sports Resort                    3
Pokemon Red/Pokemon Blue             4
                                 ...  
Samurai Warriors: Sanada Maru    16714
LMA Manager 2007                 16715
Haitaka no Psychedelica          16716
Spirits & Spells                 16717
Winning Post 8 2016              16718
Length: 16719, dtype: int64

In [35]:
#hàm đưa ra game đề xuất
def get_recommendations(name, cosine_sim):
    idx = indices[name] # Get the index of the game
    sim_scores = list(enumerate(cosine_sim[idx])) # lấy điểm tương đồng giữa các game
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)# sắp xếp các game bằng điểm tương đồng
    sim_scores = sim_scores[1:11]#lấy score của 10 game tương đồng
    movie_indices = [i[0] for i in sim_scores]# lấy indices của game
    # trả về tên 
    return df['Name'].iloc[movie_indices]

In [36]:
#recomend dựa trên 'Genre', 'Publisher', 'Platform', 'Developer'
df[['Genre', 'Publisher', 'Platform', 'Developer']].head(3)

Unnamed: 0,Genre,Publisher,Platform,Developer
0,Sports,Nintendo,Wii,Nintendo
1,Platform,Nintendo,NES,
2,Racing,Nintendo,Wii,Nintendo


In [37]:

def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [38]:
# làm sạch data
features = ['Genre', 'Publisher', 'Platform', 'Developer']

for feature in features:
    df[feature] = df[feature].apply(clean_data)
  

In [39]:
df[['Genre', 'Publisher', 'Platform', 'Developer']].head(3)

Unnamed: 0,Genre,Publisher,Platform,Developer
0,sports,nintendo,wii,nintendo
1,platform,nintendo,nes,
2,racing,nintendo,wii,nintendo


In [40]:
#tạo 1 chuỗi chứa dữ liệu của 'Genre', 'Publisher', 'Platform', 'Developer'
def create_soup(x):
    return ' '.join(x['Genre']) + ' ' + ' '.join(x['Publisher']) + ' ' + x['Platform'] + ' ' + ' '.join(x['Developer'])
df['soup'] = df.apply(create_soup, axis=1)

In [41]:
# Import CountVectorizer and tạo count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['soup'])

In [42]:
# tính Cosine Similarity matrix 
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [43]:
df = df.reset_index()
indices = pd.Series(df.index, index=df['Name'])

In [47]:
a=get_recommendations('Wii Sports Resort', cosine_sim)

In [48]:
a

2                Mario Kart Wii
3             Wii Sports Resort
7                      Wii Play
8     New Super Mario Bros. Wii
13                      Wii Fit
15                 Wii Fit Plus
39      Super Smash Bros. Brawl
49           Super Mario Galaxy
61                 Just Dance 3
68                 Just Dance 2
Name: Name, dtype: object

In [49]:
b=get_recommendations('LMA Manager 2007', cosine_sim)


In [50]:
b

23                Grand Theft Auto V
29    Call of Duty: Modern Warfare 3
32           Call of Duty: Black Ops
35        Call of Duty: Black Ops II
36    Call of Duty: Modern Warfare 2
44                            Halo 3
51               Grand Theft Auto IV
60              Call of Duty: Ghosts
64                       Halo: Reach
66                            Halo 4
Name: Name, dtype: object