In [1]:
from bson.son import SON
from pymongo import MongoClient
import re
import sys

import pandas as pd
import numpy as np

In [2]:
# Make the project's helper modules (e.g. `uniformer`) importable.
# Only prepend when the directory is not already first, so that re-running this
# cell does not pile up duplicate entries at the front of sys.path.
SCRIPTS_DIR = '../scripts'
if sys.path[:1] != [SCRIPTS_DIR]:
    sys.path.insert(0, SCRIPTS_DIR)

In [3]:
import uniformer

In [4]:
# Client for the MongoDB server at localhost:27017; shared by every query below.
client = MongoClient('localhost', 27017)

In [5]:
# Database holding the collected Twitch snapshots.
db = client['twitch_60']

In [6]:
# Collection with one document per collection timestamp.
games_coll = db['games']

## Top N games on twitch (collected between 28/05 and 30/05)

Sanity check: how many times was data collected from Twitch between 28 and 30 May? We gather all the timestamps in that window.

In [7]:
# Timestamps are stored as 'YYYY-MM-DD HH:MM:SS.ffffff' strings, so the window
# is expressed as a string comparison: strictly after '2019-05-28' and strictly
# before '2019-05-30'.
match_window = {"$match": {"$and": [
    {"timestamp": {"$gt": "2019-05-28"}},
    {"timestamp": {"$lt": "2019-05-30"}},
]}}

# Return only the timestamp, renamed to 'time'.
project_time = {"$project": {"_id": 0, "time": "$timestamp"}}

pipeline = [match_window, project_time]

In [8]:
# Materialise the cursor: one {'time': ...} dict per stored snapshot in the window.
tsmps = list(games_coll.aggregate(pipeline))

In [9]:
tsmps[:10]

[{'time': '2019-05-28 00:01:08.972371'},
 {'time': '2019-05-28 00:04:08.980665'},
 {'time': '2019-05-28 00:07:08.973388'},
 {'time': '2019-05-28 00:10:08.981706'},
 {'time': '2019-05-28 00:13:08.974376'},
 {'time': '2019-05-28 00:16:08.978220'},
 {'time': '2019-05-28 00:19:08.970934'},
 {'time': '2019-05-28 00:22:08.979262'},
 {'time': '2019-05-28 00:25:08.971894'},
 {'time': '2019-05-28 00:28:08.980090'}]

In [10]:
tsmps[-10:]

[{'time': '2019-05-29 23:31:08.967758'},
 {'time': '2019-05-29 23:34:08.980275'},
 {'time': '2019-05-29 23:37:08.973006'},
 {'time': '2019-05-29 23:40:08.981344'},
 {'time': '2019-05-29 23:43:08.974079'},
 {'time': '2019-05-29 23:46:08.982401'},
 {'time': '2019-05-29 23:49:08.966718'},
 {'time': '2019-05-29 23:52:08.975052'},
 {'time': '2019-05-29 23:55:08.967763'},
 {'time': '2019-05-29 23:58:08.976069'}]

In [11]:
# Snapshots per hour: the window spans 2 days * 24 hours.
len(tsmps) / (2 * 24)

20.0

Data has been collected every 3 minutes, so 20 times each hour is correct.

In [12]:
# Average viewers per game over the 28-29 May 2019 window.
# $match runs first so that only snapshots inside the window are unwound. It
# filters on the top-level 'timestamp' field, which $unwind leaves untouched,
# so the result is the same as matching after the unwind — with less work.
pipeline = [
    {"$match": {"$and": [
        {"timestamp": {"$gt": "2019-05-28"}},
        {"timestamp": {"$lt": "2019-05-30"}},
    ]}},
    # One document per entry of the snapshot's 'data' array.
    {"$unwind": "$data"},
    {"$group": {
        "_id": {"name": "$data.game.name",
                "norm_name": "$data.game.norm_name",
                "unary_name": "$data.game.unary_name",
                "twitch_id": "$data.game._id"},
        "avgViewers": {"$avg": "$data.viewers"},
    }},
    # Most-watched first; ties are ordered by the grouping key, descending.
    {"$sort": SON([("avgViewers", -1), ("_id", -1)])},
]

In [13]:
# Run the aggregation and keep the rows as a plain list of dicts.
top_twitch = list(games_coll.aggregate(pipeline))

In [14]:
top_twitch[0:3]

[{'_id': {'name': 'Fortnite',
   'norm_name': 'fortnite',
   'unary_name': 'fortnite',
   'twitch_id': 33214},
  'avgViewers': 126368.478125},
 {'_id': {'name': 'League of Legends',
   'norm_name': 'league of legends',
   'unary_name': 'league of legends',
   'twitch_id': 21779},
  'avgViewers': 117746.63333333333},
 {'_id': {'name': 'Grand Theft Auto V',
   'norm_name': 'grand theft auto v',
   'unary_name': 'grand theft auto v',
   'twitch_id': 32982},
  'avgViewers': 94222.24895833334}]

In [15]:
def _flatten_group_row(row):
    """Return one flat dict per aggregation row.

    The grouping key fields live under ``row['_id']`` in the raw aggregation
    output. A row that has already been flattened carries them at the top
    level instead, so the row itself is used as the key source in that case.
    This keeps the cell safe to re-run.
    """
    key = row.get('_id', row)
    return {
        'name': key['name'],
        'norm_name': key['norm_name'],
        'unary_name': key['unary_name'],
        'twitch_id': key['twitch_id'],
        'avgViewers': row['avgViewers'],
    }


top_twitch = [_flatten_group_row(row) for row in top_twitch]

In [16]:
top_twitch[0:3]

[{'name': 'Fortnite',
  'norm_name': 'fortnite',
  'unary_name': 'fortnite',
  'twitch_id': 33214,
  'avgViewers': 126368.478125},
 {'name': 'League of Legends',
  'norm_name': 'league of legends',
  'unary_name': 'league of legends',
  'twitch_id': 21779,
  'avgViewers': 117746.63333333333},
 {'name': 'Grand Theft Auto V',
  'norm_name': 'grand theft auto v',
  'unary_name': 'grand theft auto v',
  'twitch_id': 32982,
  'avgViewers': 94222.24895833334}]

In [17]:
# Tabular view of the flattened Twitch rows (one row per grouped game key).
top_twitch_df = pd.DataFrame(top_twitch)

# Top N games on Vgchartz from Mongo DB

It is necessary to execute the Notebook "Save Top N games on MongoDB" before executing the next code chunks

In [5]:
# Handles for the Vgchartz database and its 'top_games' collection.
db_vg = client['vgchartz']
top_games_vg = db_vg['top_games']

Quick check to see whether everything was loaded correctly.

In [6]:
# Fetch a single document to inspect the fields stored in the collection.
top_games = top_games_vg.find_one()
top_games
    

{'_id': ObjectId('5d00d5385024ccaf12d8137e'),
 'Rank': 1,
 'Name': 'Wii Sports',
 'Platform': 'Wii',
 'Year': 2006.0,
 'href_Genre': 'http://www.vgchartz.com/game/2667/wii-sports/?region=All',
 'Publisher': 'Nintendo  ',
 'Developer': 'Nintendo EAD  ',
 'Critic_Score': 7.7,
 'User_Score': nan,
 'NA_Sales': 41.36,
 'PAL_Sales': 29.02,
 'JP_Sales': 3.77,
 'Other_Sales': 8.51,
 'Global_Sales': 82.65}

Vgchartz returns the global sales data for each game and for each Platform. Therefore the game "Grand Theft Auto V" will appear several times. Let's find out if this is correct.

In [7]:
# Every document whose Name is exactly "Grand Theft Auto V".
gtaV_occurences = top_games_vg.find({"Name": "Grand Theft Auto V"})
for game in gtaV_occurences:
    # Plain print: each matching document is shown on its own line.
    print(game)

{'_id': ObjectId('5d00d5385024ccaf12d8138f'), 'Rank': 20, 'Name': 'Grand Theft Auto V', 'Platform': 'PS3', 'Year': 2013.0, 'href_Genre': 'http://www.vgchartz.com/game/65884/grand-theft-auto-v/?region=All', 'Publisher': 'Rockstar Games  ', 'Developer': 'Rockstar North  ', 'Critic_Score': 9.4, 'User_Score': nan, 'NA_Sales': 6.37, 'PAL_Sales': 9.85, 'JP_Sales': 0.99, 'Other_Sales': 3.12, 'Global_Sales': 20.32}
{'_id': ObjectId('5d00d5385024ccaf12d81390'), 'Rank': 21, 'Name': 'Grand Theft Auto V', 'Platform': 'PS4', 'Year': 2014.0, 'href_Genre': 'http://www.vgchartz.com/game/83196/grand-theft-auto-v/?region=All', 'Publisher': 'Rockstar Games  ', 'Developer': 'Rockstar North  ', 'Critic_Score': 9.7, 'User_Score': nan, 'NA_Sales': 6.06, 'PAL_Sales': 9.71, 'JP_Sales': 0.6, 'Other_Sales': 3.02, 'Global_Sales': 19.39}
{'_id': ObjectId('5d00d5385024ccaf12d8139c'), 'Rank': 34, 'Name': 'Grand Theft Auto V', 'Platform': 'X360', 'Year': 2013.0, 'href_Genre': 'http://www.vgchartz.com/game/65883/grand

As expected, there is one data entry for each platform. We are interested in the performance of the game as a whole, so we will perform the following actions:
- Group the games by name.
- Sum all the global sales.
- Average the critic score.

In [21]:
# Roll the per-platform Vgchartz rows up to one document per title.
vgchartz_pipeline = [
    # Missing scores are stored as NaN. Map them to a real null (Python None)
    # so that $avg leaves them out of the average; the previous value was the
    # *string* "null", which only worked because $avg also skips non-numeric
    # values.
    {"$addFields": {
        "Critic_Score_norm": {"$cond": {
            "if": {"$eq": ["$Critic_Score", np.nan]},
            "then": None,
            "else": "$Critic_Score"}},
        "User_Score_norm": {"$cond": {
            "if": {"$eq": ["$User_Score", np.nan]},
            "then": None,
            "else": "$User_Score"}},
    }},
    {"$group": {
        "_id": "$Name",
        "Name": {"$first": "$Name"},
        # Total sales across every platform the title was released on.
        "Global_sales_all_platform": {"$sum": "$Global_Sales"},
        # Year of the first document encountered for the title.
        "Year": {"$first": "$Year"},
        "Critic_score": {"$avg": "$Critic_Score_norm"},
        "User_score": {"$avg": "$User_Score_norm"},
        # One list entry per platform row.
        # NOTE(review): the single-operand $concat is kept from the original;
        # presumably it is equivalent to pushing the field directly — confirm.
        "Publishers": {"$push": {"$concat": ["$Publisher"]}},
        "Developers": {"$push": {"$concat": ["$Developer"]}},
        "Platforms": {"$push": {"$concat": ["$Platform"]}},
    }},
    # Best-selling titles first.
    {"$sort": {"Global_sales_all_platform": -1}},
]
top_vgchartz = list(top_games_vg.aggregate(vgchartz_pipeline))
top_vgchartz[0:10]

[{'_id': 'Wii Sports',
  'Name': 'Wii Sports',
  'Global_sales_all_platform': 82.65,
  'Year': 2006.0,
  'Critic_score': 7.7,
  'User_score': None,
  'Publishers': ['Nintendo  '],
  'Developers': ['Nintendo EAD  '],
  'Platforms': ['Wii']},
 {'_id': 'Grand Theft Auto V',
  'Name': 'Grand Theft Auto V',
  'Global_sales_all_platform': 65.62,
  'Year': 2013.0,
  'Critic_score': 9.366666666666667,
  'User_score': 9.0,
  'Publishers': ['Rockstar Games  ',
   'Rockstar Games  ',
   'Rockstar Games  ',
   'Rockstar Games  ',
   'Rockstar Games  '],
  'Developers': ['Rockstar North  ',
   'Rockstar North  ',
   'Rockstar North  ',
   'Rockstar North  ',
   'Rockstar North  '],
  'Platforms': ['PS3', 'PS4', 'X360', 'PC', 'XOne']},
 {'_id': 'Super Mario Bros.',
  'Name': 'Super Mario Bros.',
  'Global_sales_all_platform': 40.24,
  'Year': 1985.0,
  'Critic_score': 10.0,
  'User_score': 8.2,
  'Publishers': ['Nintendo  '],
  'Developers': ['Nintendo EAD  '],
  'Platforms': ['NES']},
 {'_id': 'Mar

In [23]:
# One row per title, built straight from the list of aggregation documents.
vgchartz_df = pd.DataFrame(top_vgchartz)
vgchartz_df.head()

Unnamed: 0,Critic_score,Developers,Global_sales_all_platform,Name,Platforms,Publishers,User_score,Year,_id
0,7.7,[Nintendo EAD ],82.65,Wii Sports,[Wii],[Nintendo ],,2006.0,Wii Sports
1,9.366667,"[Rockstar North , Rockstar North , Rockstar ...",65.62,Grand Theft Auto V,"[PS3, PS4, X360, PC, XOne]","[Rockstar Games , Rockstar Games , Rockstar ...",9.0,2013.0,Grand Theft Auto V
2,10.0,[Nintendo EAD ],40.24,Super Mario Bros.,[NES],[Nintendo ],8.2,1985.0,Super Mario Bros.
3,8.2,[Nintendo EAD ],35.98,Mario Kart Wii,[Wii],[Nintendo ],9.1,2008.0,Mario Kart Wii
4,,"[Bullet Proof Software , Nintendo ]",35.84,Tetris,"[GB, NES]","[Nintendo , Nintendo ]",,1989.0,Tetris


# Top N games on Vgchartz

In [9]:
# Load the sales table; the path is relative to the notebook's working directory.
vgsales = pd.read_csv("../dataset/vgsales.csv")

We want the top N games by sales. 
Vgchartz has a separate row for each platform a game was released on, but presumably we are interested in the top N games by sales across all platforms...

In [19]:
# e.g. a single title spread over several platform rows
vgsales.loc[vgsales['Name'].eq('Grand Theft Auto V')]

Unnamed: 0,Rank,Name,Platform,Year,href_Genre,Publisher,Developer,Critic_Score,User_Score,NA_Sales,PAL_Sales,JP_Sales,Other_Sales,Global_Sales
19,20,Grand Theft Auto V,PS3,2013.0,http://www.vgchartz.com/game/65884/grand-theft...,Rockstar Games,Rockstar North,9.4,,6.37,9.85,0.99,3.12,20.32
20,21,Grand Theft Auto V,PS4,2014.0,http://www.vgchartz.com/game/83196/grand-theft...,Rockstar Games,Rockstar North,9.7,,6.06,9.71,0.6,3.02,19.39
33,34,Grand Theft Auto V,X360,2013.0,http://www.vgchartz.com/game/65883/grand-theft...,Rockstar Games,Rockstar North,,,9.06,5.33,0.06,1.42,15.86
62,63,Grand Theft Auto V,PC,2015.0,http://www.vgchartz.com/game/83197/grand-theft...,Rockstar Games,Rockstar North,,,0.48,0.76,,0.1,1.33
107,108,Grand Theft Auto V,XOne,2014.0,http://www.vgchartz.com/game/73074/grand-theft...,Rockstar Games,Rockstar North,9.0,9.0,4.7,3.25,0.01,0.76,8.72


In [20]:
# Comma-separated list of the platforms each title appears on.
platform = vgsales.groupby('Name')['Platform'].agg(', '.join)

In [21]:
# Mean critic score per title (missing scores are skipped by the mean).
cr_score = vgsales.groupby('Name')['Critic_Score'].agg('mean')

In [22]:
# Global sales per title, summed over all of its platform rows.
sales = vgsales.groupby('Name')['Global_Sales'].agg('sum')

In [23]:
# Mean user score per title (missing scores are skipped by the mean).
u_score = vgsales.groupby('Name')['User_Score'].agg('mean')

In [24]:
# Comma-separated publishers, one entry per platform row of the title.
pub = vgsales.groupby('Name')['Publisher'].agg(', '.join)

In [25]:
# Comma-separated developers, one entry per platform row of the title.
dev = vgsales.groupby('Name')['Developer'].agg(', '.join)

In [26]:
# Assemble the per-title aggregates. Every Series comes from the same
# groupby('Name'), so they share one index and line up row by row.
per_title = {
    'Global_Sales': sales,
    'Platform': platform,
    'Critic_Score': cr_score,
    'User_Score': u_score,
    'Publisher': pub,
    'Developer': dev,
}
df = pd.DataFrame(per_title)
# Expose the title as the first regular column as well as the index.
df.insert(0, 'Name', df.index)

In [27]:
# Replace the title index with a plain 0..n-1 range; 'Name' stays as a column.
df = df.reset_index(drop=True)

In [28]:
df[df['Name'] == 'Grand Theft Auto V']

Unnamed: 0,Name,Global_Sales,Platform,Critic_Score,User_Score,Publisher,Developer
12801,Grand Theft Auto V,65.62,"PS3, PS4, X360, PC, XOne",9.366667,9.0,"Rockstar Games , Rockstar Games , Rockstar G...","Rockstar North , Rockstar North , Rockstar N..."


In [29]:
# Rank titles by total sales, best seller at row label 0.
# NOTE(review): the cells below address rows of this frame by label, so they
# depend on exactly this ordering, including the order among equal sales values.
by_sales = df.sort_values(by='Global_Sales', ascending=False)
top_vgchartz = by_sales.reset_index(drop=True)

In [30]:
top_vgchartz.loc[0:10]

Unnamed: 0,Name,Global_Sales,Platform,Critic_Score,User_Score,Publisher,Developer
0,Wii Sports,82.65,Wii,7.7,,Nintendo,Nintendo EAD
1,Grand Theft Auto V,65.62,"PS3, PS4, X360, PC, XOne",9.366667,9.0,"Rockstar Games , Rockstar Games , Rockstar G...","Rockstar North , Rockstar North , Rockstar N..."
2,Super Mario Bros.,40.24,"NES, VC, WiiU, 3DS",9.5,8.2,"Nintendo , Nintendo , Nintendo , Nintendo","Nintendo EAD , Nintendo EAD , Unknown , Unk..."
3,Mario Kart Wii,35.98,Wii,8.2,9.1,Nintendo,Nintendo EAD
4,Tetris,35.84,"GB, NES, WS, PSN",7.5,,"Nintendo , Nintendo , Vanguard , Electronic...","Bullet Proof Software , Nintendo , Vanguard ..."
5,Minecraft,33.83,"PC, X360, PS4, PS3, XOne, PSV, NS, WiiU, 3DS, ...",8.16,,"Mojang , Microsoft Studios , Sony Computer E...","Mojang AB , Mojang , Mojang , Mojang , Moj..."
6,Wii Sports Resort,32.9,Wii,8.0,8.8,Nintendo,Nintendo EAD
7,Pokémon Red / Green / Blue Version,31.37,GB,9.4,,Nintendo,Game Freak
8,Call of Duty: Black Ops,30.99,"X360, PS3, PC, Wii, DS, XBL, PSN",8.285714,,"Activision , Activision , Activision , Acti...","Treyarch , Treyarch , Treyarch , Treyarch ..."
9,Call of Duty: Modern Warfare 3,30.71,"X360, PS3, PC, Wii",7.5,,"Activision , Activision , Activision , Acti...","Infinity Ward , Infinity Ward , Infinity War..."


# From Vgchartz to Twitch 

In [31]:
# Normalised title, used below as the join key against Twitch's 'norm_name'.
top_vgchartz['Norm_name'] = top_vgchartz['Name'].apply(uniformer.uniform)

In [32]:
# Attach Twitch data to the best-selling Vgchartz titles.
# A left join keeps the rows in top_vgchartz order (best sellers first), so the
# label slice takes the leading rows of that ranking. An outer join does not
# guarantee this: its documented key order is lexicographic, and recent pandas
# versions enforce it, which would make the slice pick alphabetically-first rows.
merged_df = pd.merge(top_vgchartz, top_twitch_df, how='left',
                     left_on='Norm_name', right_on='norm_name').loc[0:50]

In [33]:
merged_df

Unnamed: 0,Name,Global_Sales,Platform,Critic_Score,User_Score,Publisher,Developer,Norm_name,avgViewers,name,norm_name,twitch_id,unary_name
0,Wii Sports,82.65,Wii,7.7,,Nintendo,Nintendo EAD,wii sports,4.881188,Wii Sports,wii sports,12232.0,wii sports
1,Grand Theft Auto V,65.62,"PS3, PS4, X360, PC, XOne",9.366667,9.0,"Rockstar Games , Rockstar Games , Rockstar G...","Rockstar North , Rockstar North , Rockstar N...",grand theft auto v,94222.248958,Grand Theft Auto V,grand theft auto v,32982.0,grand theft auto v
2,Super Mario Bros.,40.24,"NES, VC, WiiU, 3DS",9.5,8.2,"Nintendo , Nintendo , Nintendo , Nintendo","Nintendo EAD , Nintendo EAD , Unknown , Unk...",super mario bros.,49.296454,Super Mario Bros.,super mario bros.,509508.0,super mario bros.
3,Mario Kart Wii,35.98,Wii,8.2,9.1,Nintendo,Nintendo EAD,mario kart wii,93.326776,Mario Kart Wii,mario kart wii,18871.0,mario kart wii
4,Tetris,35.84,"GB, NES, WS, PSN",7.5,,"Nintendo , Nintendo , Vanguard , Electronic...","Bullet Proof Software , Nintendo , Vanguard ...",tetris,50.957424,Tetris,tetris,1736.0,tetris
5,Minecraft,33.83,"PC, X360, PS4, PS3, XOne, PSV, NS, WiiU, 3DS, ...",8.16,,"Mojang , Microsoft Studios , Sony Computer E...","Mojang AB , Mojang , Mojang , Mojang , Moj...",minecraft,12378.073958,Minecraft,minecraft,27471.0,minecraft
6,Wii Sports Resort,32.9,Wii,8.0,8.8,Nintendo,Nintendo EAD,wii sports resort,1.351852,Wii Sports Resort,wii sports resort,19259.0,wii sports resort
7,Pokémon Red / Green / Blue Version,31.37,GB,9.4,,Nintendo,Game Freak,pokémon red / green / blue version,,,,,
8,Call of Duty: Black Ops,30.99,"X360, PS3, PC, Wii, DS, XBL, PSN",8.285714,,"Activision , Activision , Activision , Acti...","Treyarch , Treyarch , Treyarch , Treyarch ...",call of duty black ops,394.394931,Call of Duty: Black Ops,call of duty black ops,23894.0,call of duty black ops
9,Call of Duty: Modern Warfare 3,30.71,"X360, PS3, PC, Wii",7.5,,"Activision , Activision , Activision , Acti...","Infinity Ward , Infinity Ward , Infinity War...",call of duty modern warfare iii,29.654709,Call of Duty: Modern Warfare 3,call of duty modern warfare iii,31551.0,call of duty modern warfare iii


Let's check which matches are missing:

In [34]:
merged_df[merged_df['name'].isna()]

Unnamed: 0,Name,Global_Sales,Platform,Critic_Score,User_Score,Publisher,Developer,Norm_name,avgViewers,name,norm_name,twitch_id,unary_name
7,Pokémon Red / Green / Blue Version,31.37,GB,9.4,,Nintendo,Game Freak,pokémon red / green / blue version,,,,,
17,Call of Duty: Modern Warfare 2,25.02,"X360, PS3, PC, XBL, PSN",9.333333,9.0,"Activision , Activision , Activision , Acti...","Infinity Ward , Infinity Ward , Infinity War...",call of duty modern warfare ii,,,,,
18,Nintendogs,24.68,DS,8.4,,Nintendo,Nintendo EAD,nintendogs,,,,,
21,Pokémon Gold / Silver Version,23.1,GB,9.2,,Nintendo,Game Freak,pokémon gold / silver version,,,,,
22,Wii Fit,22.7,Wii,7.9,,Nintendo,Nintendo EAD,wii fit,,,,,
24,Kinect Adventures!,22.1,X360,6.7,,Microsoft Game Studios,Good Science Studio,kinect adventures!,,,,,
25,Wii Fit Plus,21.81,Wii,8.0,,Nintendo,Nintendo EAD,wii fit plus,,,,,
29,Brain Age: Train Your Brain in Minutes a Day,20.16,DS,8.1,,Nintendo,Nintendo SDD,brain age train your brain in minutes a day,,,,,
33,Pokémon Diamond / Pearl Version,18.25,DS,8.6,,Nintendo,Game Freak,pokémon diamond / pearl version,,,,,
45,Pokémon Ruby / Sapphire Version,15.85,GBA,8.8,,Nintendo,Game Freak,pokémon ruby / sapphire version,,,,,


- nintendogs is not on twitch
- wii fit/ wii fit plus are not on twitch
- Kinect Adventures is not on twitch
- brain age/ brain age 2 are not on twitch

Solving the remaining discrepancies:

In [35]:
# Select the row by title rather than by its position in the sales ranking,
# which shifts whenever the data or the sort order changes.
is_target = top_vgchartz['Name'] == 'Pokémon Ruby / Sapphire Version'
assert is_target.sum() == 1, "expected exactly one row for this title"
top_vgchartz.loc[is_target, 'Norm_name'] = 'pokémon ruby/sapphire'

In [36]:
# Select the row by title rather than by its position in the sales ranking,
# which shifts whenever the data or the sort order changes.
is_target = top_vgchartz['Name'] == 'Pokémon Diamond / Pearl Version'
assert is_target.sum() == 1, "expected exactly one row for this title"
top_vgchartz.loc[is_target, 'Norm_name'] = 'pokémon diamond/pearl'

In [37]:
# Select the row by title rather than by its position in the sales ranking,
# which shifts whenever the data or the sort order changes.
is_target = top_vgchartz['Name'] == 'Pokémon Gold / Silver Version'
assert is_target.sum() == 1, "expected exactly one row for this title"
top_vgchartz.loc[is_target, 'Norm_name'] = 'pokémon gold/silver'

In [38]:
# Select the row by title rather than by its position in the sales ranking,
# which shifts whenever the data or the sort order changes.
is_target = top_vgchartz['Name'] == 'Pokémon Red / Green / Blue Version'
assert is_target.sum() == 1, "expected exactly one row for this title"
top_vgchartz.loc[is_target, 'Norm_name'] = 'pokémon red/blue'

In [39]:
# Select the row by title rather than by its position in the sales ranking,
# which shifts whenever the data or the sort order changes.
is_target = top_vgchartz['Name'] == 'Call of Duty: Modern Warfare 2'
assert is_target.sum() == 1, "expected exactly one row for this title"
top_vgchartz.loc[is_target, 'Norm_name'] = 'modern warfare ii'

In [40]:
# NOTE(review): row label 48 is a position in the sales-sorted frame, so this
# only hits the intended title while that ordering is unchanged. Presumably the
# row is a FIFA 13 entry — confirm, and consider selecting it by 'Name' instead.
top_vgchartz.at[48, 'Norm_name'] = 'fifa soccer xiii'

In [41]:
# Re-run the join with the corrected keys.
# A left join keeps the rows in top_vgchartz order (best sellers first), so the
# label slice takes the leading rows of that ranking. An outer join does not
# guarantee this: its documented key order is lexicographic, and recent pandas
# versions enforce it, which would make the slice pick alphabetically-first rows.
merged_df = pd.merge(top_vgchartz, top_twitch_df, how='left',
                     left_on='Norm_name', right_on='norm_name').loc[0:50]

In [42]:
merged_df

Unnamed: 0,Name,Global_Sales,Platform,Critic_Score,User_Score,Publisher,Developer,Norm_name,avgViewers,name,norm_name,twitch_id,unary_name
0,Wii Sports,82.65,Wii,7.7,,Nintendo,Nintendo EAD,wii sports,4.881188,Wii Sports,wii sports,12232.0,wii sports
1,Grand Theft Auto V,65.62,"PS3, PS4, X360, PC, XOne",9.366667,9.0,"Rockstar Games , Rockstar Games , Rockstar G...","Rockstar North , Rockstar North , Rockstar N...",grand theft auto v,94222.248958,Grand Theft Auto V,grand theft auto v,32982.0,grand theft auto v
2,Super Mario Bros.,40.24,"NES, VC, WiiU, 3DS",9.5,8.2,"Nintendo , Nintendo , Nintendo , Nintendo","Nintendo EAD , Nintendo EAD , Unknown , Unk...",super mario bros.,49.296454,Super Mario Bros.,super mario bros.,509508.0,super mario bros.
3,Mario Kart Wii,35.98,Wii,8.2,9.1,Nintendo,Nintendo EAD,mario kart wii,93.326776,Mario Kart Wii,mario kart wii,18871.0,mario kart wii
4,Tetris,35.84,"GB, NES, WS, PSN",7.5,,"Nintendo , Nintendo , Vanguard , Electronic...","Bullet Proof Software , Nintendo , Vanguard ...",tetris,50.957424,Tetris,tetris,1736.0,tetris
5,Minecraft,33.83,"PC, X360, PS4, PS3, XOne, PSV, NS, WiiU, 3DS, ...",8.16,,"Mojang , Microsoft Studios , Sony Computer E...","Mojang AB , Mojang , Mojang , Mojang , Moj...",minecraft,12378.073958,Minecraft,minecraft,27471.0,minecraft
6,Wii Sports Resort,32.9,Wii,8.0,8.8,Nintendo,Nintendo EAD,wii sports resort,1.351852,Wii Sports Resort,wii sports resort,19259.0,wii sports resort
7,Pokémon Red / Green / Blue Version,31.37,GB,9.4,,Nintendo,Game Freak,pokémon red/blue,31.33956,Pokémon Red/Blue,pokémon red/blue,3623.0,pokémon red/blue
8,Call of Duty: Black Ops,30.99,"X360, PS3, PC, Wii, DS, XBL, PSN",8.285714,,"Activision , Activision , Activision , Acti...","Treyarch , Treyarch , Treyarch , Treyarch ...",call of duty black ops,394.394931,Call of Duty: Black Ops,call of duty black ops,23894.0,call of duty black ops
9,Call of Duty: Modern Warfare 3,30.71,"X360, PS3, PC, Wii",7.5,,"Activision , Activision , Activision , Acti...","Infinity Ward , Infinity Ward , Infinity War...",call of duty modern warfare iii,29.654709,Call of Duty: Modern Warfare 3,call of duty modern warfare iii,31551.0,call of duty modern warfare iii


Remaining missing matches:

In [43]:
merged_df[merged_df['name'].isna()]

Unnamed: 0,Name,Global_Sales,Platform,Critic_Score,User_Score,Publisher,Developer,Norm_name,avgViewers,name,norm_name,twitch_id,unary_name
18,Nintendogs,24.68,DS,8.4,,Nintendo,Nintendo EAD,nintendogs,,,,,
22,Wii Fit,22.7,Wii,7.9,,Nintendo,Nintendo EAD,wii fit,,,,,
24,Kinect Adventures!,22.1,X360,6.7,,Microsoft Game Studios,Good Science Studio,kinect adventures!,,,,,
25,Wii Fit Plus,21.81,Wii,8.0,,Nintendo,Nintendo EAD,wii fit plus,,,,,
29,Brain Age: Train Your Brain in Minutes a Day,20.16,DS,8.1,,Nintendo,Nintendo SDD,brain age train your brain in minutes a day,,,,,


# From twitch to vgchartz 

In [44]:
# Leading rows by average viewers. Slice by position: after sort_values the
# index still carries the pre-sort labels, so a label slice (.loc[0:50]) means
# "from label 0 through label 50 in the sorted order", which is only the top of
# the ranking when the frame happened to be sorted already.
# 51 rows are kept, matching the inclusive .loc[0:50] slices used elsewhere.
top_50_twitch_df = top_twitch_df.sort_values('avgViewers', ascending=False).iloc[:51]

In [45]:
# Attach Vgchartz data to the most-watched Twitch entries.
# A left join keeps the rows in top_twitch_df order (most viewers first), so the
# label slice takes the leading rows of that ranking. An outer join does not
# guarantee this: its documented key order is lexicographic, and recent pandas
# versions enforce it, which would make the slice pick alphabetically-first rows.
merged_df2 = pd.merge(top_twitch_df, top_vgchartz, how='left',
                      left_on='norm_name', right_on='Norm_name').loc[0:50]

In [46]:
merged_df2

Unnamed: 0,avgViewers,name,norm_name,twitch_id,unary_name,Name,Global_Sales,Platform,Critic_Score,User_Score,Publisher,Developer,Norm_name
0,126368.478125,Fortnite,fortnite,33214.0,fortnite,Fortnite,2.37,"PS4, XOne, NS, PC",8.5,,"Gearbox Software , Gearbox Software , Warner...","Epic Games , Epic Games , Epic Games , Epic...",fortnite
1,117746.633333,League of Legends,league of legends,21779.0,league of legends,League of Legends,0.0,"PC, OSX",8.0,,"THQ , Riot Games","Riot Games , Riot Games",league of legends
2,94222.248958,Grand Theft Auto V,grand theft auto v,32982.0,grand theft auto v,Grand Theft Auto V,65.62,"PS3, PS4, X360, PC, XOne",9.366667,9.0,"Rockstar Games , Rockstar Games , Rockstar G...","Rockstar North , Rockstar North , Rockstar N...",grand theft auto v
3,79284.045833,Just Chatting,just chatting,509658.0,just chatting,,,,,,,,
4,68005.00625,Dota 2,dota ii,29595.0,dota ii,Dota 2,0.0,"PC, OSX, Linux",,,"Valve Corporation , Valve Corporation , Valv...","Valve , Valve , Valve",dota ii
5,65221.74375,World of Warcraft,world of warcraft,18122.0,world of warcraft,World of Warcraft,6.35,"PC, OSX",9.2,8.0,"Blizzard Entertainment , Blizzard Entertainme...","Blizzard Entertainment , Unknown",world of warcraft
6,38391.845833,Counter-Strike: Global Offensive,counter-strike global offensive,32399.0,counter-strike global offensive,Counter-Strike: Global Offensive,0.0,"PC, XBL, PSN, OSX",,,"Valve , Valve Corporation , Valve Corporatio...","Valve Corporation , Valve Corporation , Valv...",counter-strike global offensive
7,34578.984375,PLAYERUNKNOWN'S BATTLEGROUNDS,playerunknown's battlegrounds,493057.0,playerunknown's battlegrounds,PlayerUnknown's Battlegrounds,2.5,"XOne, PS4",,,"Microsoft Studios , Sony Interactive Entertai...","PUBG Corporation , PUBG Corporation",playerunknown's battlegrounds
8,34578.984375,PLAYERUNKNOWN'S BATTLEGROUNDS,playerunknown's battlegrounds,493057.0,playerunknown's battlegrounds,PLAYERUNKNOWN'S BATTLEGROUNDS,0.0,PC,,,PUBG Corporation,PUBG Corporation,playerunknown's battlegrounds
9,25637.785417,Hearthstone,hearthstone,138585.0,hearthstone,,,,,,,,


Let us check which matches are missing:

In [47]:
merged_df2[merged_df2['Name'].isna()]

Unnamed: 0,avgViewers,name,norm_name,twitch_id,unary_name,Name,Global_Sales,Platform,Critic_Score,User_Score,Publisher,Developer,Norm_name
3,79284.045833,Just Chatting,just chatting,509658.0,just chatting,,,,,,,,
9,25637.785417,Hearthstone,hearthstone,138585.0,hearthstone,,,,,,,,
12,17199.91875,Auto Chess,auto chess,511748.0,auto chess,,,,,,,,
13,15865.169792,Layers of Fear 2,layers of fear ii,510090.0,layers of fear ii,,,,,,,,
16,13694.844792,Call of Duty: Black Ops 4,call of duty black ops iv,504462.0,call of duty black ops iiii,,,,,,,,
22,10517.815385,Bio Inc. Redemption,bio inc. redemption,497530.0,bio inc. redemption,,,,,,,,
23,10151.255208,Talk Shows & Podcasts,talk shows & podcasts,417752.0,talk shows & podcasts,,,,,,,,
25,8504.015625,Music & Performing Arts,music & performing arts,26936.0,music & performing arts,,,,,,,,
26,8453.771429,Clam Man,clam man,512727.0,clam man,,,,,,,,
29,6672.1,Escape From Tarkov,escape from tarkov,491931.0,escape from tarkov,,,,,,,,


- Just Chatting is not a game (and the same goes for 'ASMR', 'Art', etc.)
- Auto Chess is not in vgchartz
- Layers of Fear 2 is not in vgchartz
- Bio Inc. Redemption is not in vgchartz
- Clam Man is not in vgchartz
- Escape From Tarkov is not in vgchartz
- Old School RuneScape is not in vgchartz
- Battalion 1944 is not in vgchartz
- Conan Unconquered is not in vgchartz
- XERA: Survival is not in vgchartz
- ASMR is not in vgchartz
- Poker is not in vgchartz
- Change: A Homeless Survival Experience is not in vgchartz

Solving the remaining discrepancies:

In [48]:
top_vgchartz[top_vgchartz['Name'] == 'Hearthstone: Heroes of Warcraft']

Unnamed: 0,Name,Global_Sales,Platform,Critic_Score,User_Score,Publisher,Developer,Norm_name
36581,Hearthstone: Heroes of Warcraft,0.0,"OSX, PC",,,"Blizzard Entertainment , Blizzard Entertainme...","Unknown , Unknown",hearthstone heroes of warcraft


In [49]:
# Select the row by title rather than by row label: this title has zero
# recorded sales, so its position among the many tied rows is not stable.
is_target = top_vgchartz['Name'] == 'Hearthstone: Heroes of Warcraft'
assert is_target.sum() == 1, "expected exactly one row for this title"
top_vgchartz.loc[is_target, 'Norm_name'] = 'hearthstone'

In [50]:
top_vgchartz[top_vgchartz['Name'] == 'Call of Duty: Black Ops IIII']

Unnamed: 0,Name,Global_Sales,Platform,Critic_Score,User_Score,Publisher,Developer,Norm_name
58,Call of Duty: Black Ops IIII,14.3,"PS4, XOne, PC",,,"Activision , Activision , Activision","Treyarch , Treyarch , Treyarch",call of duty black ops iiii


In [51]:
# Select the row by title rather than by its position in the sales ranking,
# which shifts whenever the data or the sort order changes.
is_target = top_vgchartz['Name'] == 'Call of Duty: Black Ops IIII'
assert is_target.sum() == 1, "expected exactly one row for this title"
top_vgchartz.loc[is_target, 'Norm_name'] = 'call of duty black ops iv'

In [52]:
top_vgchartz[top_vgchartz['Name'] == 'Drive to Survive']

Unnamed: 0,Name,Global_Sales,Platform,Critic_Score,User_Score,Publisher,Developer,Norm_name
25837,Drive to Survive,0.0,PS2,,,VU Games,Supersonic,drive to survive


In [53]:
# Select the row by title rather than by row label: this title has zero
# recorded sales, so its position among the many tied rows is not stable.
is_target = top_vgchartz['Name'] == 'Drive to Survive'
assert is_target.sum() == 1, "expected exactly one row for this title"
top_vgchartz.loc[is_target, 'Norm_name'] = 'mashed drive to survive'

In [54]:
top_vgchartz[top_vgchartz['Name'] == 'StarCraft II: Wings of Liberty']

Unnamed: 0,Name,Global_Sales,Platform,Critic_Score,User_Score,Publisher,Developer,Norm_name
383,StarCraft II: Wings of Liberty,4.86,"PC, OSX",9.3,,"Blizzard Entertainment , Blizzard Entertainme...","Blizzard Entertainment , Unknown",starcraft ii wings of liberty


In [55]:
# Select the row by title rather than by its position in the sales ranking,
# which shifts whenever the data or the sort order changes.
is_target = top_vgchartz['Name'] == 'StarCraft II: Wings of Liberty'
assert is_target.sum() == 1, "expected exactly one row for this title"
top_vgchartz.loc[is_target, 'Norm_name'] = 'starcraft ii'

In [56]:
top_vgchartz[top_vgchartz['Name'] == 'Worms: WMD']

Unnamed: 0,Name,Global_Sales,Platform,Critic_Score,User_Score,Publisher,Developer,Norm_name
8822,Worms: WMD,0.07,"PC, PS4, XOne, NS",,,"Team17 Digital Ltd , Team17 Digital Ltd , Te...","Team17 Digital Ltd , Team17 Digital Ltd , Te...",worms wmd


In [57]:
# Select the row by title rather than by its position in the sales ranking,
# which shifts whenever the data or the sort order changes.
is_target = top_vgchartz['Name'] == 'Worms: WMD'
assert is_target.sum() == 1, "expected exactly one row for this title"
top_vgchartz.loc[is_target, 'Norm_name'] = 'worms w.m.d'

In [58]:
# Re-run the join with the corrected keys.
# A left join keeps the rows in top_twitch_df order (most viewers first), so the
# label slice takes the leading rows of that ranking. An outer join does not
# guarantee this: its documented key order is lexicographic, and recent pandas
# versions enforce it, which would make the slice pick alphabetically-first rows.
merged_df2 = pd.merge(top_twitch_df, top_vgchartz, how='left',
                      left_on='norm_name', right_on='Norm_name').loc[0:50]

Remaining missing matches:

In [59]:
merged_df2[merged_df2['Norm_name'].isna()]

Unnamed: 0,avgViewers,name,norm_name,twitch_id,unary_name,Name,Global_Sales,Platform,Critic_Score,User_Score,Publisher,Developer,Norm_name
3,79284.045833,Just Chatting,just chatting,509658.0,just chatting,,,,,,,,
12,17199.91875,Auto Chess,auto chess,511748.0,auto chess,,,,,,,,
13,15865.169792,Layers of Fear 2,layers of fear ii,510090.0,layers of fear ii,,,,,,,,
22,10517.815385,Bio Inc. Redemption,bio inc. redemption,497530.0,bio inc. redemption,,,,,,,,
23,10151.255208,Talk Shows & Podcasts,talk shows & podcasts,417752.0,talk shows & podcasts,,,,,,,,
25,8504.015625,Music & Performing Arts,music & performing arts,26936.0,music & performing arts,,,,,,,,
26,8453.771429,Clam Man,clam man,512727.0,clam man,,,,,,,,
29,6672.1,Escape From Tarkov,escape from tarkov,491931.0,escape from tarkov,,,,,,,,
30,6097.37931,Trivia 101,trivia ci,1026.0,trivia iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii...,,,,,,,,
33,5964.107292,Art,art,509660.0,art,,,,,,,,


## Getting all games

In [60]:
# Stack the two match tables (Vgchartz->Twitch and Twitch->Vgchartz) row-wise;
# sort=True orders the columns alphabetically.
both_directions = [merged_df, merged_df2]
final = pd.concat(both_directions, sort=True)

In [61]:
# Keep only rows matched on both sides, i.e. with a value in both 'Norm_name'
# and 'norm_name'. (Open question: should the unmatched rows be kept instead?)
matched_both = final['Norm_name'].notna() & final['norm_name'].notna()
final = final.loc[matched_both].sort_values(by='Norm_name').reset_index(drop=True)

In [62]:
# True for every row whose 'Norm_name' occurs again further down; only the
# last occurrence of each key is left unmarked.
dup_idx = final.duplicated(subset=['Norm_name'], keep='last')

In [63]:
# remove duplicate rows (e.g., GTA 5 was found in both Vgchartz and twitch)
# `~` is the boolean inversion operator. Unary minus on a boolean mask only
# works where the library special-cases it (NumPy rejects it outright).
final = final[~dup_idx].reset_index(drop=True)

In [64]:
# The Vgchartz-side title columns duplicate 'name' / 'norm_name', so drop them.
final = final.drop(columns=['Name', 'Norm_name'])

In [65]:
# Put identifiers first, then the two metrics, then the descriptive columns.
column_order = [
    'twitch_id', 'name', 'Global_Sales', 'avgViewers',
    'Platform', 'Publisher', 'Developer',
    'Critic_Score', 'User_Score',
    'norm_name', 'unary_name',
]
final = final[column_order]

In [66]:
# A sales figure of exactly 0 is treated as missing and turned into NaN.
final['Global_Sales'] = final['Global_Sales'].mask(final['Global_Sales'] == 0)

In [67]:
# Most-watched games first; the index labels from the previous step are kept.
final = final.sort_values(by='avgViewers', ascending=False)

#### (adding alternative names)

In [None]:
# alt_names = []

In [None]:
#for name in final['name']:
#    print(name)
#    alt_name = input()
#    if alt_name == '':
#        alt_name = np.nan
#    alt_names.append(alt_name)

In [None]:
# alt_names

(Saving the list just in case we use it...)

In [68]:
# Short alias so that the hand-written list of alternative names can spell
# missing entries as a bare `nan`.
nan = np.nan

[nan,
 'LoL',
 'GTAV, GTA5',
 nan,
 'WoW',
 'CSGO',
 'PUGB',
 nan,
 nan,
 nan,
 nan,
 nan,
 'COD Black Ops 4, COD Black Ops IV',
 nan,
 nan,
 nan,
 'Rainbow Six Siege',
 'FIFA XIX',
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 'SCII, SC2, SC II, SC 2, StarCraft 2',
 nan,
 nan,
 'Worms WMD',
 'Skyrim',
 nan,
 'FF XIV, FFXIV, FF 14, FF14',
 'NBA 2019, NBA 19',
 nan,
 nan,
 nan,
 'COD Black Ops 2, COD Black Ops II',
 'GTA San Andreas',
 'COD Black Ops',
 'Cod Black Ops 3, COD Black Ops III',
 'GTA Vice City',
 'FIFA XVII',
 'COD MW2, COD MWII, COD Modern Warfare 2, COD Modern Warfare II',
 'GHIII',
 nan,
 nan,
 'FIFA XV',
 nan,
 'COD WWII',
 nan,
 nan,
 nan,
 nan,
 nan,
 'COD MW3, COD MWIII, COD Modern Warfare III, COD Modern Warfare 3',
 'FIFA XVIII',
 'COD IV Modern Warfare, COD IV MW, COD 4 MW, COD 4 Modern Warfare',
 'GTAIV, GTA IV, GTA4, GTA 4',
 nan,
 'COD Ghosts',
 nan,
 nan,
 nan,
 nan,
 'COD Advanced Warfare',
 'COD World at War',
 nan,
 nan,
 'FIFA 13, FIFA XIII',
 'FIFA XVI',
 nan,
 nan,
 nan,
 nan,
 nan,
 'FIFA XIV']

In [69]:
# Hand-collected alternative names, one entry per row of `final` in its current
# order (most-watched game first); np.nan marks "no alternative name".
# Several spellings for one game are stored as a single comma-separated string.
# Entry 6 (PLAYERUNKNOWN'S BATTLEGROUNDS) was misspelled 'PUGB'; it is now 'PUBG'.
alt_names = [
    np.nan,
    'LoL',
    'GTAV, GTA5',
    np.nan,
    'WoW',
    'CSGO',
    'PUBG',
    np.nan, np.nan, np.nan, np.nan, np.nan,
    'COD Black Ops 4, COD Black Ops IV',
    np.nan, np.nan, np.nan,
    'Rainbow Six Siege',
    'FIFA XIX',
    np.nan, np.nan, np.nan, np.nan, np.nan, np.nan,
    'SCII, SC2, SC II, SC 2, StarCraft 2',
    np.nan, np.nan,
    'Worms WMD',
    'Skyrim',
    np.nan,
    'FF XIV, FFXIV, FF 14, FF14',
    'NBA 2019, NBA 19',
    np.nan, np.nan, np.nan,
    'COD Black Ops 2, COD Black Ops II',
    'GTA San Andreas',
    'COD Black Ops',
    'Cod Black Ops 3, COD Black Ops III',
    'GTA Vice City',
    'FIFA XVII',
    'COD MW2, COD MWII, COD Modern Warfare 2, COD Modern Warfare II',
    'GHIII',
    np.nan, np.nan,
    'FIFA XV',
    np.nan,
    'COD WWII',
    np.nan, np.nan, np.nan, np.nan, np.nan,
    'COD MW3, COD MWIII, COD Modern Warfare III, COD Modern Warfare 3',
    'FIFA XVIII',
    'COD IV Modern Warfare, COD IV MW, COD 4 MW, COD 4 Modern Warfare',
    'GTAIV, GTA IV, GTA4, GTA 4',
    np.nan,
    'COD Ghosts',
    np.nan, np.nan, np.nan, np.nan,
    'COD Advanced Warfare',
    'COD World at War',
    np.nan, np.nan,
    'FIFA 13, FIFA XIII',
    'FIFA XVI',
    np.nan, np.nan, np.nan, np.nan, np.nan,
    'FIFA XIV',
]

Adding the column of alternative names

In [70]:
# Positional assignment: alt_names[i] goes to the i-th row of `final` as it is
# currently ordered; pandas raises if the lengths differ.
# NOTE(review): the list was typed by hand against one particular row order, so
# any change in the ordering of `final` would silently mislabel rows — consider
# keying the alternative names by 'name' instead.
final['Alt_names'] = alt_names

In [71]:
final

Unnamed: 0,twitch_id,name,Global_Sales,avgViewers,Platform,Publisher,Developer,Critic_Score,User_Score,norm_name,unary_name,Alt_names
29,33214.0,Fortnite,2.37,126368.478125,"PS4, XOne, NS, PC","Gearbox Software , Gearbox Software , Warner...","Epic Games , Epic Games , Epic Games , Epic...",8.500000,,fortnite,fortnite,
36,21779.0,League of Legends,,117746.633333,"PC, OSX","THQ , Riot Games","Riot Games , Riot Games",8.000000,,league of legends,league of legends,LoL
32,32982.0,Grand Theft Auto V,65.62,94222.248958,"PS3, PS4, X360, PC, XOne","Rockstar Games , Rockstar Games , Rockstar G...","Rockstar North , Rockstar North , Rockstar N...",9.366667,9.00,grand theft auto v,grand theft auto v,"GTAV, GTA5"
19,29595.0,Dota 2,,68005.006250,"PC, OSX, Linux","Valve Corporation , Valve Corporation , Valv...","Valve , Valve , Valve",,,dota ii,dota ii,
73,18122.0,World of Warcraft,6.35,65221.743750,"PC, OSX","Blizzard Entertainment , Blizzard Entertainme...","Blizzard Entertainment , Unknown",9.200000,8.00,world of warcraft,world of warcraft,WoW
12,32399.0,Counter-Strike: Global Offensive,,38391.845833,"PC, XBL, PSN, OSX","Valve , Valve Corporation , Valve Corporatio...","Valve Corporation , Valve Corporation , Valv...",,,counter-strike global offensive,counter-strike global offensive,CSGO
49,493057.0,PLAYERUNKNOWN'S BATTLEGROUNDS,,34578.984375,PC,PUBG Corporation,PUBG Corporation,,,playerunknown's battlegrounds,playerunknown's battlegrounds,PUGB
35,138585.0,Hearthstone,,25637.785417,"OSX, PC","Blizzard Entertainment , Blizzard Entertainme...","Unknown , Unknown",,,hearthstone,hearthstone,
48,488552.0,Overwatch,8.18,19756.038542,"PS4, XOne, PC","Blizzard Entertainment , Blizzard Entertainme...","Blizzard Entertainment , Blizzard Entertainme...",9.000000,,overwatch,overwatch,
16,494364.0,Death Stranding,,17626.662139,PS4,Sony Interactive Entertainment,Kojima Productions,,,death stranding,death stranding,


In [None]:
# final.to_csv('../dataset/games_list.csv')