In [1]:
from bson.son import SON

In [2]:
from pymongo import MongoClient
import re
import sys
import json
import pandas as pd
import numpy as np

In [3]:
sys.path.insert(0, '../scripts')

In [4]:
import uniformer

## Connect to remote mongodb

In [5]:
from sshtunnel import SSHTunnelForwarder
import pymongo

In [7]:
with open('../keys.json') as k:
    keys = json.loads(k.read())
    MONGO_HOST = keys["SSH"]["MONGO_HOST"] # i.e. 10.9.13.14
    MONGO_DB = keys["SSH"]["MONGO_DB"] # i.e. dm_project 
    MONGO_USER = keys["SSH"]["MONGO_USER"] # i.e. studente
    MONGO_PASS = keys["SSH"]["MONGO_PASS"] # i.e. la password della vm

server = SSHTunnelForwarder(
    MONGO_HOST,
    ssh_username=MONGO_USER,
    ssh_password=MONGO_PASS,
    remote_bind_address=('127.0.0.1', 27017)
)

In [8]:
server.start() # remember to stop

In [9]:
client = pymongo.MongoClient('127.0.0.1', server.local_bind_port) # server.local_bind_port is assigned local port
db = client[MONGO_DB]

In [10]:
db.list_collection_names()

['system.indexes', 'twitch', 'twitter', 'vgchartz']

In [11]:
games_coll = db.twitch

## Top N games on twitch (collected between 28/05 and 30/05)

Just to check: how many times data from twitch has been collected between 28 and 30 May. We gather all the timestamps

In [12]:
pipeline = [{ "$match" : {'$and' : [
                {'timestamp' : {'$gt' : '2019-05-28'}},
                {'timestamp' : {'$lt' : '2019-05-30'}}
            ]}}, {"$project":
             {'_id': 0,
              'time':'$timestamp'} }]

In [13]:
tsmps = list(games_coll.aggregate(pipeline))

In [14]:
len(tsmps)/ 48

20.0

Data has been collected every 3 minutes, so 20 times each hour is correct.

In [15]:
pipeline = [{"$unwind": "$data"},
            { "$match" : {'$and' : [
                {'timestamp' : {'$gt' : '2019-05-28'}},
                {'timestamp' : {'$lt' : '2019-05-30'}}
            ]}},
            { "$group": {"_id": {"name": "$data.game.name",
                                 "norm_name" : "$data.game.norm_name",
                                 "unary_name" : "$data.game.unary_name",
                                 "twitch_id" : "$data.game._id"},
                         "avgViewers": { "$avg": '$data.viewers'}}},
           {"$sort": SON([("avgViewers", -1), ("_id", -1)])}]

In [16]:
top_twitch = list(db.twitch.aggregate(pipeline))

In [17]:
top_twitch[0:3]

[{'_id': {'name': 'Fortnite',
   'norm_name': 'fortnite',
   'unary_name': 'fortnite',
   'twitch_id': 33214},
  'avgViewers': 126368.478125},
 {'_id': {'name': 'League of Legends',
   'norm_name': 'league of legends',
   'unary_name': 'league of legends',
   'twitch_id': 21779},
  'avgViewers': 117746.63333333333},
 {'_id': {'name': 'Grand Theft Auto V',
   'norm_name': 'grand theft auto v',
   'unary_name': 'grand theft auto v',
   'twitch_id': 32982},
  'avgViewers': 94222.24895833334}]

In [18]:
top_twitch = [
    {'name' : elem['_id']['name'],
     'norm_name' : elem['_id']['norm_name'],
     'unary_name' : elem['_id']['unary_name'],
     'twitch_id' : elem['_id']['twitch_id'],
     'avgViewers' : elem['avgViewers']
    } for elem in top_twitch
]

In [19]:
top_twitch[0:3]

[{'name': 'Fortnite',
  'norm_name': 'fortnite',
  'unary_name': 'fortnite',
  'twitch_id': 33214,
  'avgViewers': 126368.478125},
 {'name': 'League of Legends',
  'norm_name': 'league of legends',
  'unary_name': 'league of legends',
  'twitch_id': 21779,
  'avgViewers': 117746.63333333333},
 {'name': 'Grand Theft Auto V',
  'norm_name': 'grand theft auto v',
  'unary_name': 'grand theft auto v',
  'twitch_id': 32982,
  'avgViewers': 94222.24895833334}]

In [20]:
top_twitch_df = pd.DataFrame(top_twitch)
top_twitch_df.head()

Unnamed: 0,avgViewers,name,norm_name,twitch_id,unary_name
0,126368.478125,Fortnite,fortnite,33214,fortnite
1,117746.633333,League of Legends,league of legends,21779,league of legends
2,94222.248958,Grand Theft Auto V,grand theft auto v,32982,grand theft auto v
3,79284.045833,Just Chatting,just chatting,509658,just chatting
4,68005.00625,Dota 2,dota ii,29595,dota ii


# Top N games on Vgchartz from Mongo DB

It is necessary to execute the Notebook "Save Top N games on MongoDB" before executing the next code chunks

In [21]:
top_games_vg = db.vgchartz

Quick check for seeing if everything was loaded correctly

In [22]:
top_games_vg.find_one()

{'_id': ObjectId('5d13adcef0bd0623b0487cfe'),
 'Rank': 1,
 'Name': 'Wii Sports',
 'Platform': 'Wii',
 'Year': 2006.0,
 'href_Genre': 'http://www.vgchartz.com/game/2667/wii-sports/?region=All',
 'Publisher': 'Nintendo  ',
 'Developer': 'Nintendo EAD  ',
 'Critic_Score': 7.7,
 'User_Score': nan,
 'NA_Sales': 41.36,
 'PAL_Sales': 29.02,
 'JP_Sales': 3.77,
 'Other_Sales': 8.51,
 'Global_Sales': 82.65}

Vgchartz returns the global sales data for each game and for each Platform. Therefore the game "Grand Theft Auto V" will appear several times. Let's find out if this is correct.

In [23]:
gtaV_occurences = top_games_vg.find({"Name": "Grand Theft Auto V"})
for game in gtaV_occurences:
    print(game)

{'_id': ObjectId('5d13adcef0bd0623b0487d0f'), 'Rank': 20, 'Name': 'Grand Theft Auto V', 'Platform': 'PS3', 'Year': 2013.0, 'href_Genre': 'http://www.vgchartz.com/game/65884/grand-theft-auto-v/?region=All', 'Publisher': 'Rockstar Games  ', 'Developer': 'Rockstar North  ', 'Critic_Score': 9.4, 'User_Score': nan, 'NA_Sales': 6.37, 'PAL_Sales': 9.85, 'JP_Sales': 0.99, 'Other_Sales': 3.12, 'Global_Sales': 20.32}
{'_id': ObjectId('5d13adcef0bd0623b0487d10'), 'Rank': 21, 'Name': 'Grand Theft Auto V', 'Platform': 'PS4', 'Year': 2014.0, 'href_Genre': 'http://www.vgchartz.com/game/83196/grand-theft-auto-v/?region=All', 'Publisher': 'Rockstar Games  ', 'Developer': 'Rockstar North  ', 'Critic_Score': 9.7, 'User_Score': nan, 'NA_Sales': 6.06, 'PAL_Sales': 9.71, 'JP_Sales': 0.6, 'Other_Sales': 3.02, 'Global_Sales': 19.39}
{'_id': ObjectId('5d13adcef0bd0623b0487d1c'), 'Rank': 34, 'Name': 'Grand Theft Auto V', 'Platform': 'X360', 'Year': 2013.0, 'href_Genre': 'http://www.vgchartz.com/game/65883/grand

As imagined there is one data entry for each platform. We are interested in the "whole" game performance, therefore we will perform the following actions:
- Group by the games according to their name.
- Sum all the global sales
- Average the Critic Score
- Average the User Score
- Store all Platforms: as in the GTA V case many games were released for different platforms, therefore it is important to not lose this info after the group by
- Store all the game developers: it is possible that the games that were released for different platforms were developed by different software houses, therefore it is importat to not lose this info after the group by

In [24]:
# Works only for MongoDB 3.4 and later versions
"""vgchartz_pipeline = [
     { "$addFields": {
         "Critic_Score_norm": {"$cond": 
             { "if": { "$eq": [ "$Critic_Score", np.nan ] }, 
                   "then": "null", 
                   "else": "$Critic_Score" }},
         "User_Score_norm": {"$cond": 
             { "if": { "$eq": [ "$User_Score", np.nan ] }, 
                   "then": "null", 
                   "else": "$User_Score" }}
        }
     }, 
    { "$group": {
        "_id": "$Name",
        "Name": {"$first":"$Name"}, #This is necessary because every variable assignment should be done using group operators
        "Global_sales_all_platform": {"$sum": "$Global_Sales"},
        "Year": {"$first": "$Year"},
        "Critic_score": {"$avg":"$Critic_Score_norm"},
        "User_score": {"$avg":"$User_Score_norm"},
        "Publishers": {"$push": {"$concat":["$Publisher"]}},
        "Developers": {"$push": {"$concat":["$Developer"]}},
        "Platforms": {"$push": {"$concat":["$Platform"]}}
        }
    },
    { "$project": { "_id": 0 } },
    {"$sort": {"Global_sales_all_platform": -1} }]
top_vgchartz = list(top_games_vg.aggregate(vgchartz_pipeline))"""

'vgchartz_pipeline = [\n     { "$addFields": {\n         "Critic_Score_norm": {"$cond": \n             { "if": { "$eq": [ "$Critic_Score", np.nan ] }, \n                   "then": "null", \n                   "else": "$Critic_Score" }},\n         "User_Score_norm": {"$cond": \n             { "if": { "$eq": [ "$User_Score", np.nan ] }, \n                   "then": "null", \n                   "else": "$User_Score" }}\n        }\n     }, \n    { "$group": {\n        "_id": "$Name",\n        "Name": {"$first":"$Name"}, #This is necessary because every variable assignment should be done using group operators\n        "Global_sales_all_platform": {"$sum": "$Global_Sales"},\n        "Year": {"$first": "$Year"},\n        "Critic_score": {"$avg":"$Critic_Score_norm"},\n        "User_score": {"$avg":"$User_Score_norm"},\n        "Publishers": {"$push": {"$concat":["$Publisher"]}},\n        "Developers": {"$push": {"$concat":["$Developer"]}},\n        "Platforms": {"$push": {"$concat":["$Platfor

In [25]:
vgchartz_pipeline = [ 
    { "$group": {
        "_id": "$Name",
        "Name": {"$first":"$Name"}, #This is necessary because every variable assignment should be done using group operators
        "Global_sales_all_platform": {"$sum": "$Global_Sales"},
        "Year": {"$first": "$Year"},
        "Critic_score": {"$addToSet":"$Critic_Score"},
        "User_score": {"$addToSet":"$User_Score"},
        "Publishers": {"$push": {"$concat":["$Publisher"]}},
        "Developers": {"$push": {"$concat":["$Developer"]}},
        "Platforms": {"$push": {"$concat":["$Platform"]}}
        }
    },
    {"$sort": {"Global_sales_all_platform": -1} }]
top_vgchartz = list(top_games_vg.aggregate(vgchartz_pipeline))

In [26]:
vgchartz_df = pd.DataFrame.from_dict(top_vgchartz)
vgchartz_df.head(5)

Unnamed: 0,Critic_score,Developers,Global_sales_all_platform,Name,Platforms,Publishers,User_score,Year,_id
0,[7.7],[Nintendo EAD ],82.65,Wii Sports,[Wii],[Nintendo ],[nan],2006.0,Wii Sports
1,"[9.0, nan, 9.7, 9.4]","[Rockstar North , Rockstar North , Rockstar ...",65.62,Grand Theft Auto V,"[PS3, PS4, X360, PC, XOne]","[Rockstar Games , Rockstar Games , Rockstar ...","[9.0, nan]",2013.0,Grand Theft Auto V
2,[10.0],[Nintendo EAD ],40.24,Super Mario Bros.,[NES],[Nintendo ],[8.2],1985.0,Super Mario Bros.
3,[8.2],[Nintendo EAD ],35.98,Mario Kart Wii,[Wii],[Nintendo ],[9.1],2008.0,Mario Kart Wii
4,[nan],"[Bullet Proof Software , Nintendo ]",35.84,Tetris,"[GB, NES]","[Nintendo , Nintendo ]",[nan],1989.0,Tetris


In [27]:
vgchartz_df['Critic_score'] = vgchartz_df['Critic_score'].apply(lambda x: np.nanmean(x))
vgchartz_df.head(5)

Unnamed: 0,Critic_score,Developers,Global_sales_all_platform,Name,Platforms,Publishers,User_score,Year,_id
0,7.7,[Nintendo EAD ],82.65,Wii Sports,[Wii],[Nintendo ],[nan],2006.0,Wii Sports
1,9.366667,"[Rockstar North , Rockstar North , Rockstar ...",65.62,Grand Theft Auto V,"[PS3, PS4, X360, PC, XOne]","[Rockstar Games , Rockstar Games , Rockstar ...","[9.0, nan]",2013.0,Grand Theft Auto V
2,10.0,[Nintendo EAD ],40.24,Super Mario Bros.,[NES],[Nintendo ],[8.2],1985.0,Super Mario Bros.
3,8.2,[Nintendo EAD ],35.98,Mario Kart Wii,[Wii],[Nintendo ],[9.1],2008.0,Mario Kart Wii
4,,"[Bullet Proof Software , Nintendo ]",35.84,Tetris,"[GB, NES]","[Nintendo , Nintendo ]",[nan],1989.0,Tetris


In [28]:
vgchartz_df['User_score'] = vgchartz_df['User_score'].apply(lambda x: np.nanmean(x))
vgchartz_df.head(5)

Unnamed: 0,Critic_score,Developers,Global_sales_all_platform,Name,Platforms,Publishers,User_score,Year,_id
0,7.7,[Nintendo EAD ],82.65,Wii Sports,[Wii],[Nintendo ],,2006.0,Wii Sports
1,9.366667,"[Rockstar North , Rockstar North , Rockstar ...",65.62,Grand Theft Auto V,"[PS3, PS4, X360, PC, XOne]","[Rockstar Games , Rockstar Games , Rockstar ...",9.0,2013.0,Grand Theft Auto V
2,10.0,[Nintendo EAD ],40.24,Super Mario Bros.,[NES],[Nintendo ],8.2,1985.0,Super Mario Bros.
3,8.2,[Nintendo EAD ],35.98,Mario Kart Wii,[Wii],[Nintendo ],9.1,2008.0,Mario Kart Wii
4,,"[Bullet Proof Software , Nintendo ]",35.84,Tetris,"[GB, NES]","[Nintendo , Nintendo ]",,1989.0,Tetris


In [29]:
vgchartz_df['Developers'] = vgchartz_df['Developers'].apply(lambda x: set(x))
vgchartz_df.head(5)

Unnamed: 0,Critic_score,Developers,Global_sales_all_platform,Name,Platforms,Publishers,User_score,Year,_id
0,7.7,{Nintendo EAD },82.65,Wii Sports,[Wii],[Nintendo ],,2006.0,Wii Sports
1,9.366667,{Rockstar North },65.62,Grand Theft Auto V,"[PS3, PS4, X360, PC, XOne]","[Rockstar Games , Rockstar Games , Rockstar ...",9.0,2013.0,Grand Theft Auto V
2,10.0,{Nintendo EAD },40.24,Super Mario Bros.,[NES],[Nintendo ],8.2,1985.0,Super Mario Bros.
3,8.2,{Nintendo EAD },35.98,Mario Kart Wii,[Wii],[Nintendo ],9.1,2008.0,Mario Kart Wii
4,,"{Nintendo , Bullet Proof Software }",35.84,Tetris,"[GB, NES]","[Nintendo , Nintendo ]",,1989.0,Tetris


In [30]:
vgchartz_df['Publishers'] = vgchartz_df['Publishers'].apply(lambda x: set(x))
vgchartz_df.head(5)

Unnamed: 0,Critic_score,Developers,Global_sales_all_platform,Name,Platforms,Publishers,User_score,Year,_id
0,7.7,{Nintendo EAD },82.65,Wii Sports,[Wii],{Nintendo },,2006.0,Wii Sports
1,9.366667,{Rockstar North },65.62,Grand Theft Auto V,"[PS3, PS4, X360, PC, XOne]",{Rockstar Games },9.0,2013.0,Grand Theft Auto V
2,10.0,{Nintendo EAD },40.24,Super Mario Bros.,[NES],{Nintendo },8.2,1985.0,Super Mario Bros.
3,8.2,{Nintendo EAD },35.98,Mario Kart Wii,[Wii],{Nintendo },9.1,2008.0,Mario Kart Wii
4,,"{Nintendo , Bullet Proof Software }",35.84,Tetris,"[GB, NES]",{Nintendo },,1989.0,Tetris


# Join Vgchartz and Twitch Data

Let's join the data and see the first 25 games

In [31]:
merged_df = pd.merge(vgchartz_df, top_twitch_df,  left_on = 'Name', right_on = 'name', how = 'outer')
merged_df[['Name', 'twitch_id', 'norm_name', 'name']].loc[0:24]

Unnamed: 0,Name,twitch_id,norm_name,name
0,Wii Sports,12232.0,wii sports,Wii Sports
1,Grand Theft Auto V,32982.0,grand theft auto v,Grand Theft Auto V
2,Super Mario Bros.,509508.0,super mario bros.,Super Mario Bros.
3,Mario Kart Wii,18871.0,mario kart wii,Mario Kart Wii
4,Tetris,1736.0,tetris,Tetris
5,Minecraft,27471.0,minecraft,Minecraft
6,Wii Sports Resort,19259.0,wii sports resort,Wii Sports Resort
7,Pokémon Red / Green / Blue Version,,,
8,Call of Duty: Black Ops,23894.0,call of duty black ops,Call of Duty: Black Ops
9,Call of Duty: Modern Warfare 3,31551.0,call of duty modern warfare iii,Call of Duty: Modern Warfare 3


As you can see there may be no match between the two dataframes. In order to find more matches we try to transform the names in both dataframes into the same format, by lowering the name, deleting some punctuation and converting arabic numbers to roman. This has already been done to the Twitch data, resultinng in the field 'norm_name', so we just need to do this for the vgchartz df.

In [32]:
top_twitch_df.head(5)

Unnamed: 0,avgViewers,name,norm_name,twitch_id,unary_name
0,126368.478125,Fortnite,fortnite,33214,fortnite
1,117746.633333,League of Legends,league of legends,21779,league of legends
2,94222.248958,Grand Theft Auto V,grand theft auto v,32982,grand theft auto v
3,79284.045833,Just Chatting,just chatting,509658,just chatting
4,68005.00625,Dota 2,dota ii,29595,dota ii


We will use a specific custom function that will "normalize" the names.

In [33]:
vgchartz_df['Norm_name_vgchartz'] = vgchartz_df['Name'].apply(lambda x: uniformer.uniform(x))

In [34]:
merged_df = pd.merge(vgchartz_df, top_twitch_df,  left_on = 'Norm_name_vgchartz', right_on = 'norm_name', how = 'outer').loc[0:50]
merged_df[['Norm_name_vgchartz', 'norm_name', 'Name', 'name', 'twitch_id']].head(10)

Unnamed: 0,Norm_name_vgchartz,norm_name,Name,name,twitch_id
0,wii sports,wii sports,Wii Sports,Wii Sports,12232.0
1,grand theft auto v,grand theft auto v,Grand Theft Auto V,Grand Theft Auto V,32982.0
2,super mario bros.,super mario bros.,Super Mario Bros.,Super Mario Bros.,509508.0
3,mario kart wii,mario kart wii,Mario Kart Wii,Mario Kart Wii,18871.0
4,tetris,tetris,Tetris,Tetris,1736.0
5,minecraft,minecraft,Minecraft,Minecraft,27471.0
6,wii sports resort,wii sports resort,Wii Sports Resort,Wii Sports Resort,19259.0
7,pokémon red / green / blue version,,Pokémon Red / Green / Blue Version,,
8,call of duty black ops,call of duty black ops,Call of Duty: Black Ops,Call of Duty: Black Ops,23894.0
9,call of duty modern warfare iii,call of duty modern warfare iii,Call of Duty: Modern Warfare 3,Call of Duty: Modern Warfare 3,31551.0


Let's check which matches are still missing:

In [35]:
merged_df[merged_df['norm_name'].isna()][['Norm_name_vgchartz']]

Unnamed: 0,Norm_name_vgchartz
7,pokémon red / green / blue version
17,call of duty modern warfare ii
18,nintendogs
21,pokémon gold / silver version
22,wii fit
24,kinect adventures!
25,wii fit plus
29,brain age train your brain in minutes a day
33,pokémon diamond / pearl version
45,pokémon ruby / sapphire version


- pokemon red / green / blue is called different

In [36]:
top_twitch_df[top_twitch_df['norm_name'].str.contains('pokémon red')]

Unnamed: 0,avgViewers,name,norm_name,twitch_id,unary_name
1096,31.33956,Pokémon Red/Blue,pokémon red/blue,3623,pokémon red/blue


In [37]:
vgchartz_df.loc[vgchartz_df.Norm_name_vgchartz == "pokémon red / green / blue version", "Norm_name_vgchartz"] = 'pokémon red/blue'

- call of duty modern warfare ii is called slightly different

In [38]:
top_twitch_df[top_twitch_df['norm_name'].str.contains('modern warfare')]

Unnamed: 0,avgViewers,name,norm_name,twitch_id,unary_name
436,165.033684,Modern Warfare 2,modern warfare ii,22393,modern warfare ii
556,109.727561,Call of Duty: Modern Warfare Remastered,call of duty modern warfare remastered,494155,call of duty modern warfare remastered
1128,29.654709,Call of Duty: Modern Warfare 3,call of duty modern warfare iii,31551,call of duty modern warfare iii
1196,27.126836,Call of Duty 4: Modern Warfare,call of duty 4 modern warfare,1964,call of duty 4 modern warfare
1365,21.397849,Call of Duty: Modern Warfare,call of duty modern warfare,512710,call of duty modern warfare
7550,0.507289,Call of Duty: Modern Warfare: Mobilized,call of duty modern warfare mobilized,24951,call of duty modern warfare mobilized


In [39]:
vgchartz_df.loc[vgchartz_df.Norm_name_vgchartz == "call of duty modern warfare ii", "Norm_name_vgchartz"] = 'modern warfare ii'

- nintendogs is not on twitch

In [40]:
top_twitch_df[top_twitch_df['norm_name'].str.contains('nintend')]

Unnamed: 0,avgViewers,name,norm_name,twitch_id,unary_name
54,3212.307692,Nintendo Land,nintendo land,65960,nintendo land
107,1207.275362,Monopoly for Nintendo Switch,monopoly for nintendo switch,496604,monopoly for nintendo switch
1713,14.360825,Super Smash Bros. for Nintendo 3DS,super smash bros. for nintendo 3ds,489023,super smash bros. for nintendo 3ds
2527,7.7,Pokémon for Nintendo Switch (Working Title),pokémon for nintendo switch (working title),497451,pokémon for nintendo switch (working title)
2734,6.835443,Nintendo Fan Network,nintendo fan network,491512,nintendo fan network
3679,4.273973,Taiko no Tatsujin: Nintendo Switch Version,taiko no tatsujin nintendo switch version,504454,taiko no tatsujin nintendo switch version
4756,2.709091,Nintendo Labo Variety Kit,nintendo labo variety kit,502913,nintendo labo variety kit
4989,2.444444,Dragon Quest Heroes I・II for Nintendo Switch,dragon quest heroes i・ii for nintendo switch,495059,dragon quest heroes i・ii for nintendo switch
5082,2.333333,Nintendo DSi Instrument Tuner,nintendo dsi instrument tuner,25237,nintendo dsi instrument tuner


- wii fit/ wii fit plus are not on twitch

In [41]:
top_twitch_df[top_twitch_df['norm_name'].str.contains('wii')]

Unnamed: 0,avgViewers,name,norm_name,twitch_id,unary_name
602,93.326776,Mario Kart Wii,mario kart wii,18871,mario kart wii
740,63.221985,Call of Duty: WWII,call of duty wwii,496712,call of duty wwii
806,54.77381,Strategic Command WWII: World at War,strategic command wwii world at war,510548,strategic command wwii world at war
1445,19.032258,PokéPark Wii: Pikachu's Adventure,poképark wii pikachu's adventure,28024,poképark wii pikachu's adventure
1969,11.296296,Wii Sports Club,wii sports club,408033,wii sports club
2043,10.728448,New Super Mario Bros. Wii,new super mario bros. wii,24238,new super mario bros. wii
3391,4.881188,Wii Sports,wii sports,12232,wii sports
4243,3.333333,Wii Party U,wii party u,110751,wii party u
4434,3.050657,Super Smash Bros. for Wii U,super smash bros. for wii u,488353,super smash bros. for wii u
6149,1.5,Mario Kart Wii U (Working Title),mario kart wii u (working title),110753,mario kart wii u (working title)


- kinect adventures is not on twitch

In [42]:
top_twitch_df[top_twitch_df['norm_name'].str.contains('kinect')]

Unnamed: 0,avgViewers,name,norm_name,twitch_id,unary_name
3569,4.483871,Kinect Disneyland Adventures,kinect disneyland adventures,31884,kinect disneyland adventures
5502,2.0,Kinect Rush: A Disney/Pixar Adventure,kinect rush a disney/pixar adventure,32939,kinect rush a disney/pixar adventure
5942,1.666667,Harry Potter for Kinect,harry potter for kinect,65831,harry potter for kinect
6972,1.0,Fruit Ninja Kinect 2,fruit ninja kinect ii,461397,fruit ninja kinect ii


- brain age/ brain age 2 are not on twitch

In [43]:
top_twitch_df[top_twitch_df['norm_name'].str.contains('brain')]

Unnamed: 0,avgViewers,name,norm_name,twitch_id,unary_name
1544,17.117647,Brain / Out,brain / out,496037,brain / out
2466,8.0,Brain Lord,brain lord,1447,brain lord
5343,2.097087,The Island of Dr. Brain,the island of dr. brain,16134,the island of dr. brain


Solving the remaining Pokemon discrepancies:

In [44]:
top_twitch_df[top_twitch_df['norm_name'].str.contains('ruby')]

Unnamed: 0,avgViewers,name,norm_name,twitch_id,unary_name
582,100.82994,Pokémon Ruby/Sapphire,pokémon ruby/sapphire,9287,pokémon ruby/sapphire
1600,16.106299,Pokémon Omega Ruby/Alpha Sapphire,pokémon omega ruby/alpha sapphire,460327,pokémon omega ruby/alpha sapphire
5969,1.636364,Pokémon Pinball: Ruby & Sapphire,pokémon pinball ruby & sapphire,8075,pokémon pinball ruby & sapphire


In [45]:
vgchartz_df.loc[vgchartz_df.Norm_name_vgchartz == "pokémon ruby / sapphire version", 'Norm_name_vgchartz'] = 'pokémon ruby/sapphire'

In [46]:
top_twitch_df[top_twitch_df['norm_name'].str.contains('pearl')]

Unnamed: 0,avgViewers,name,norm_name,twitch_id,unary_name
1549,17.008197,Pokémon Diamond/Pearl,pokémon diamond/pearl,2993,pokémon diamond/pearl


In [47]:
vgchartz_df.loc[vgchartz_df.Norm_name_vgchartz == "pokémon diamond / pearl version", "Norm_name_vgchartz"] = 'pokémon diamond/pearl'

In [48]:
top_twitch_df[top_twitch_df['norm_name'].str.contains('pokémon gold')]

Unnamed: 0,avgViewers,name,norm_name,twitch_id,unary_name
1114,30.186462,Pokémon Gold/Silver,pokémon gold/silver,7528,pokémon gold/silver


In [49]:
vgchartz_df.loc[vgchartz_df.Norm_name_vgchartz == "pokémon gold / silver version", "Norm_name_vgchartz"] = 'pokémon gold/silver'

- Let's see if Fifa 13 is available on twitch

In [50]:
top_twitch_df[top_twitch_df['norm_name'].str.contains('fifa')]

Unnamed: 0,avgViewers,name,norm_name,twitch_id,unary_name
20,10657.876042,FIFA 19,fifa xix,506103,fifa iiiiiiiiiiiiiiiiiii
245,416.171717,FIFA Soccer 97,fifa soccer xcvii,3107,fifa soccer iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii...
368,217.278912,FIFA 11,fifa xi,28500,fifa iiiiiiiiiii
379,203.572939,FIFA 17,fifa xvii,493091,fifa iiiiiiiiiiiiiiiii
601,93.448649,FIFA 15,fifa xv,460402,fifa iiiiiiiiiiiiiii
641,82.997126,FIFA Online 4,fifa online iv,504798,fifa online iiii
1173,28.10956,FIFA 18,fifa xviii,495589,fifa iiiiiiiiiiiiiiiiii
4785,2.666667,FIFA Soccer 13,fifa soccer xiii,32443,fifa soccer iiiiiiiiiiiii
5521,2.0,FIFA 2000,fifa mm,26486,fifa iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii...
5747,1.835821,FIFA 16,fifa xvi,489608,fifa iiiiiiiiiiiiiiii


In [51]:
vgchartz_df.loc[vgchartz_df.Norm_name_vgchartz == "fifa xiii", "Norm_name_vgchartz"] = 'fifa soccer xiii'

Now the main discrepancies are solved, let's rejoin the datasets

## Merge data

In [52]:
top_twitch_vgchartz = pd.merge(vgchartz_df, top_twitch_df,  left_on = 'Norm_name_vgchartz', right_on = 'norm_name', how = 'outer').loc[0:50]

In [53]:
top_twitch_vgchartz.head(25)

Unnamed: 0,Critic_score,Developers,Global_sales_all_platform,Name,Platforms,Publishers,User_score,Year,_id,Norm_name_vgchartz,avgViewers,name,norm_name,twitch_id,unary_name
0,7.7,{Nintendo EAD },82.65,Wii Sports,[Wii],{Nintendo },,2006.0,Wii Sports,wii sports,4.881188,Wii Sports,wii sports,12232.0,wii sports
1,9.366667,{Rockstar North },65.62,Grand Theft Auto V,"[PS3, PS4, X360, PC, XOne]",{Rockstar Games },9.0,2013.0,Grand Theft Auto V,grand theft auto v,94222.248958,Grand Theft Auto V,grand theft auto v,32982.0,grand theft auto v
2,10.0,{Nintendo EAD },40.24,Super Mario Bros.,[NES],{Nintendo },8.2,1985.0,Super Mario Bros.,super mario bros.,49.296454,Super Mario Bros.,super mario bros.,509508.0,super mario bros.
3,8.2,{Nintendo EAD },35.98,Mario Kart Wii,[Wii],{Nintendo },9.1,2008.0,Mario Kart Wii,mario kart wii,93.326776,Mario Kart Wii,mario kart wii,18871.0,mario kart wii
4,,"{Nintendo , Bullet Proof Software }",35.84,Tetris,"[GB, NES]",{Nintendo },,1989.0,Tetris,tetris,50.957424,Tetris,tetris,1736.0,tetris
5,7.7,"{Mojang , 4J Studios , Other Ocean Interacti...",33.83,Minecraft,"[X360, PS4, PS3, XOne, PSV, NS, WiiU, 3DS]","{Sony Computer Entertainment America , Micros...",,2013.0,Minecraft,minecraft,12378.073958,Minecraft,minecraft,27471.0,minecraft
6,8.0,{Nintendo EAD },32.9,Wii Sports Resort,[Wii],{Nintendo },8.8,2009.0,Wii Sports Resort,wii sports resort,1.351852,Wii Sports Resort,wii sports resort,19259.0,wii sports resort
7,9.4,{Game Freak },31.37,Pokémon Red / Green / Blue Version,[GB],{Nintendo },,1998.0,Pokémon Red / Green / Blue Version,pokémon red/blue,31.33956,Pokémon Red/Blue,pokémon red/blue,3623.0,pokémon red/blue
8,8.1,"{Treyarch , n-Space }",30.99,Call of Duty: Black Ops,"[X360, PS3, PC, Wii, DS]",{Activision },,2010.0,Call of Duty: Black Ops,call of duty black ops,394.394931,Call of Duty: Black Ops,call of duty black ops,23894.0,call of duty black ops
9,7.5,{Infinity Ward },30.71,Call of Duty: Modern Warfare 3,"[X360, PS3, PC, Wii]",{Activision },,2011.0,Call of Duty: Modern Warfare 3,call of duty modern warfare iii,29.654709,Call of Duty: Modern Warfare 3,call of duty modern warfare iii,31551.0,call of duty modern warfare iii


## Identifying the final list of 50 games

We have decided to monitor the following data:
- Top 25 games from vgchartz (excluding the ones that aren't on twitch)
- Top 25 games from twitch (excluding the ones that were previously selected)

### Select top 25 from vgchartz

In [54]:
top_25_vgchartz = top_twitch_vgchartz[top_twitch_vgchartz.norm_name.notnull()].reset_index().loc[0:24]

In [55]:
top_25_vgchartz

Unnamed: 0,index,Critic_score,Developers,Global_sales_all_platform,Name,Platforms,Publishers,User_score,Year,_id,Norm_name_vgchartz,avgViewers,name,norm_name,twitch_id,unary_name
0,0,7.7,{Nintendo EAD },82.65,Wii Sports,[Wii],{Nintendo },,2006.0,Wii Sports,wii sports,4.881188,Wii Sports,wii sports,12232.0,wii sports
1,1,9.366667,{Rockstar North },65.62,Grand Theft Auto V,"[PS3, PS4, X360, PC, XOne]",{Rockstar Games },9.0,2013.0,Grand Theft Auto V,grand theft auto v,94222.248958,Grand Theft Auto V,grand theft auto v,32982.0,grand theft auto v
2,2,10.0,{Nintendo EAD },40.24,Super Mario Bros.,[NES],{Nintendo },8.2,1985.0,Super Mario Bros.,super mario bros.,49.296454,Super Mario Bros.,super mario bros.,509508.0,super mario bros.
3,3,8.2,{Nintendo EAD },35.98,Mario Kart Wii,[Wii],{Nintendo },9.1,2008.0,Mario Kart Wii,mario kart wii,93.326776,Mario Kart Wii,mario kart wii,18871.0,mario kart wii
4,4,,"{Nintendo , Bullet Proof Software }",35.84,Tetris,"[GB, NES]",{Nintendo },,1989.0,Tetris,tetris,50.957424,Tetris,tetris,1736.0,tetris
5,5,7.7,"{Mojang , 4J Studios , Other Ocean Interacti...",33.83,Minecraft,"[X360, PS4, PS3, XOne, PSV, NS, WiiU, 3DS]","{Sony Computer Entertainment America , Micros...",,2013.0,Minecraft,minecraft,12378.073958,Minecraft,minecraft,27471.0,minecraft
6,6,8.0,{Nintendo EAD },32.9,Wii Sports Resort,[Wii],{Nintendo },8.8,2009.0,Wii Sports Resort,wii sports resort,1.351852,Wii Sports Resort,wii sports resort,19259.0,wii sports resort
7,7,9.4,{Game Freak },31.37,Pokémon Red / Green / Blue Version,[GB],{Nintendo },,1998.0,Pokémon Red / Green / Blue Version,pokémon red/blue,31.33956,Pokémon Red/Blue,pokémon red/blue,3623.0,pokémon red/blue
8,8,8.1,"{Treyarch , n-Space }",30.99,Call of Duty: Black Ops,"[X360, PS3, PC, Wii, DS]",{Activision },,2010.0,Call of Duty: Black Ops,call of duty black ops,394.394931,Call of Duty: Black Ops,call of duty black ops,23894.0,call of duty black ops
9,9,7.5,{Infinity Ward },30.71,Call of Duty: Modern Warfare 3,"[X360, PS3, PC, Wii]",{Activision },,2011.0,Call of Duty: Modern Warfare 3,call of duty modern warfare iii,29.654709,Call of Duty: Modern Warfare 3,call of duty modern warfare iii,31551.0,call of duty modern warfare iii


# Select top 25 games from Twitch

Extract the name from "top_25_vgchartz" and store them in the final list


In [56]:
top_50_games = set(top_25_vgchartz['name'])
len(top_50_games)

25

Keep adding games from the twitch DF, ordered by avgViewers, until 50 games are stored

In [57]:
top_twitch_df.head()

Unnamed: 0,avgViewers,name,norm_name,twitch_id,unary_name
0,126368.478125,Fortnite,fortnite,33214,fortnite
1,117746.633333,League of Legends,league of legends,21779,league of legends
2,94222.248958,Grand Theft Auto V,grand theft auto v,32982,grand theft auto v
3,79284.045833,Just Chatting,just chatting,509658,just chatting
4,68005.00625,Dota 2,dota ii,29595,dota ii


In [58]:
index = 0
while len(top_50_games) < 50:
    game = top_twitch_df.loc[index]
    top_50_games.add(game['name'])
    index += 1

In [59]:
top_50_games

{'Apex Legends',
 'Auto Chess',
 'Bio Inc. Redemption',
 'Call of Duty: Advanced Warfare',
 'Call of Duty: Black Ops',
 'Call of Duty: Black Ops 4',
 'Call of Duty: Black Ops II',
 'Call of Duty: Black Ops III',
 'Call of Duty: Ghosts',
 'Call of Duty: Modern Warfare 3',
 'Clam Man',
 'Counter-Strike: Global Offensive',
 'Dead by Daylight',
 'Death Stranding',
 'Dota 2',
 'Duck Hunt',
 'FIFA 19',
 'Fortnite',
 'Grand Theft Auto IV',
 'Grand Theft Auto V',
 'Grand Theft Auto: San Andreas',
 'Hearthstone',
 'Just Chatting',
 'Layers of Fear 2',
 'League of Legends',
 'Magic: The Gathering',
 'Mario Kart DS',
 'Mario Kart Wii',
 'Minecraft',
 'Modern Warfare 2',
 'Music & Performing Arts',
 'New Super Mario Bros.',
 'New Super Mario Bros. Wii',
 'Overwatch',
 "PLAYERUNKNOWN'S BATTLEGROUNDS",
 'Pokémon Gold/Silver',
 'Pokémon Red/Blue',
 'Slots',
 'Splitgate: Arena Warfare',
 'Super Mario Bros.',
 'Super Mario World',
 'Talk Shows & Podcasts',
 'Tetris',
 'The Elder Scrolls V: Skyrim',
 "T

In [60]:
len(top_50_games)

50

#  Same games selection, with norm_name

Extract the normalized name from "top_25_vgchartz" and store them in the final JSON Files. This is done because it will be easier to collect data because of the lack of punctuation characters

In [61]:
top_50_games_n = set(top_25_vgchartz['norm_name'])
len(top_50_games_n)

25

Keep adding games from the twitch DF, ordered by avgViewers, until 50 games are stored

In [62]:
top_twitch_df.head()

Unnamed: 0,avgViewers,name,norm_name,twitch_id,unary_name
0,126368.478125,Fortnite,fortnite,33214,fortnite
1,117746.633333,League of Legends,league of legends,21779,league of legends
2,94222.248958,Grand Theft Auto V,grand theft auto v,32982,grand theft auto v
3,79284.045833,Just Chatting,just chatting,509658,just chatting
4,68005.00625,Dota 2,dota ii,29595,dota ii


In [63]:
index = 0
while len(top_50_games_n) < 50:
    game = top_twitch_df.loc[index]
    top_50_games_n.add(game['norm_name'])
    index += 1

In [64]:
top_50_games_n

{'apex legends',
 'auto chess',
 'bio inc. redemption',
 'call of duty advanced warfare',
 'call of duty black ops',
 'call of duty black ops ii',
 'call of duty black ops iii',
 'call of duty black ops iv',
 'call of duty ghosts',
 'call of duty modern warfare iii',
 'clam man',
 'counter-strike global offensive',
 'dead by daylight',
 'death stranding',
 'dota ii',
 'duck hunt',
 'fifa xix',
 'fortnite',
 'grand theft auto iv',
 'grand theft auto san andreas',
 'grand theft auto v',
 'hearthstone',
 'just chatting',
 'layers of fear ii',
 'league of legends',
 'magic the gathering',
 'mario kart ds',
 'mario kart wii',
 'minecraft',
 'modern warfare ii',
 'music & performing arts',
 'new super mario bros.',
 'new super mario bros. wii',
 'overwatch',
 "playerunknown's battlegrounds",
 'pokémon gold/silver',
 'pokémon red/blue',
 'slots',
 'splitgate arena warfare',
 'super mario bros.',
 'super mario world',
 'talk shows & podcasts',
 'tetris',
 'the elder scrolls v skyrim',
 "tom cl

In [65]:
len(top_50_games_n)

50

## Let's store the list as a JSON file

In [66]:
import json

In [67]:
top50_json = {
                "selected_top_50_games": {
                    "name": list(top_50_games), 
                    "norm_name": list(top_50_games_n)
                 }
             }

In [69]:
with open("../top_50_games_to_monitor.json", "w") as filehandler:
    json.dump(top50_json, filehandler)

# Adding Norm_name to vgsales dataset

In [70]:
vgchartz_odf = pd.read_csv('../dataset/vgsales.csv')

In [71]:
vgchartz_odf.head()

Unnamed: 0,Rank,Name,Platform,Year,href_Genre,Publisher,Developer,Critic_Score,User_Score,NA_Sales,PAL_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,http://www.vgchartz.com/game/2667/wii-sports/?...,Nintendo,Nintendo EAD,7.7,,41.36,29.02,3.77,8.51,82.65
1,2,Super Mario Bros.,NES,1985.0,http://www.vgchartz.com/game/6455/super-mario-...,Nintendo,Nintendo EAD,10.0,8.2,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,http://www.vgchartz.com/game/6968/mario-kart-w...,Nintendo,Nintendo EAD,8.2,9.1,15.91,12.92,3.8,3.35,35.98
3,4,PLAYERUNKNOWN'S BATTLEGROUNDS,PC,2017.0,http://www.vgchartz.com/game/215988/playerunkn...,PUBG Corporation,PUBG Corporation,,,,,,,
4,5,Wii Sports Resort,Wii,2009.0,http://www.vgchartz.com/game/24656/wii-sports-...,Nintendo,Nintendo EAD,8.0,8.8,15.61,10.99,3.29,3.02,32.9


In [72]:
vgchartz_odf['Norm_name'] = vgchartz_odf['Name'].apply(uniformer.uniform)

In [75]:
vgchartz_odf.head()

Unnamed: 0,Rank,Name,Platform,Year,href_Genre,Publisher,Developer,Critic_Score,User_Score,NA_Sales,PAL_Sales,JP_Sales,Other_Sales,Global_Sales,Norm_name
0,1,Wii Sports,Wii,2006.0,http://www.vgchartz.com/game/2667/wii-sports/?...,Nintendo,Nintendo EAD,7.7,,41.36,29.02,3.77,8.51,82.65,wii sports
1,2,Super Mario Bros.,NES,1985.0,http://www.vgchartz.com/game/6455/super-mario-...,Nintendo,Nintendo EAD,10.0,8.2,29.08,3.58,6.81,0.77,40.24,super mario bros.
2,3,Mario Kart Wii,Wii,2008.0,http://www.vgchartz.com/game/6968/mario-kart-w...,Nintendo,Nintendo EAD,8.2,9.1,15.91,12.92,3.8,3.35,35.98,mario kart wii
3,4,PLAYERUNKNOWN'S BATTLEGROUNDS,PC,2017.0,http://www.vgchartz.com/game/215988/playerunkn...,PUBG Corporation,PUBG Corporation,,,,,,,,playerunknown's battlegrounds
4,5,Wii Sports Resort,Wii,2009.0,http://www.vgchartz.com/game/24656/wii-sports-...,Nintendo,Nintendo EAD,8.0,8.8,15.61,10.99,3.29,3.02,32.9,wii sports resort


In [74]:
for i in vgchartz_odf.index:
    n = vgchartz_odf.loc[i]['Name']
    m = vgchartz_df[vgchartz_df['Name'] == n]['Norm_name_vgchartz'].values
    if len(m) > 0:
        #print(i, n, vgchartz_df[vgchartz_df['Name'] == n]['Norm_name_vgchartz'].values[0])
        vgchartz_odf.at[i, 'Norm_name'] = m[0]

In [76]:
vgchartz_odf.head()

Unnamed: 0,Rank,Name,Platform,Year,href_Genre,Publisher,Developer,Critic_Score,User_Score,NA_Sales,PAL_Sales,JP_Sales,Other_Sales,Global_Sales,Norm_name
0,1,Wii Sports,Wii,2006.0,http://www.vgchartz.com/game/2667/wii-sports/?...,Nintendo,Nintendo EAD,7.7,,41.36,29.02,3.77,8.51,82.65,wii sports
1,2,Super Mario Bros.,NES,1985.0,http://www.vgchartz.com/game/6455/super-mario-...,Nintendo,Nintendo EAD,10.0,8.2,29.08,3.58,6.81,0.77,40.24,super mario bros.
2,3,Mario Kart Wii,Wii,2008.0,http://www.vgchartz.com/game/6968/mario-kart-w...,Nintendo,Nintendo EAD,8.2,9.1,15.91,12.92,3.8,3.35,35.98,mario kart wii
3,4,PLAYERUNKNOWN'S BATTLEGROUNDS,PC,2017.0,http://www.vgchartz.com/game/215988/playerunkn...,PUBG Corporation,PUBG Corporation,,,,,,,,playerunknown's battlegrounds
4,5,Wii Sports Resort,Wii,2009.0,http://www.vgchartz.com/game/24656/wii-sports-...,Nintendo,Nintendo EAD,8.0,8.8,15.61,10.99,3.29,3.02,32.9,wii sports resort


In [77]:
vgchartz_odf[vgchartz_odf['Norm_name'].str.contains('pokémon red')]

Unnamed: 0,Rank,Name,Platform,Year,href_Genre,Publisher,Developer,Critic_Score,User_Score,NA_Sales,PAL_Sales,JP_Sales,Other_Sales,Global_Sales,Norm_name
5,6,Pokémon Red / Green / Blue Version,GB,1998.0,http://www.vgchartz.com/game/4030/pokemon-red-...,Nintendo,Game Freak,9.4,,11.27,8.89,10.22,1.0,31.37,pokémon red/blue


In [78]:
vgchartz_odf.to_csv('../dataset/vgsales_n.csv')