In [1]:
from bson.son import SON

In [7]:
from pymongo import MongoClient
import re
import sys
import json
import pandas as pd
import numpy as np

In [8]:
# Put ../scripts (relative to the notebook's working directory) at the front of
# the module search path so that modules stored there can be imported.
sys.path.insert(0, '../scripts')

In [9]:
import uniformer

## Connect to remote mongodb

In [10]:
from sshtunnel import SSHTunnelForwarder
import pymongo

In [11]:
# Read the connection settings from the local keys.json file.
with open('keys.json') as keys_file:
    keys = json.load(keys_file)

ssh_settings = keys["SSH"]
MONGO_HOST = ssh_settings["MONGO_HOST"]  # e.g. 10.9.13.14
MONGO_DB = ssh_settings["MONGO_DB"]      # e.g. dm_project
MONGO_USER = ssh_settings["MONGO_USER"]  # e.g. studente
MONGO_PASS = ssh_settings["MONGO_PASS"]  # e.g. the password of the VM

# SSH tunnel towards the MongoDB instance listening on the remote host's
# loopback interface, port 27017. The tunnel is only configured here, not opened.
remote_mongo_address = ('127.0.0.1', 27017)
server = SSHTunnelForwarder(
    MONGO_HOST,
    ssh_username=MONGO_USER,
    ssh_password=MONGO_PASS,
    remote_bind_address=remote_mongo_address,
)

In [12]:
# Open the SSH tunnel; it must be closed explicitly with server.stop() when done.
# NOTE(review): no server.stop() call is visible in this notebook - confirm the
# tunnel is closed somewhere after the last database access.
server.start() # remember to stop

In [13]:
# Connect through the tunnel: server.local_bind_port is the local port that
# was assigned to the forwarded connection.
client = pymongo.MongoClient(host='127.0.0.1', port=server.local_bind_port)
db = client.get_database(MONGO_DB)

In [14]:
db.list_collection_names()

['system.indexes', 'twitch', 'twitter', 'vgchartz']

In [15]:
games_coll = db.twitch

## Top N games on twitch (collected between 28/05 and 30/05)

Sanity check: how many times was data collected from Twitch between 28 and 30 May? We gather all the timestamps in that window.

In [16]:
# Keep only the snapshots whose timestamp lies inside the observation window...
window_filter = {'$and': [
    {'timestamp': {'$gt': '2019-05-28'}},
    {'timestamp': {'$lt': '2019-05-30'}},
]}
# ...and return nothing but that timestamp, under the key 'time'.
timestamp_only = {'_id': 0, 'time': '$timestamp'}

pipeline = [
    {"$match": window_filter},
    {"$project": timestamp_only},
]

In [17]:
tsmps = list(games_coll.aggregate(pipeline))

In [18]:
len(tsmps)/ 48

20.0

Data has been collected every 3 minutes, so 20 times each hour is correct.

In [19]:
# Average number of viewers per game over the observation window, highest first.
pipeline = [
    # Filter on the document-level timestamp BEFORE unwinding, so that only the
    # snapshots inside the window have their "data" array expanded. The result
    # is the same as filtering afterwards, with far fewer intermediate documents.
    {"$match": {'$and': [
        {'timestamp': {'$gt': '2019-05-28'}},
        {'timestamp': {'$lt': '2019-05-30'}}
    ]}},
    # One document per entry of each snapshot's "data" array.
    {"$unwind": "$data"},
    # Group on the fields identifying a game and average its viewers.
    {"$group": {"_id": {"name": "$data.game.name",
                        "norm_name": "$data.game.norm_name",
                        "unary_name": "$data.game.unary_name",
                        "twitch_id": "$data.game._id"},
                "avgViewers": {"$avg": '$data.viewers'}}},
    # SON keeps the sort keys ordered: average first, grouping key as tie-breaker.
    {"$sort": SON([("avgViewers", -1), ("_id", -1)])},
]

In [20]:
top_twitch = list(db.twitch.aggregate(pipeline))

In [21]:
top_twitch[0:3]

[{'_id': {'name': 'Fortnite',
   'norm_name': 'fortnite',
   'unary_name': 'fortnite',
   'twitch_id': 33214},
  'avgViewers': 126368.478125},
 {'_id': {'name': 'League of Legends',
   'norm_name': 'league of legends',
   'unary_name': 'league of legends',
   'twitch_id': 21779},
  'avgViewers': 117746.63333333333},
 {'_id': {'name': 'Grand Theft Auto V',
   'norm_name': 'grand theft auto v',
   'unary_name': 'grand theft auto v',
   'twitch_id': 32982},
  'avgViewers': 94222.24895833334}]

In [22]:
# Flatten each aggregation result: lift the fields of the grouping key ('_id')
# to the top level, next to the computed average.
flattened_games = []
for row in top_twitch:
    game_key = row['_id']
    record = {
        field: game_key[field]
        for field in ('name', 'norm_name', 'unary_name', 'twitch_id')
    }
    record['avgViewers'] = row['avgViewers']
    flattened_games.append(record)

top_twitch = flattened_games

In [23]:
top_twitch[0:3]

[{'name': 'Fortnite',
  'norm_name': 'fortnite',
  'unary_name': 'fortnite',
  'twitch_id': 33214,
  'avgViewers': 126368.478125},
 {'name': 'League of Legends',
  'norm_name': 'league of legends',
  'unary_name': 'league of legends',
  'twitch_id': 21779,
  'avgViewers': 117746.63333333333},
 {'name': 'Grand Theft Auto V',
  'norm_name': 'grand theft auto v',
  'unary_name': 'grand theft auto v',
  'twitch_id': 32982,
  'avgViewers': 94222.24895833334}]

In [24]:
top_twitch_df = pd.DataFrame(top_twitch)
top_twitch_df.head()

Unnamed: 0,avgViewers,name,norm_name,twitch_id,unary_name
0,126368.478125,Fortnite,fortnite,33214,fortnite
1,117746.633333,League of Legends,league of legends,21779,league of legends
2,94222.248958,Grand Theft Auto V,grand theft auto v,32982,grand theft auto v
3,79284.045833,Just Chatting,just chatting,509658,just chatting
4,68005.00625,Dota 2,dota ii,29595,dota ii


# Top N games on Vgchartz from Mongo DB

It is necessary to execute the Notebook "Save Top N games on MongoDB" before executing the next code chunks

In [25]:
top_games_vg = db.vgchartz

Quick check for seeing if everything was loaded correctly

In [26]:
top_games_vg.find_one()

{'_id': ObjectId('5d13adcef0bd0623b0487cfe'),
 'Rank': 1,
 'Name': 'Wii Sports',
 'Platform': 'Wii',
 'Year': 2006.0,
 'href_Genre': 'http://www.vgchartz.com/game/2667/wii-sports/?region=All',
 'Publisher': 'Nintendo  ',
 'Developer': 'Nintendo EAD  ',
 'Critic_Score': 7.7,
 'User_Score': nan,
 'NA_Sales': 41.36,
 'PAL_Sales': 29.02,
 'JP_Sales': 3.77,
 'Other_Sales': 8.51,
 'Global_Sales': 82.65}

Vgchartz returns the global sales data for each game and for each Platform. Therefore the game "Grand Theft Auto V" will appear several times. Let's find out if this is correct.

In [27]:
# Fetch every VGChartz document whose Name is exactly "Grand Theft Auto V"
# and print them one by one to inspect the per-platform entries.
gtaV_occurences = top_games_vg.find({"Name": "Grand Theft Auto V"})
for game in gtaV_occurences:
    print(game)

{'_id': ObjectId('5d13adcef0bd0623b0487d0f'), 'Rank': 20, 'Name': 'Grand Theft Auto V', 'Platform': 'PS3', 'Year': 2013.0, 'href_Genre': 'http://www.vgchartz.com/game/65884/grand-theft-auto-v/?region=All', 'Publisher': 'Rockstar Games  ', 'Developer': 'Rockstar North  ', 'Critic_Score': 9.4, 'User_Score': nan, 'NA_Sales': 6.37, 'PAL_Sales': 9.85, 'JP_Sales': 0.99, 'Other_Sales': 3.12, 'Global_Sales': 20.32}
{'_id': ObjectId('5d13adcef0bd0623b0487d10'), 'Rank': 21, 'Name': 'Grand Theft Auto V', 'Platform': 'PS4', 'Year': 2014.0, 'href_Genre': 'http://www.vgchartz.com/game/83196/grand-theft-auto-v/?region=All', 'Publisher': 'Rockstar Games  ', 'Developer': 'Rockstar North  ', 'Critic_Score': 9.7, 'User_Score': nan, 'NA_Sales': 6.06, 'PAL_Sales': 9.71, 'JP_Sales': 0.6, 'Other_Sales': 3.02, 'Global_Sales': 19.39}
{'_id': ObjectId('5d13adcef0bd0623b0487d1c'), 'Rank': 34, 'Name': 'Grand Theft Auto V', 'Platform': 'X360', 'Year': 2013.0, 'href_Genre': 'http://www.vgchartz.com/game/65883/grand

As expected, there is one data entry for each platform. We are interested in the performance of the game as a whole, therefore we will perform the following actions:
- Group the games by name
- Sum all the global sales
- Average the Critic Score
- Average the User Score
- Store all Platforms: as in the GTA V case, many games were released for different platforms, therefore it is important not to lose this information after the group by
- Store all the game developers: versions released for different platforms may have been developed by different software houses, therefore it is important not to lose this information after the group by


As its first step, the following pipeline creates two attributes (Critic_Score_norm and User_Score_norm) in which every NaN is replaced with null. In this way the missing scores are ignored by the average operation.

In [None]:
# Collapse the per-platform VGChartz rows into one record per game name.
vgchartz_pipeline = [
    # Stage 1: copy both scores into *_norm fields, replacing NaN with a real
    # null (Python None). $avg ignores null, so missing scores do not affect
    # the averages computed in the next stage.
    {"$addFields": {
        "Critic_Score_norm": {"$cond": {
            "if": {"$eq": ["$Critic_Score", np.nan]},
            "then": None,
            "else": "$Critic_Score"}},
        "User_Score_norm": {"$cond": {
            "if": {"$eq": ["$User_Score", np.nan]},
            "then": None,
            "else": "$User_Score"}},
    }},
    # Stage 2: one group per game name.
    {"$group": {
        "_id": "$Name",
        # Inside $group every output field must come from an accumulator,
        # hence $first even for the grouping key itself.
        "Name": {"$first": "$Name"},
        "Global_sales_all_platform": {"$sum": "$Global_Sales"},
        # NOTE(review): no $sort precedes this stage, so which platform's Year
        # is "first" depends on the order the documents arrive in - confirm
        # this is acceptable.
        "Year": {"$first": "$Year"},
        "Critic_score": {"$avg": "$Critic_Score_norm"},
        "User_score": {"$avg": "$User_Score_norm"},
        # Keep the per-platform values as lists so they survive the grouping.
        "Publishers": {"$push": {"$concat": ["$Publisher"]}},
        "Developers": {"$push": {"$concat": ["$Developer"]}},
        "Platforms": {"$push": {"$concat": ["$Platform"]}},
    }},
    # Stage 3: drop the grouping key (the name is already in "Name").
    {"$project": {"_id": 0}},
    # Stage 4: best-selling games first.
    {"$sort": {"Global_sales_all_platform": -1}},
]
top_vgchartz = list(top_games_vg.aggregate(vgchartz_pipeline))

In [None]:
# Tabular view of the aggregation result (a list of dicts, one per game).
vgchartz_df = pd.DataFrame(top_vgchartz)
vgchartz_df.head()

# Join Vgchartz and Twitch Data

Let's join the data and see the first 25 games

In [None]:
# A left join keeps every VGChartz row in its original order (sorted by global
# sales). An outer join is documented to sort the keys lexicographically, which
# would make "the first 25 rows" alphabetical instead of sales-ranked.
merged_df = pd.merge(
    vgchartz_df,
    top_twitch_df,
    left_on='Name',
    right_on='name',
    how='left',
)
# First 25 rows: a Twitch column left as NaN means no exact name match.
merged_df[['Name', 'twitch_id', 'norm_name', 'name']].head(25)

As you can see, there may be no match between the two dataframes. In order to find more matches we try to transform the names in both dataframes into the same format, by lowercasing the name, deleting some punctuation and converting Arabic numerals to Roman ones. This has already been done for the Twitch data, resulting in the field 'norm_name', so we just need to do this for the vgchartz df.

In [None]:
top_twitch_df.head(5)

We will use a specific custom function that will "normalize" the names.

In [None]:
# Normalize every VGChartz title with the project's uniformer helper.
vgchartz_df['Norm_name_vgchartz'] = vgchartz_df['Name'].apply(uniformer.uniform)

In [None]:
# Join on the normalized names. A left join preserves the VGChartz order
# (sorted by global sales); an outer join is documented to sort the keys
# lexicographically, so slicing its first rows would not reliably return the
# best-selling games.
merged_df = pd.merge(
    vgchartz_df,
    top_twitch_df,
    left_on='Norm_name_vgchartz',
    right_on='norm_name',
    how='left',
).loc[0:50]  # label slice on the fresh 0..n index is inclusive: 51 rows
merged_df[['Norm_name_vgchartz', 'norm_name', 'Name', 'name', 'twitch_id']]

Let's check which matches are still missing:

In [None]:
# Rows without a Twitch counterpart have no norm_name after the merge.
unmatched_rows = merged_df['norm_name'].isna()
merged_df.loc[unmatched_rows, ['Norm_name_vgchartz']]

- pokemon red / green / blue is named differently on Twitch

In [None]:
top_twitch_df[top_twitch_df['norm_name'].str.contains('pokémon red')]

Because I can't see them, I will try a broader query on the Twitch collection to retrieve the names

In [None]:
# Manual fix: rewrite the VGChartz normalized title to the Twitch spelling.
is_red_green_blue = vgchartz_df['Norm_name_vgchartz'].eq("pokémon red / green / blue version")
vgchartz_df.loc[is_red_green_blue, 'Norm_name_vgchartz'] = 'pokémon red/blue'

- call of duty modern warfare ii is named slightly differently on Twitch

In [None]:
top_twitch_df[top_twitch_df['norm_name'].str.contains('modern warfare')]

In [None]:
# Manual fix: rewrite the VGChartz normalized title to the Twitch spelling.
is_modern_warfare_ii = vgchartz_df['Norm_name_vgchartz'].eq("call of duty modern warfare ii")
vgchartz_df.loc[is_modern_warfare_ii, 'Norm_name_vgchartz'] = 'modern warfare ii'

- nintendogs is not on twitch

In [None]:
top_twitch_df[top_twitch_df['norm_name'].str.contains('nintend')]

- wii fit/ wii fit plus are not on twitch

In [None]:
top_twitch_df[top_twitch_df['norm_name'].str.contains('wii')]

- kinect adventures is not on twitch

In [None]:
top_twitch_df[top_twitch_df['norm_name'].str.contains('kinect')]

- brain age/ brain age 2 are not on twitch

In [None]:
top_twitch_df[top_twitch_df['norm_name'].str.contains('brain')]

Solving the remaining Pokemon discrepancies:

In [None]:
top_twitch_df[top_twitch_df['norm_name'].str.contains('ruby')]

In [None]:
# Manual fix: rewrite the VGChartz normalized title to the Twitch spelling.
is_ruby_sapphire = vgchartz_df['Norm_name_vgchartz'].eq("pokémon ruby / sapphire version")
vgchartz_df.loc[is_ruby_sapphire, 'Norm_name_vgchartz'] = 'pokémon ruby/sapphire'

In [None]:
top_twitch_df[top_twitch_df['norm_name'].str.contains('pearl')]

In [None]:
# Manual fix: rewrite the VGChartz normalized title to the Twitch spelling.
is_diamond_pearl = vgchartz_df['Norm_name_vgchartz'].eq("pokémon diamond / pearl version")
vgchartz_df.loc[is_diamond_pearl, 'Norm_name_vgchartz'] = 'pokémon diamond/pearl'

In [None]:
top_twitch_df[top_twitch_df['norm_name'].str.contains('pokémon gold')]

In [None]:
# Manual fix: rewrite the VGChartz normalized title to the Twitch spelling.
is_gold_silver = vgchartz_df['Norm_name_vgchartz'].eq("pokémon gold / silver version")
vgchartz_df.loc[is_gold_silver, 'Norm_name_vgchartz'] = 'pokémon gold/silver'

- Let's see if Fifa 13 is available on twitch

In [None]:
top_twitch_df[top_twitch_df['norm_name'].str.contains('fifa')]

In [None]:
# Manual fix: rewrite the VGChartz normalized title to the Twitch spelling.
is_fifa_xiii = vgchartz_df['Norm_name_vgchartz'].eq("fifa xiii")
vgchartz_df.loc[is_fifa_xiii, 'Norm_name_vgchartz'] = 'fifa soccer xiii'

Now the main discrepancies are solved, let's rejoin the datasets

## Merge data

In [None]:
# Re-join after the manual name fixes. A left join preserves the VGChartz order
# (sorted by global sales), which the positional slice relies on; an outer join
# is documented to sort the keys lexicographically. The Twitch-only rows an
# outer join would add were never part of the slice.
top_twitch_vgchartz = pd.merge(
    vgchartz_df,
    top_twitch_df,
    left_on='Norm_name_vgchartz',
    right_on='norm_name',
    how='left',
).loc[0:50]  # label slice on the fresh 0..n index is inclusive: 51 rows

In [None]:
top_twitch_vgchartz.head(25)

## Identifying the final list of 50 games

We have decided to monitor the following data:
- Top 25 games from vgchartz (excluding the ones that aren't on twitch)
- Top 25 games from twitch (excluding the ones that were previously selected)

### Select top 25 from vgchartz

In [None]:
# Keep the rows that found a Twitch counterpart, renumber them (the previous
# index is kept as an 'index' column) and take the first 25.
found_on_twitch = top_twitch_vgchartz['norm_name'].notnull()
top_25_vgchartz = top_twitch_vgchartz.loc[found_on_twitch].reset_index().head(25)

In [None]:
top_25_vgchartz

# Select top 25 games from Twitch

Extract the name from "top_25_vgchartz" and store them in the final list


In [None]:
top_50_games = set(top_25_vgchartz['name'])
len(top_50_games)

Keep adding games from the twitch DF, ordered by avgViewers, until 50 games are stored

In [None]:
top_twitch_df.head()

In [None]:
# Walk the Twitch games in DataFrame order and add their names until the set
# holds 50 entries. Iterating over the column ends cleanly if the DataFrame runs
# out first, instead of raising a KeyError from an out-of-range .loc lookup.
# Names already in the set are absorbed by the set itself.
for twitch_name in top_twitch_df['name']:
    if len(top_50_games) >= 50:
        break
    top_50_games.add(twitch_name)

In [None]:
top_50_games

In [None]:
len(top_50_games)

#  Same games selection, with norm_name

Extract the normalized names from "top_25_vgchartz" and store them in the final JSON file. The normalized names contain fewer punctuation characters, which makes collecting the data easier

In [None]:
top_50_games_n = set(top_25_vgchartz['norm_name'])
len(top_50_games_n)

Keep adding games from the twitch DF, ordered by avgViewers, until 50 games are stored

In [None]:
top_twitch_df.head()

In [None]:
# Walk the Twitch games in DataFrame order and add their normalized names until
# the set holds 50 entries. Iterating over the column ends cleanly if the
# DataFrame runs out first, instead of raising a KeyError from an out-of-range
# .loc lookup. Names already in the set are absorbed by the set itself.
for twitch_norm_name in top_twitch_df['norm_name']:
    if len(top_50_games_n) >= 50:
        break
    top_50_games_n.add(twitch_norm_name)

In [None]:
top_50_games_n

In [None]:
len(top_50_games_n)

## Let's store the list as a JSON file

In [None]:
import json

In [None]:
# Payload for the JSON file. Sets have no stable iteration order, so both
# selections are sorted to make the generated file identical from run to run.
# NOTE(review): the two lists are built from independent sets, so position i of
# "name" does not necessarily correspond to position i of "norm_name".
top50_json = {
    "selected_top_50_games": {
        "name": sorted(top_50_games),
        "norm_name": sorted(top_50_games_n),
    }
}

In [None]:
# Write the selection to a JSON file one directory above the notebook.
output_path = "../top_50_games_to_monitor.json"
with open(output_path, "w") as json_file:
    json.dump(top50_json, json_file)

# Adding Norm_name to vgsales dataset

In [None]:
vgchartz_odf = pd.read_csv('../dataset/vgsales.csv')

In [None]:
vgchartz_odf.head()

In [None]:
vgchartz_odf['Norm_name'] = vgchartz_odf['Name'].apply(uniformer.uniform)

In [None]:
vgchartz_odf.head()

In [None]:
# For every vgsales row whose Name also appears in vgchartz_df, overwrite
# Norm_name with the value stored in vgchartz_df['Norm_name_vgchartz'].
# Vectorised lookup instead of rescanning vgchartz_df once per row.
norm_name_by_title = (
    vgchartz_df
    .drop_duplicates(subset='Name')  # first occurrence wins, like .values[0]
    .set_index('Name')['Norm_name_vgchartz']
)
has_reference_name = vgchartz_odf['Name'].isin(norm_name_by_title.index)
vgchartz_odf.loc[has_reference_name, 'Norm_name'] = (
    vgchartz_odf.loc[has_reference_name, 'Name'].map(norm_name_by_title)
)

In [None]:
vgchartz_odf

In [None]:
vgchartz_odf[vgchartz_odf['Norm_name'].str.contains('pokémon red')]

In [None]:
# Save the enriched dataset next to the original CSV. No index argument is
# passed, so pandas' default applies and the row index is written as an extra,
# unnamed first column.
vgchartz_odf.to_csv('../dataset/vgsales_n.csv')