In [1]:
from bson.objectid import ObjectId
from bson.son import SON
from datetime import datetime
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from pymongo import MongoClient
import re
from sshtunnel import SSHTunnelForwarder

# Connect to MongoDB

In [2]:
with open('../keys.json') as k:
    keys = json.loads(k.read())
    MONGO_HOST = keys["SSH"]["MONGO_HOST"] # i.e. 10.9.13.14
    MONGO_DB = keys["SSH"]["MONGO_DB"] # i.e. dm_project 
    MONGO_USER = keys["SSH"]["MONGO_USER"] # i.e. studente
    MONGO_PASS = keys["SSH"]["MONGO_PASS"] # i.e. la password della vm
    

In [3]:
server = SSHTunnelForwarder(
    MONGO_HOST,
    ssh_username=MONGO_USER,
    ssh_password=MONGO_PASS,
    remote_bind_address=('127.0.0.1', 27017)
)

In [4]:
server.start() # remember to stop

client = MongoClient('127.0.0.1', server.local_bind_port) # server.local_bind_port is assigned local port
db = client[MONGO_DB]
db.list_collection_names()

['system.indexes', 'twitch', 'twitter', 'vgchartz']

#### Count of documents in each collection

Twitch:

In [5]:
db.twitch.count()

7926

Twitter:

In [6]:
db.twitter.count()

80475

#### Sizes of collections

Twitch:

In [7]:
print(db.command("collstats", 'twitch')['size']  / 1024 / 1024, 'MB')

7158.011535644531 MB


Twitter:

In [8]:
print(db.command("collstats", 'twitter')['size'] / 1024 / 1024, 'MB')

36.58821105957031 MB


# Queries on Twitch Collection

We can load the list of games we are monitoring, in case we want to restrict our queries to these games:

In [9]:
with open("../top_50_games_to_monitor.json", "r") as f:
    game_list = json.load(f)
    top_games = game_list["selected_top_50_games"]["name"]

top_games[:5]

['Fortnite',
 'Call of Duty: Black Ops II',
 'Grand Theft Auto: San Andreas',
 'Clam Man',
 'Wii Play']

### Average daily views, channels and popularity for each game in the top_games list

The following query returns the average daily viewers, channels and popularity for each game in the list of monitored games

In [10]:
# MongoDB version >= 4
# pipeline = [{"$unwind": "$data"},
#             {"$match" : { "data.game_name" : { "$in" : top_games },  }},
#             { "$group": {
#                 "_id": {"name": "$data.game_name",
#                         "day": { "$dayOfMonth": {"$toDate" : "$timestamp" }},
#                         "month": { "$month": {"$toDate" : "$timestamp" }},
#                         "year": { "$year": {"$toDate" : "$timestamp" }} },
#                 "avgViewers": { "$avg": '$data.viewers'},
#                            "avgChannels": { "$avg": '$data.channels'},
#                            "avgPopularity": { "$avg": '$data.game_popularity'}
#             }},
#             {"$sort": {"_id": 1} }]

In [11]:
twitch_daily_avg_pipe = [{"$unwind": "$data"},
                        {"$match" : { "data.game_name" : { "$in" : top_games },  }},
                        { "$group": {
                            "_id": {"name": "$data.game_name",
                                    "norm_name": "$data.game_norm_name",
                                    "year" :  { "$substr" : ["$timestamp", 0, 4 ] }, 
                                    "month" : { "$substr" : ["$timestamp", 5, 2 ] },     
                                    "day" :   { "$substr" : ["$timestamp", 8, 2 ] } },
                            "avgViewers": { "$avg": '$data.viewers'},
                            "avgChannels": { "$avg": '$data.channels'},
                            "avgPopularity": { "$avg": '$data.game_popularity'}
                            }},
                        {"$sort": {"_id": 1} }]

In [12]:
twitch_daily_avg = list(db.twitch.aggregate(twitch_daily_avg_pipe))

In [13]:
twitch_daily_avg[:5]

[{'_id': {'name': 'Apex Legends',
   'norm_name': 'apex legends',
   'year': '2019',
   'month': '06',
   'day': '13'},
  'avgViewers': 17491.647058823528,
  'avgChannels': 1434.0761245674742,
  'avgPopularity': 17115.401384083045},
 {'_id': {'name': 'Apex Legends',
   'norm_name': 'apex legends',
   'year': '2019',
   'month': '06',
   'day': '14'},
  'avgViewers': 15606.502083333333,
  'avgChannels': 1634.5229166666666,
  'avgPopularity': 15777.739583333334},
 {'_id': {'name': 'Apex Legends',
   'norm_name': 'apex legends',
   'year': '2019',
   'month': '06',
   'day': '15'},
  'avgViewers': 15781.329166666666,
  'avgChannels': 1643.1083333333333,
  'avgPopularity': 15442.404166666667},
 {'_id': {'name': 'Apex Legends',
   'norm_name': 'apex legends',
   'year': '2019',
   'month': '06',
   'day': '16'},
  'avgViewers': 18600.82463465553,
  'avgChannels': 1656.6492693110647,
  'avgPopularity': 18946.350730688937},
 {'_id': {'name': 'Apex Legends',
   'norm_name': 'apex legends',
   

The following are the games initially chosen as top games, but that have not been collected again on twitch.

In [14]:
set(top_games) - set([elem['_id']['name'] for elem in twitch_daily_avg])

{'Clam Man'}

It is easier to visualize the result of the query as a dataframe:

In [15]:
twitch_stats_df = json_normalize(twitch_daily_avg)

In [16]:
twitch_stats_df = twitch_stats_df.rename(index=str,
                                         columns={"_id.name": "name", "_id.norm_name": "norm_name",
                                                  "_id.day": "day", "_id.month": "month","_id.year": "year"})
twitch_stats_df = twitch_stats_df[['name', "norm_name", 'day', 'month', 'year', 'avgViewers', 'avgPopularity', 'avgChannels']]

In [17]:
twitch_stats_df.head()

Unnamed: 0,name,norm_name,day,month,year,avgViewers,avgPopularity,avgChannels
0,Apex Legends,apex legends,13,6,2019,17491.647059,17115.401384,1434.076125
1,Apex Legends,apex legends,14,6,2019,15606.502083,15777.739583,1634.522917
2,Apex Legends,apex legends,15,6,2019,15781.329167,15442.404167,1643.108333
3,Apex Legends,apex legends,16,6,2019,18600.824635,18946.350731,1656.649269
4,Apex Legends,apex legends,17,6,2019,19137.74375,18741.535417,1555.7125


### Average daily views, channels and popularity for every game streamed on twitch

We can also query all games, not just those in the list:

In [18]:
all_twitch_daily_avg_pipe = [{"$unwind": "$data"},
                        { "$group": {
                            "_id": {"name": "$data.game_name",
                                    "year" :  { "$substr" : ["$timestamp", 0, 4 ] }, 
                                    "month" : { "$substr" : ["$timestamp", 5, 2 ] },     
                                    "day" :   { "$substr" : ["$timestamp", 8, 2 ] } },
                            "avgViewers": { "$avg": '$data.viewers'},
                            "avgChannels": { "$avg": '$data.channels'},
                            "avgPopularity": { "$avg": '$data.game_popularity'}
                            }},
                        {"$sort": {"_id": 1} }]

In [19]:
all_twitch_daily_avg = list(db.twitch.aggregate(all_twitch_daily_avg_pipe))

In [20]:
all_twitch_daily_avg[:5]

[{'_id': {'name': ' 8 To Glory - The Official Game of the PBR',
   'year': '2019',
   'month': '06',
   'day': '14'},
  'avgViewers': 1.0,
  'avgChannels': 1.0,
  'avgPopularity': 0.0},
 {'_id': {'name': ' 8 To Glory - The Official Game of the PBR',
   'year': '2019',
   'month': '06',
   'day': '22'},
  'avgViewers': 5.0,
  'avgChannels': 1.0,
  'avgPopularity': 0.0},
 {'_id': {'name': ' F1 2019', 'year': '2019', 'month': '06', 'day': '20'},
  'avgViewers': 22.821428571428573,
  'avgChannels': 1.0,
  'avgPopularity': 20.0},
 {'_id': {'name': ' F1 2019', 'year': '2019', 'month': '06', 'day': '21'},
  'avgViewers': 481.517571884984,
  'avgChannels': 4.696485623003195,
  'avgPopularity': 482.17891373801916},
 {'_id': {'name': ' F1 2019', 'year': '2019', 'month': '06', 'day': '22'},
  'avgViewers': 207.31914893617022,
  'avgChannels': 2.5379939209726445,
  'avgPopularity': 201.72036474164133}]

In [21]:
all_twitch_daily_df = json_normalize(all_twitch_daily_avg)

In [22]:
all_twitch_daily_df = all_twitch_daily_df.rename(index=str,
                                         columns={"_id.name": "name", "_id.day": "day",
                                                  "_id.month": "month","_id.year": "year"})
all_twitch_daily_df = all_twitch_daily_df[['name', 'day', 'month', 'year', 'avgViewers', 'avgPopularity', 'avgChannels']]

In [23]:
all_twitch_daily_df.head()

Unnamed: 0,name,day,month,year,avgViewers,avgPopularity,avgChannels
0,8 To Glory - The Official Game of the PBR,14,6,2019,1.0,0.0,1.0
1,8 To Glory - The Official Game of the PBR,22,6,2019,5.0,0.0,1.0
2,F1 2019,20,6,2019,22.821429,20.0,1.0
3,F1 2019,21,6,2019,481.517572,482.178914,4.696486
4,F1 2019,22,6,2019,207.319149,201.720365,2.537994


We add the column 'daily_top', which is the game's rank in the ordered list of daily most spectated games on average (with respect to every game streamed on Twitch).

In [24]:
all_twitch_daily_df['daily_top'] = all_twitch_daily_df.sort_values(['day', 'month', 'year', 'avgViewers'], 
                                                           ascending=[True, True, True, False]).groupby(['day', 'month', 'year']).cumcount()
all_twitch_daily_df['daily_top'] = all_twitch_daily_df['daily_top'] + 1

In [25]:
all_twitch_daily_df.sort_values(['day', 'month', 'year', 'avgViewers'], ascending=[True, True, True, False]).head()

Unnamed: 0,name,day,month,year,avgViewers,avgPopularity,avgChannels,daily_top
42599,League of Legends,13,6,2019,139492.283737,140292.190311,2966.896194,1
29062,Fortnite,13,6,2019,117869.685121,116048.532872,7186.041522,2
32505,Grand Theft Auto V,13,6,2019,97704.619377,95741.33564,1057.024221,3
39364,Just Chatting,13,6,2019,97585.868512,97702.740484,1351.307958,4
21176,Dota 2,13,6,2019,69030.17301,69218.449827,756.50519,5


In [26]:
all_twitch_daily_df[all_twitch_daily_df['name'] == 'Tetris']

Unnamed: 0,name,day,month,year,avgViewers,avgPopularity,avgChannels,daily_top
78899,Tetris,13,6,2019,49.73913,48.155797,6.847826,691
78900,Tetris,14,6,2019,35.179104,35.605544,7.695096,877
78901,Tetris,15,6,2019,31.95207,32.525054,6.893246,972
78902,Tetris,16,6,2019,51.653595,47.714597,6.472767,757
78903,Tetris,17,6,2019,59.634409,65.215054,5.6,656
78904,Tetris,18,6,2019,54.09628,57.597374,6.601751,712
78905,Tetris,19,6,2019,66.323913,63.771739,5.093478,615
78906,Tetris,20,6,2019,29.947712,29.187364,6.485839,946
78907,Tetris,21,6,2019,53.063596,53.901316,6.050439,682
78908,Tetris,22,6,2019,44.438053,40.663717,5.473451,808


### Daily statistics for a single game

In [27]:
mario_daily_pipe = [{"$unwind": "$data"},
                    {"$match" : { "data.game_name" : "Super Mario Bros."  }},
                    { "$group": {
                        "_id": {"name": "$data.game_name",
                                "year" :  { "$substr" : ["$timestamp", 0, 4 ] }, 
                                "month" : { "$substr" : ["$timestamp", 5, 2 ] },     
                                "day" :   { "$substr" : ["$timestamp", 8, 2 ] } },
                        "avgViewers": { "$avg": '$data.viewers'},
                        "avgChannels": { "$avg": '$data.channels'},
                        "avgPopularity": { "$avg": '$data.game_popularity'}
                        }},
                    {"$sort": {"_id": 1} }]

In [28]:
mario_daily = list(db.twitch.aggregate(mario_daily_pipe))

In [29]:
mario_daily[:3]

[{'_id': {'name': 'Super Mario Bros.',
   'year': '2019',
   'month': '06',
   'day': '13'},
  'avgViewers': 51.2027027027027,
  'avgChannels': 3.4414414414414414,
  'avgPopularity': 44.531531531531535},
 {'_id': {'name': 'Super Mario Bros.',
   'year': '2019',
   'month': '06',
   'day': '14'},
  'avgViewers': 13.919014084507042,
  'avgChannels': 2.9119718309859155,
  'avgPopularity': 12.535211267605634},
 {'_id': {'name': 'Super Mario Bros.',
   'year': '2019',
   'month': '06',
   'day': '15'},
  'avgViewers': 74.43236714975845,
  'avgChannels': 3.7028985507246377,
  'avgPopularity': 72.28985507246377}]

### Every document pertaining a single game

In [30]:
mario_all = list(db.twitch.find({'data.game_name': 'Super Mario Bros.'},
               {'timestamp':1, 'data.$': 1}))

In [31]:
mario_all[0:3]

[{'_id': ObjectId('5d0f44df7c6c727793bcd374'),
  'data': [{'game_giantbomb_id': 15544,
    'channels': 2,
    'game_popularity': 3,
    'game_logo_large': 'https://static-cdn.jtvnw.net/ttv-logoart/Super%20Mario%20Bros.-240x144.jpg',
    'game__id': 509508,
    'game_box_large': 'https://static-cdn.jtvnw.net/ttv-boxart/Super%20Mario%20Bros.-272x380.jpg',
    'game_norm_name': 'super mario bros.',
    'game_name': 'Super Mario Bros.',
    'viewers': 2}],
  'timestamp': '2019-06-23 09:22:34.124005'},
 {'_id': ObjectId('5d0f45937c6c727793bcd375'),
  'data': [{'game_giantbomb_id': 15544,
    'channels': 2,
    'game_popularity': 2,
    'game_logo_large': 'https://static-cdn.jtvnw.net/ttv-logoart/Super%20Mario%20Bros.-240x144.jpg',
    'game__id': 509508,
    'game_box_large': 'https://static-cdn.jtvnw.net/ttv-boxart/Super%20Mario%20Bros.-272x380.jpg',
    'game_norm_name': 'super mario bros.',
    'game_name': 'Super Mario Bros.',
    'viewers': 3}],
  'timestamp': '2019-06-23 09:25:34.0781

### Query on a single game and day

We can compile a regex in order to find all documents regarding, for instance, 20/06/2019.

In [32]:
day = re.compile("^2019-06-20 \d+:\d+:\d+.\d+$")

In [33]:
mario_0620 = list(db.twitch.find({'data.game_name': 'Super Mario Bros.',
                     'timestamp':  {"$regex" : day  }},
                    {'timestamp':1, 'data.$': 1}))

In [34]:
mario_0620[:3]

[{'_id': ObjectId('5d0f4d10f0bd064ad83b1f8d'),
  'timestamp': '2019-06-20 00:02:34.009747',
  'data': [{'channels': 18,
    'game__id': 509508,
    'game_box_large': 'https://static-cdn.jtvnw.net/ttv-boxart/Super%20Mario%20Bros.-272x380.jpg',
    'game_giantbomb_id': 15544,
    'game_logo_large': 'https://static-cdn.jtvnw.net/ttv-logoart/Super%20Mario%20Bros.-240x144.jpg',
    'game_name': 'Super Mario Bros.',
    'game_norm_name': 'super mario bros.',
    'game_popularity': 416,
    'viewers': 366}]},
 {'_id': ObjectId('5d0f4d11f0bd064ad83b1f8e'),
  'timestamp': '2019-06-20 00:05:34.004106',
  'data': [{'channels': 19,
    'game__id': 509508,
    'game_box_large': 'https://static-cdn.jtvnw.net/ttv-boxart/Super%20Mario%20Bros.-272x380.jpg',
    'game_giantbomb_id': 15544,
    'game_logo_large': 'https://static-cdn.jtvnw.net/ttv-logoart/Super%20Mario%20Bros.-240x144.jpg',
    'game_name': 'Super Mario Bros.',
    'game_norm_name': 'super mario bros.',
    'game_popularity': 416,
    'vi

### Statistics for a single day

In [35]:
top_twitch_0620_pipe = [{"$unwind": "$data"},
                        {"$match" : {"$and":
                                     [{ "data.game_name" : { "$in" : top_games }  },
                                      {'timestamp':  {"$regex" : day}}] }},
                        { "$group": {"_id":
                                     {"name": "$data.game_name"},
                                     "avgViewers": { "$avg": '$data.viewers'},
                                     "avgChannels": { "$avg": '$data.channels'},
                                     "avgPopularity": { "$avg": '$data.game_popularity'}
                                     }},
                        {"$sort": {"_id": 1} }]

In [36]:
top_twitch_0620 = list(db.twitch.aggregate(top_twitch_0620_pipe))

In [37]:
top_twitch_0620[:5]

[{'_id': {'name': 'Apex Legends'},
  'avgViewers': 15973.658333333333,
  'avgChannels': 1600.80625,
  'avgPopularity': 16233.808333333332},
 {'_id': {'name': 'Auto Chess'},
  'avgViewers': 3473.3145833333333,
  'avgChannels': 102.01666666666667,
  'avgPopularity': 3548.9583333333335},
 {'_id': {'name': 'Call of Duty: Advanced Warfare'},
  'avgViewers': 8.268436578171091,
  'avgChannels': 3.017699115044248,
  'avgPopularity': 7.899705014749262},
 {'_id': {'name': 'Call of Duty: Black Ops'},
  'avgViewers': 81.78646934460888,
  'avgChannels': 12.930232558139535,
  'avgPopularity': 83.1416490486258},
 {'_id': {'name': 'Call of Duty: Black Ops 4'},
  'avgViewers': 10145.0125,
  'avgChannels': 1122.56875,
  'avgPopularity': 10217.6875}]

### Game with highest average viewers for each day

In [38]:
highest_avgView_per_day_pipe = [
    {"$unwind": "$data"},
    { "$group": {
                "_id": {"name": "$data.game_name",
                        "day" :   { "$substr" : ["$timestamp", 8, 2 ] },
                        "month" : { "$substr" : ["$timestamp", 5, 2 ] },         
                        "year" :  { "$substr" : ["$timestamp", 0, 4 ] }, },                        
                "avgViewers": { "$avg": '$data.viewers'}
            }},
    {"$sort": {"avgViewers": -1}},
    {"$group": {"_id": {
        "day": "$_id.day",
        "month": "$_id.month",
        "year": "$_id.year" },    
                "max_views": {"$first": "$avgViewers"},
                "game": {"$first": "$_id.name"}}},
    {"$project": {
            "_id": 0,
            "day": "$_id.day",
            "month": "$_id.month",
            "year": "$_id.year",
            "max_views": 1,
            "game": 1} },
    {"$sort": {"day": 1}}]

In [39]:
highest_avgView_per_day = list(db.twitch.aggregate(highest_avgView_per_day_pipe))

In [40]:
highest_avgView_per_day[:5]

[{'max_views': 139492.28373702423,
  'game': 'League of Legends',
  'day': '13',
  'month': '06',
  'year': '2019'},
 {'max_views': 138835.44166666668,
  'game': 'League of Legends',
  'day': '14',
  'month': '06',
  'year': '2019'},
 {'max_views': 179458.00625,
  'game': 'League of Legends',
  'day': '15',
  'month': '06',
  'year': '2019'},
 {'max_views': 165168.76617954072,
  'game': 'League of Legends',
  'day': '16',
  'month': '06',
  'year': '2019'},
 {'max_views': 131107.91666666666,
  'game': 'Fortnite',
  'day': '17',
  'month': '06',
  'year': '2019'}]

We could write a very similar query to find, for each day, the game with the highest average popularity or number of channels.

### Hourly statistics for every game streamed on twitch

In [41]:
all_twitch_hourly_avg_pipe = [{"$unwind": "$data"},
                              { "$group": {"_id":
                                           {"name": "$data.game_name",
                                            "year" :  { "$substr" : ["$timestamp", 0, 4 ] }, 
                                            "month" : { "$substr" : ["$timestamp", 5, 2 ] },     
                                            "day" :   { "$substr" : ["$timestamp", 8, 2 ] },
                                            "hour": { "$substr" : ["$timestamp", 11, 2 ] } },
                                           "avgViewers": { "$avg": '$data.viewers'}
                                           }},
                              {"$sort": {"_id": 1} }]

In [42]:
all_twitch_hourly_avg = list(db.twitch.aggregate(all_twitch_hourly_avg_pipe, allowDiskUse = True))

It is easier to visualize the result as a dataframe:

In [43]:
all_twitch_hourly_avg_df = json_normalize(all_twitch_hourly_avg)

In [44]:
all_twitch_hourly_avg_df = all_twitch_hourly_avg_df.rename(index=str, columns={"_id.name": "name", "_id.hour": "hour",
                                   "_id.day": "day","_id.month": "month","_id.year": "year"})
all_twitch_hourly_avg_df = all_twitch_hourly_avg_df[['name','day', 'month', 'year', 'hour', 'avgViewers']]

In [45]:
all_twitch_hourly_avg_df['hourly_top'] = all_twitch_hourly_avg_df.sort_values(['day', 'month', 'year', 'hour', 'avgViewers'],
                               ascending=[True, True, True, True, False]).groupby(['day', 'month', 'year', 'hour']).cumcount()
all_twitch_hourly_avg_df['hourly_top'] = all_twitch_hourly_avg_df['hourly_top'] + 1

In [46]:
all_twitch_hourly_avg_df.head()

Unnamed: 0,name,day,month,year,hour,avgViewers,hourly_top
0,8 To Glory - The Official Game of the PBR,14,6,2019,18,1.0,2206
1,8 To Glory - The Official Game of the PBR,22,6,2019,5,5.0,1250
2,F1 2019,20,6,2019,17,10.0,1034
3,F1 2019,20,6,2019,18,24.842105,726
4,F1 2019,20,6,2019,19,29.25,727


In [47]:
all_twitch_hourly_avg_df[all_twitch_hourly_avg_df['name'] == 'Fortnite'].head()

Unnamed: 0,name,day,month,year,hour,avgViewers,hourly_top
250626,Fortnite,13,6,2019,9,62169.7,3
250627,Fortnite,13,6,2019,10,59070.947368,4
250628,Fortnite,13,6,2019,11,62658.25,4
250629,Fortnite,13,6,2019,12,71340.75,3
250630,Fortnite,13,6,2019,13,73720.1,4


In [48]:
all_twitch_hourly_avg_df[all_twitch_hourly_avg_df['name'] == 'Tetris'].head()

Unnamed: 0,name,day,month,year,hour,avgViewers,hourly_top
657065,Tetris,13,6,2019,9,14.333333,533
657066,Tetris,13,6,2019,10,13.578947,570
657067,Tetris,13,6,2019,11,4.111111,974
657068,Tetris,13,6,2019,12,11.3,711
657069,Tetris,13,6,2019,13,23.052632,589


# Queries on Twitter

In [19]:
with open("../top_50_games_to_monitor.json", "r") as f:
    game_list = json.load(f)
    tweet_top_games = game_list["selected_top_50_games"]["norm_name"]

tweet_top_games[:5]

['death stranding',
 'new super mario bros. wii',
 'pokémon red/blue',
 'mario kart ds',
 'grand theft auto iv']

### Daily total retweets, likes, and tweet count for each game in the top_games list

In [50]:
tweet_daily_pipe = [{"$match" : { "query" : { "$in" : tweet_top_games},  }}, 
            { "$group": {
                "_id": {"name": "$query",
                        "day": { "$dayOfMonth": "$date" },
                        "month": { "$month": "$date" },
                        "year": { "$year": "$date" }},
                "tot_retweets": { "$sum": "$retweets" },
                "tot_likes": { "$sum" : "$likes"},
                "tweet_count" : { "$sum": 1 }
            }},
            {"$sort": {"_id": 1} }]

In [51]:
tweet_daily = list(db.twitter.aggregate(tweet_daily_pipe))

In [52]:
tweet_daily[0:4]

[{'_id': {'name': 'apex legends', 'day': 13, 'month': 6, 'year': 2019},
  'tot_retweets': 54,
  'tot_likes': 349,
  'tweet_count': 96},
 {'_id': {'name': 'apex legends', 'day': 14, 'month': 6, 'year': 2019},
  'tot_retweets': 140,
  'tot_likes': 1022,
  'tweet_count': 200},
 {'_id': {'name': 'apex legends', 'day': 15, 'month': 6, 'year': 2019},
  'tot_retweets': 160,
  'tot_likes': 1154,
  'tweet_count': 200},
 {'_id': {'name': 'apex legends', 'day': 16, 'month': 6, 'year': 2019},
  'tot_retweets': 173,
  'tot_likes': 1491,
  'tweet_count': 169}]

In [53]:
tweet_daily_df = json_normalize(tweet_daily)
tweet_daily_df = tweet_daily_df.rename(index=str, columns={"_id.name": "name", "_id.day": "day","_id.month": "month","_id.year": "year"})
tweet_daily_df = tweet_daily_df[['name', 'day', 'month', 'year', 'tot_retweets', 'tot_likes', 'tweet_count']]
tweet_daily_df.head()

Unnamed: 0,name,day,month,year,tot_retweets,tot_likes,tweet_count
0,apex legends,13,6,2019,54,349,96
1,apex legends,14,6,2019,140,1022,200
2,apex legends,15,6,2019,160,1154,200
3,apex legends,16,6,2019,173,1491,169
4,apex legends,17,6,2019,112,692,16


In [54]:
tweet_daily_df[tweet_daily_df['name'] == 'tetris']

Unnamed: 0,name,day,month,year,tot_retweets,tot_likes,tweet_count
495,tetris,13,6,2019,28,282,200
496,tetris,14,6,2019,726,4310,76
497,tetris,15,6,2019,60,597,200
498,tetris,16,6,2019,143,903,200
499,tetris,17,6,2019,175,830,200
500,tetris,18,6,2019,360,1830,110
501,tetris,19,6,2019,317,1547,200
502,tetris,20,6,2019,764,11352,200
503,tetris,21,6,2019,188,1452,200
504,tetris,22,6,2019,38,305,29


### Game with the highest retweet count for each day:

In [55]:
pipeline_highest_retweet_per_day = [{
    "$group": {"_id": {"name": "$query",
                       "day": {"$dayOfMonth": "$date"},
                       "month": {"$month": "$date"},
                       "year": {"$year": "$date"} },
               "tot_retweets": { "$sum": "$retweets"}} },
    {"$sort": {"tot_retweets": -1} },
    {"$group": {"_id": {"day": "$_id.day",
                        "month": "$_id.month",
                        "year": "$_id.year"},
                "max_retweets": {"$first": "$tot_retweets"},
                "game": {"$first": "$_id.name"} } },
    {"$project": {"_id": 0,
                  "day": "$_id.day",
                  "month": "$_id.month",
                  "year": "$_id.year",
                  "max_retweets": 1,
                  "game": 1 } },
    {"$sort": { "day": 1}}]

In [56]:
highest_retweet_per_day = list(db.twitter.aggregate(pipeline_highest_retweet_per_day))
highest_retweet_per_day[:5]

[{'max_retweets': 419,
  'game': 'fortnite',
  'day': 13,
  'month': 6,
  'year': 2019},
 {'max_retweets': 1878,
  'game': 'fortnite',
  'day': 14,
  'month': 6,
  'year': 2019},
 {'max_retweets': 9617,
  'game': 'pokémon gold/silver',
  'day': 15,
  'month': 6,
  'year': 2019},
 {'max_retweets': 3365, 'game': 'slots', 'day': 16, 'month': 6, 'year': 2019},
 {'max_retweets': 6082,
  'game': 'overwatch',
  'day': 17,
  'month': 6,
  'year': 2019}]

### Game with the highest likes for each day:

In [57]:
pipeline_highest_likes_per_day = [{
    "$group": {"_id": {"name": "$query",
                       "day": {"$dayOfMonth": "$date"},
                       "month": {"$month": "$date"},
                       "year": {"$year": "$date"} },
               "tot_likes": { "$sum": "$likes"}} },
    {"$sort": {"tot_likes": -1} },
    {"$group": {"_id": {"day": "$_id.day",
                        "month": "$_id.month",
                        "year": "$_id.year"},
                "max_likes": {"$first": "$tot_likes"},
                "game": {"$first": "$_id.name"} } },
    {"$project": {"_id": 0,
                  "day": "$_id.day",
                  "month": "$_id.month",
                  "year": "$_id.year",
                  "max_likes": 1,
                  "game": 1 } },
    {"$sort": { "day": 1}}]

In [58]:
highest_likes_per_day = list(db.twitter.aggregate(pipeline_highest_likes_per_day))
highest_likes_per_day

[{'max_likes': 7344, 'game': 'fortnite', 'day': 13, 'month': 6, 'year': 2019},
 {'max_likes': 34285, 'game': 'fortnite', 'day': 14, 'month': 6, 'year': 2019},
 {'max_likes': 48678,
  'game': 'pokémon gold/silver',
  'day': 15,
  'month': 6,
  'year': 2019},
 {'max_likes': 33085, 'game': 'fortnite', 'day': 16, 'month': 6, 'year': 2019},
 {'max_likes': 35726, 'game': 'fortnite', 'day': 17, 'month': 6, 'year': 2019},
 {'max_likes': 53061, 'game': 'fortnite', 'day': 18, 'month': 6, 'year': 2019},
 {'max_likes': 15202, 'game': 'fortnite', 'day': 19, 'month': 6, 'year': 2019},
 {'max_likes': 30358, 'game': 'fortnite', 'day': 20, 'month': 6, 'year': 2019},
 {'max_likes': 22664, 'game': 'fortnite', 'day': 21, 'month': 6, 'year': 2019},
 {'max_likes': 43675,
  'game': 'minecraft',
  'day': 22,
  'month': 6,
  'year': 2019},
 {'max_likes': 25375,
  'game': 'minecraft',
  'day': 23,
  'month': 6,
  'year': 2019},
 {'max_likes': 39009, 'game': 'fortnite', 'day': 24, 'month': 6, 'year': 2019},
 {'m

### Which is the day with the highest likes? And which is the one with the lowest values?

In [59]:
pipeline_highest_likes = [{
    "$group": {"_id": {"day": {"$dayOfMonth": "$date"},
                       "month": {"$month": "$date"},
                       "year": {"$year": "$date"} },
               "tot_likes": { "$sum": "$likes"}} },
    {"$sort": {"tot_likes": -1} },
    {"$group": {"_id": {"day": "$_id.day",
                        "month": "$_id.month",
                        "year": "$_id.year"},
                "max_likes": {"$first": "$tot_likes"}}},
    {"$project": {"_id": 0,
                  "day": "$_id.day",
                  "month": "$_id.month",
                  "year": "$_id.year",
                  "max_likes": 1} },
    {"$sort": { "max_likes": -1}}]

In [60]:
highest_likes = list(db.twitter.aggregate(pipeline_highest_likes))
print('Day with highest likes: \n\t', highest_likes[0],
      '\n\nDay with lowest likes: \n\t', highest_likes[-1])

Day with highest likes: 
	 {'max_likes': 129262, 'day': 15, 'month': 6, 'year': 2019} 

Day with lowest likes: 
	 {'max_likes': 15299, 'day': 26, 'month': 6, 'year': 2019}


# Total "buzz"

We are interested in using both Twitch and Twitter data for determing which game has the highest "buzz", that is for us the sum of the average views on Twitch and the total retweets and likes on Twitter.

In [20]:
pip = [{"$match" : { "query" : { "$in" : tweet_top_games},  }},
       {"$group": {"_id": {"name": "$query",
                          "day": {"$dayOfMonth": "$date"},
                          "month": {"$month": "$date"},
                          "year": {"$year": "$date"}},
        "total_retweets": {"$sum": "$retweets"},
        "total_likes": {"$sum": "$likes"}}},
       {"$project": {"_id": 0,
                     "norm_name": "$_id.name",
                     "day": "$_id.day",
                     "month": "$_id.month",
                     "year": "$_id.year",
                     "total_retweets": 1,
                     "total_likes": 1}},
       {"$sort": {"day": 1}}]

In [21]:
tweet_stats_query_results = list(db.twitter.aggregate(pip))
tweet_stats_df = pd.DataFrame.from_records(tweet_stats_query_results)
tweet_stats_df.head()

Unnamed: 0,day,month,norm_name,total_likes,total_retweets,year
0,13,6,pokémon gold/silver,16,1,2019
1,13,6,hearthstone,588,37,2019
2,13,6,grand theft auto san andreas,1,2,2019
3,13,6,fortnite,7344,419,2019
4,13,6,playerunknown's battlegrounds,20,4,2019


In [53]:
server.stop()

In [22]:
twitch_stats_df.head()

Unnamed: 0,name,norm_name,day,month,year,avgViewers,avgPopularity,avgChannels
0,Apex Legends,apex legends,13,6,2019,17491.647059,17115.401384,1434.076125
1,Apex Legends,apex legends,14,6,2019,15606.502083,15777.739583,1634.522917
2,Apex Legends,apex legends,15,6,2019,15781.329167,15442.404167,1643.108333
3,Apex Legends,apex legends,16,6,2019,18600.824635,18946.350731,1656.649269
4,Apex Legends,apex legends,17,6,2019,19137.74375,18741.535417,1555.7125


**Let's join *tweet_stats_df* and *twitch_stats_df***

In [23]:
twitch_stats_df.day = twitch_stats_df.day.astype(int)
twitch_stats_df.month = twitch_stats_df.month.astype(int)
twitch_stats_df.year = twitch_stats_df.year.astype(int)

In [24]:
combined_stats = pd.merge(tweet_stats_df, twitch_stats_df, on = ['norm_name', 'day', 'month', 'year'], how="outer")

# replace some twitch missing values with 0: if the value is missing, the game was not streamed
combined_stats[['avgViewers', 'avgPopularity', 'avgChannels']] = combined_stats[['avgViewers', 'avgPopularity', 'avgChannels']].fillna(0)

combined_stats.head()

Unnamed: 0,day,month,norm_name,total_likes,total_retweets,year,name,avgViewers,avgPopularity,avgChannels
0,13,6,pokémon gold/silver,16.0,1.0,2019,Pokémon Gold/Silver,8.733813,7.705036,1.942446
1,13,6,hearthstone,588.0,37.0,2019,Hearthstone,21380.67128,21350.15917,225.629758
2,13,6,grand theft auto san andreas,1.0,2.0,2019,Grand Theft Auto: San Andreas,455.910035,436.757785,28.186851
3,13,6,fortnite,7344.0,419.0,2019,Fortnite,117869.685121,116048.532872,7186.041522
4,13,6,playerunknown's battlegrounds,20.0,4.0,2019,PLAYERUNKNOWN'S BATTLEGROUNDS,40519.429066,40224.128028,1299.048443


### Computing total buzz

#### Buzz as sum of total likes, total retweets and average viewers

In [25]:
combined_stats['buzz'] = combined_stats.total_likes + combined_stats.total_retweets + combined_stats.avgViewers
combined_stats.head()

Unnamed: 0,day,month,norm_name,total_likes,total_retweets,year,name,avgViewers,avgPopularity,avgChannels,buzz
0,13,6,pokémon gold/silver,16.0,1.0,2019,Pokémon Gold/Silver,8.733813,7.705036,1.942446,25.733813
1,13,6,hearthstone,588.0,37.0,2019,Hearthstone,21380.67128,21350.15917,225.629758,22005.67128
2,13,6,grand theft auto san andreas,1.0,2.0,2019,Grand Theft Auto: San Andreas,455.910035,436.757785,28.186851,458.910035
3,13,6,fortnite,7344.0,419.0,2019,Fortnite,117869.685121,116048.532872,7186.041522,125632.685121
4,13,6,playerunknown's battlegrounds,20.0,4.0,2019,PLAYERUNKNOWN'S BATTLEGROUNDS,40519.429066,40224.128028,1299.048443,40543.429066


Data from twitch and twitter have very different scales:

In [26]:
combined_stats['total_retweets'].describe()

count     642.000000
mean      237.651090
std       851.032234
min         0.000000
25%         6.000000
50%        33.000000
75%       132.750000
max      9617.000000
Name: total_retweets, dtype: float64

In [27]:
combined_stats['total_likes'].describe()

count      642.000000
mean      1875.987539
std       5970.712134
min          0.000000
25%         46.000000
50%        264.000000
75%        978.000000
max      53061.000000
Name: total_likes, dtype: float64

In [28]:
combined_stats['avgViewers'].describe()

count       742.000000
mean      15835.136265
std       30231.740329
min           0.000000
25%          21.419245
50%         439.955017
75%       16479.355894
max      181664.022917
Name: avgViewers, dtype: float64

#### Buzz as weighted sum of standardized total retweets, likes, and average views

Just taking their sum might give the avgViewers data too much weight... we standardize the columns before computing the buzz.

In [29]:
rtw_mu = np.mean(combined_stats['total_retweets'])
rtw_sd = np.std(combined_stats['total_retweets'])
combined_stats['std_tot_rtw'] = (combined_stats['total_retweets'] - rtw_mu) / rtw_sd

like_mu = np.mean(combined_stats['total_likes'])
like_sd = np.std(combined_stats['total_likes'])

combined_stats['std_tot_likes'] = (combined_stats['total_likes'] - like_mu) / like_sd

vw_mu = np.mean(combined_stats['avgViewers'])
vw_sd = np.std(combined_stats['avgViewers'])

combined_stats['std_avgViewers'] = (combined_stats['avgViewers'] - vw_mu) / vw_sd

Now that the features are standardized, we combine them in order to create the buzz feature. Since data from twitch and twitter should have the same weight, we create the new feature as a weighted sum.

In [30]:
weight_rtw = 0.25
weight_likes = 0.25
weight_twitch = 0.5

In [31]:
combined_stats['std_buzz'] = (weight_likes*combined_stats.std_tot_likes
                              + weight_rtw*combined_stats.std_tot_rtw
                              + weight_twitch*combined_stats.std_avgViewers)

In [32]:
combined_stats[['name','day', 'month', 'year', 'std_avgViewers', 'std_tot_likes', 'std_tot_rtw', 'buzz', 'std_buzz']].head()

Unnamed: 0,name,day,month,year,std_avgViewers,std_tot_likes,std_tot_rtw,buzz,std_buzz
0,Pokémon Gold/Silver,13,6,2019,-0.523856,-0.311761,-0.278292,25.733813,-0.409441
1,Hearthstone,13,6,2019,0.183558,-0.215886,-0.235958,22005.67128,-0.021182
2,Grand Theft Auto: San Andreas,13,6,2019,-0.509054,-0.314276,-0.277116,458.910035,-0.402375
3,Fortnite,13,6,2019,3.377357,0.91652,0.213259,125632.685121,1.971123
4,PLAYERUNKNOWN'S BATTLEGROUNDS,13,6,2019,0.817053,-0.311091,-0.274764,40543.429066,0.262063


#### Buzz as weighted sum of normalized total retweets, likes, and average views

In [33]:
rtw_min = np.min(combined_stats['total_retweets'])
rtw_max = np.max(combined_stats['total_retweets'])

combined_stats['norm_tot_rtw'] = (combined_stats['total_retweets'] - rtw_min) / (rtw_max - rtw_min)

like_min = np.min(combined_stats['total_likes'])
like_max = np.max(combined_stats['total_likes'])

combined_stats['norm_tot_likes'] = (combined_stats['total_likes'] - like_min) / (like_max - like_min)

vw_min = np.min(combined_stats['avgViewers'])
vw_max = np.max(combined_stats['avgViewers'])

combined_stats['norm_avgViewers'] = (combined_stats['avgViewers'] - vw_min) / (vw_max - vw_min)

Now that the features are in the same scale, we combine them in order to create the buzz feature. Since data from twitch and twitter should have the same weight, we create the new feature as a weighted sum.

In [34]:
weight_rtw = 0.25
weight_likes = 0.25
weight_twitch = 0.5

In [35]:
combined_stats['norm_buzz'] = (weight_likes*combined_stats.norm_tot_likes
                               + weight_rtw*combined_stats.norm_tot_rtw
                               + weight_twitch*combined_stats.norm_avgViewers)

In [36]:
combined_stats[['name','day', 'month', 'year', 'std_avgViewers', 'std_tot_likes', 'std_tot_rtw', 'buzz', 'std_buzz', 'norm_buzz']].head()

Unnamed: 0,name,day,month,year,std_avgViewers,std_tot_likes,std_tot_rtw,buzz,std_buzz,norm_buzz
0,Pokémon Gold/Silver,13,6,2019,-0.523856,-0.311761,-0.278292,25.733813,-0.409441,0.000125
1,Hearthstone,13,6,2019,0.183558,-0.215886,-0.235958,22005.67128,-0.021182,0.062579
2,Grand Theft Auto: San Andreas,13,6,2019,-0.509054,-0.314276,-0.277116,458.910035,-0.402375,0.001312
3,Fortnite,13,6,2019,3.377357,0.91652,0.213259,125632.685121,1.971123,0.369911
4,PLAYERUNKNOWN'S BATTLEGROUNDS,13,6,2019,0.817053,-0.311091,-0.274764,40543.429066,0.262063,0.111721


## In which day a game obtained the highest "buzz"?

In [37]:
combined_stats.loc[combined_stats['buzz'].idxmax()]

day                      21
month                     6
norm_name          fortnite
total_likes           22664
total_retweets         2977
year                   2019
name               Fortnite
avgViewers           181664
avgPopularity        180342
avgChannels         7788.63
buzz                 207305
std_tot_rtw         3.22136
std_tot_likes       3.48438
std_avgViewers      5.48896
std_buzz            4.42091
norm_tot_rtw       0.309556
norm_tot_likes     0.427131
norm_avgViewers           1
norm_buzz          0.684172
Name: 391, dtype: object

In [38]:
combined_stats.loc[combined_stats['std_buzz'].idxmax()]

day                      18
month                     6
norm_name          fortnite
total_likes           53061
total_retweets         8851
year                   2019
name               Fortnite
avgViewers           105128
avgPopularity        104166
avgChannels         7266.33
buzz                 167040
std_tot_rtw          10.129
std_tot_likes       8.57937
std_avgViewers      2.95559
std_buzz            6.15488
norm_tot_rtw       0.920349
norm_tot_likes            1
norm_avgViewers    0.578693
norm_buzz          0.769434
Name: 232, dtype: object

In [39]:
combined_stats.loc[combined_stats['norm_buzz'].idxmax()]

day                      25
month                     6
norm_name          fortnite
total_likes           52761
total_retweets         6474
year                   2019
name               Fortnite
avgViewers           133444
avgPopularity        132994
avgChannels         8345.04
buzz                 192679
std_tot_rtw          7.3337
std_tot_likes       8.52908
std_avgViewers      3.89286
std_buzz            5.91213
norm_tot_rtw       0.673183
norm_tot_likes     0.994346
norm_avgViewers    0.734564
norm_buzz          0.784164
Name: 590, dtype: object

## In which day a game obtained the lowest "buzz"?

In [40]:
combined_stats.loc[combined_stats['buzz'].idxmin()]

day                           13
month                          6
norm_name          mario kart ds
total_likes                    0
total_retweets                 0
year                        2019
name                         NaN
avgViewers                     0
avgPopularity                  0
avgChannels                    0
buzz                           0
std_tot_rtw            -0.279468
std_tot_likes          -0.314443
std_avgViewers         -0.524145
std_buzz                -0.41055
norm_tot_rtw                   0
norm_tot_likes                 0
norm_avgViewers                0
norm_buzz                      0
Name: 36, dtype: object

In [41]:
combined_stats.loc[combined_stats['std_buzz'].idxmin()]

day                           13
month                          6
norm_name          mario kart ds
total_likes                    0
total_retweets                 0
year                        2019
name                         NaN
avgViewers                     0
avgPopularity                  0
avgChannels                    0
buzz                           0
std_tot_rtw            -0.279468
std_tot_likes          -0.314443
std_avgViewers         -0.524145
std_buzz                -0.41055
norm_tot_rtw                   0
norm_tot_likes                 0
norm_avgViewers                0
norm_buzz                      0
Name: 36, dtype: object

In [42]:
combined_stats.loc[combined_stats['norm_buzz'].idxmin()]

day                           13
month                          6
norm_name          mario kart ds
total_likes                    0
total_retweets                 0
year                        2019
name                         NaN
avgViewers                     0
avgPopularity                  0
avgChannels                    0
buzz                           0
std_tot_rtw            -0.279468
std_tot_likes          -0.314443
std_avgViewers         -0.524145
std_buzz                -0.41055
norm_tot_rtw                   0
norm_tot_likes                 0
norm_avgViewers                0
norm_buzz                      0
Name: 36, dtype: object

## Which is the game with the highest cumulative "buzz"?

In [43]:
cumulative_buzz = combined_stats.groupby("norm_name")['buzz'].sum().reset_index()
cumulative_buzz.loc[cumulative_buzz['buzz'].idxmax()]

norm_name       fortnite
buzz         2.15024e+06
Name: 17, dtype: object

In [44]:
std_cumulative_buzz = combined_stats.groupby("norm_name")['std_buzz'].sum().reset_index()
std_cumulative_buzz.loc[std_cumulative_buzz['std_buzz'].idxmax()]

norm_name    fortnite
std_buzz      50.7709
Name: 17, dtype: object

In [45]:
norm_cumulative_buzz = combined_stats.groupby("norm_name")['norm_buzz'].sum().reset_index()
norm_cumulative_buzz.loc[norm_cumulative_buzz['norm_buzz'].idxmax()]

norm_name    fortnite
norm_buzz     7.60925
Name: 17, dtype: object

## Which is the game with the lowest cumulative "buzz"?

In [46]:
cumulative_buzz.loc[cumulative_buzz['buzz'].idxmin()]

norm_name    fifa xix
buzz                0
Name: 16, dtype: object

In [47]:
std_cumulative_buzz.loc[std_cumulative_buzz['std_buzz'].idxmin()]

norm_name    new super mario bros. wii
std_buzz                      -5.71806
Name: 32, dtype: object

In [48]:
norm_cumulative_buzz.loc[norm_cumulative_buzz['norm_buzz'].idxmin()]

norm_name    fifa xix
norm_buzz           0
Name: 16, dtype: object

#### Games sorted by the three variants of buzz

In [49]:
d1 = cumulative_buzz.sort_values('buzz', ascending = False).reset_index(drop = True)

In [50]:
d2 = std_cumulative_buzz.sort_values('std_buzz', ascending = False).reset_index(drop = True)

In [51]:
d3 = norm_cumulative_buzz.sort_values('norm_buzz', ascending = False).reset_index(drop = True)

In [52]:
df = d1.merge(d2, left_index = True, right_index = True).merge(d3, left_index = True, right_index = True)
df = df.rename(columns = {'norm_name_x' : "buzz_order", "norm_name_y" : "std_buzz_order", "norm_name" : "norm_buzz_order" })
df = df[['buzz_order', 'std_buzz_order', 'norm_buzz_order']]
df

Unnamed: 0,buzz_order,std_buzz_order,norm_buzz_order
0,fortnite,fortnite,fortnite
1,league of legends,league of legends,league of legends
2,just chatting,just chatting,just chatting
3,grand theft auto v,grand theft auto v,grand theft auto v
4,dota ii,minecraft,overwatch
5,overwatch,overwatch,dota ii
6,counter-strike global offensive,dota ii,minecraft
7,minecraft,slots,counter-strike global offensive
8,world of warcraft,counter-strike global offensive,world of warcraft
9,playerunknown's battlegrounds,world of warcraft,slots
