In [1]:
from bson.son import SON
import json
from pymongo import MongoClient
from bson.objectid import ObjectId
import pandas as pd
from pandas.io.json import json_normalize
from datetime import datetime

## Connect to remote mongodb

In [29]:
from sshtunnel import SSHTunnelForwarder
import pymongo

In [30]:
with open('../keys.json') as k:
    keys = json.loads(k.read())
    MONGO_HOST = keys["SSH"]["MONGO_HOST"] # i.e. 10.9.13.14
    MONGO_DB = keys["SSH"]["MONGO_DB"] # i.e. dm_project 
    MONGO_USER = keys["SSH"]["MONGO_USER"] # i.e. studente
    MONGO_PASS = keys["SSH"]["MONGO_PASS"] # i.e. la password della vm

In [31]:
server = SSHTunnelForwarder(
    MONGO_HOST,
    ssh_username=MONGO_USER,
    ssh_password=MONGO_PASS,
    remote_bind_address=('127.0.0.1', 27017)
)

In [32]:
server.start() # remember to stop

In [33]:
client = pymongo.MongoClient('127.0.0.1', server.local_bind_port) # server.local_bind_port is assigned local port

In [34]:
db = client[MONGO_DB]

In [35]:
db.list_collection_names()

['system.indexes', 'twitch', 'twitter']

# Integration of previous locally dowloaded data. 
### Script on VM started on 23/06/2019

In [9]:
# with open('20190613_0932_data.json', "r", encoding = 'utf8') as json_file:
#     inserted_ids = []
#     for line in json_file:
#         data = json.loads(line)
#         post_id = db.twitch.insert_one(data).inserted_id
#         inserted_ids.append(post_id)

# Count of documents in each collection

## Twitch

In [10]:
db.twitch.count()

5857

## Twitter

In [11]:
db.twitter.count()

68900

# Querying Twitch

In [38]:
pipeline = [{"$project":
             {'_id': 0,
              'time':'$timestamp'} }]

In [39]:
tsmps = list(db.twitch.aggregate(pipeline))

In [40]:
len(tsmps)

6335

In [41]:
start = tsmps[3]['time'] # this is the oldest timestamp
end = tsmps[-1]['time'] # this is the most recent one
start = datetime.strptime(start, "%Y-%m-%d %H:%M:%S.%f")
end = datetime.strptime(end, "%Y-%m-%d %H:%M:%S.%f")

start, end

(datetime.datetime(2019, 6, 13, 9, 32, 34, 348514),
 datetime.datetime(2019, 6, 26, 14, 25, 34, 77411))

In [42]:
(end-start).total_seconds() / 180

6337.665160538889

The following is the list of games we are interested in, so that we can restrict the queries on them if we want.

In [17]:
"""top_games = ['Apex Legends', 'Auto Chess', 'Bio Inc. Redemption', 'Call of Duty: Advanced Warfare',
             'Call of Duty: Black Ops', 'Call of Duty: Black Ops 4',
             'Call of Duty: Black Ops II', 'Call of Duty: Black Ops III',
             'Call of Duty: Ghosts', 'Call of Duty: Modern Warfare 3',
             'Clam Man', 'Counter-Strike: Global Offensive',
             'Dead by Daylight', 'Death Stranding', 'Dota 2', 'Duck Hunt', 'FIFA 19', 'Fortnite',
             'Grand Theft Auto IV', 'Grand Theft Auto V', 'Grand Theft Auto: San Andreas', 'Hearthstone',
             'Just Chatting', 'Layers of Fear 2', 'League of Legends', 'Magic: The Gathering',
             'Mario Kart DS', 'Mario Kart Wii', 'Minecraft', 'Modern Warfare 2',
             'Music & Performing Arts', 'New Super Mario Bros.',
             'New Super Mario Bros. Wii', 'Overwatch',
             "PLAYERUNKNOWN'S BATTLEGROUNDS", 'Pokémon Gold/Silver',
             'Pokémon Red/Blue', 'Slots', 'Splitgate: Arena Warfare', 'Super Mario Bros.',
             'Super Mario World', 'Talk Shows & Podcasts', 'Tetris', 'The Elder Scrolls V: Skyrim',
             "Tom Clancy's Rainbow Six: Siege", 'Total War: Three Kingdoms',
             'Wii Play', 'Wii Sports', 'Wii Sports Resort', 'World of Warcraft']"""

'top_games = [\'Apex Legends\', \'Auto Chess\', \'Bio Inc. Redemption\', \'Call of Duty: Advanced Warfare\',\n             \'Call of Duty: Black Ops\', \'Call of Duty: Black Ops 4\',\n             \'Call of Duty: Black Ops II\', \'Call of Duty: Black Ops III\',\n             \'Call of Duty: Ghosts\', \'Call of Duty: Modern Warfare 3\',\n             \'Clam Man\', \'Counter-Strike: Global Offensive\',\n             \'Dead by Daylight\', \'Death Stranding\', \'Dota 2\', \'Duck Hunt\', \'FIFA 19\', \'Fortnite\',\n             \'Grand Theft Auto IV\', \'Grand Theft Auto V\', \'Grand Theft Auto: San Andreas\', \'Hearthstone\',\n             \'Just Chatting\', \'Layers of Fear 2\', \'League of Legends\', \'Magic: The Gathering\',\n             \'Mario Kart DS\', \'Mario Kart Wii\', \'Minecraft\', \'Modern Warfare 2\',\n             \'Music & Performing Arts\', \'New Super Mario Bros.\',\n             \'New Super Mario Bros. Wii\', \'Overwatch\',\n             "PLAYERUNKNOWN\'S BATTLEGROUNDS"

Alternative (and I think more elegant) way to do it:

In [44]:
with open("../top_50_games_to_monitor.json", "r") as f:
    game_list = json.load(f)

    top_games = game_list["selected_top_50_games"]["name"]
top_games

['Fortnite',
 'Call of Duty: Black Ops II',
 'Grand Theft Auto: San Andreas',
 'Clam Man',
 'Wii Play',
 'Modern Warfare 2',
 'Super Mario Bros.',
 'Apex Legends',
 'Duck Hunt',
 'Dead by Daylight',
 'Super Mario World',
 'Call of Duty: Modern Warfare 3',
 'Call of Duty: Ghosts',
 'Grand Theft Auto V',
 'Call of Duty: Black Ops',
 'Death Stranding',
 'Bio Inc. Redemption',
 'Hearthstone',
 'Slots',
 'Just Chatting',
 'Mario Kart Wii',
 'Total War: Three Kingdoms',
 'Mario Kart DS',
 'League of Legends',
 'Magic: The Gathering',
 'Overwatch',
 'FIFA 19',
 'Wii Sports',
 'Call of Duty: Black Ops 4',
 'Pokémon Gold/Silver',
 'World of Warcraft',
 "Tom Clancy's Rainbow Six: Siege",
 'Counter-Strike: Global Offensive',
 'The Elder Scrolls V: Skyrim',
 'Minecraft',
 "PLAYERUNKNOWN'S BATTLEGROUNDS",
 'Dota 2',
 'Call of Duty: Black Ops III',
 'Layers of Fear 2',
 'Tetris',
 'Call of Duty: Advanced Warfare',
 'Splitgate: Arena Warfare',
 'Wii Sports Resort',
 'Talk Shows & Podcasts',
 'Pokémon

Average daily views for each game:

In [58]:
pipeline = [{"$unwind": "$data"},
            {"$match" : { "data.game_name" : { "$in" : top_games },  }},
            { "$group": {
                "_id": {"name": "$data.game_name",
                        "norm_name": "$data.game_norm_name",
                        "year" :  { "$substr" : ["$timestamp", 0, 4 ] }, 
                        "month" : { "$substr" : ["$timestamp", 5, 2 ] },     
                        "day" :   { "$substr" : ["$timestamp", 8, 2 ] } },
                "avgViewers": { "$avg": '$data.viewers'}
            }},
            {"$sort": {"_id": 1} }]

In [59]:
# this is pretty slow
from datetime import datetime
start = datetime.now()
result = list(db.twitch.aggregate(pipeline))
end = datetime.now()
end-start

datetime.timedelta(0, 37, 405598)

The following are the games initially chosen as top games, but that have not been collected again on twitch.

In [47]:
result_set = set([elem['_id']['name'] for elem in result])
top_set = set(top_games)

top_set - result_set

{'Clam Man'}

In [70]:
twitch_stats_df = json_normalize(result)
twitch_stats_df = twitch_stats_df.rename(index=str, columns={"_id.name": "name", "_id.norm_name": "norm_name", "_id.day": "day","_id.month": "month","_id.year": "year"})
twitch_stats_df = twitch_stats_df[['name', "norm_name", 'day', 'month', 'year', 'avgViewers']]

twitch_stats_df.head(20)

Unnamed: 0,name,norm_name,day,month,year,avgViewers
0,Apex Legends,apex legends,13,6,2019,17491.647059
1,Apex Legends,apex legends,14,6,2019,15606.502083
2,Apex Legends,apex legends,15,6,2019,15781.329167
3,Apex Legends,apex legends,16,6,2019,18600.824635
4,Apex Legends,apex legends,17,6,2019,19137.74375
5,Apex Legends,apex legends,18,6,2019,19605.116667
6,Apex Legends,apex legends,19,6,2019,20157.311065
7,Apex Legends,apex legends,20,6,2019,15973.658333
8,Apex Legends,apex legends,21,6,2019,15322.641667
9,Apex Legends,apex legends,22,6,2019,19100.837161


In [None]:
server.stop()

In [72]:
twitch_stats_df[twitch_stats_df['name'] == 'Tetris']

Unnamed: 0,name,norm_name,day,month,year,avgViewers
542,Tetris,tetris,13,6,2019,49.73913
543,Tetris,tetris,14,6,2019,35.179104
544,Tetris,tetris,15,6,2019,31.95207
545,Tetris,tetris,16,6,2019,51.653595
546,Tetris,tetris,17,6,2019,59.634409
547,Tetris,tetris,18,6,2019,54.09628
548,Tetris,tetris,19,6,2019,66.323913
549,Tetris,tetris,20,6,2019,29.947712
550,Tetris,tetris,21,6,2019,53.063596
551,Tetris,tetris,22,6,2019,44.438053


We add the column 'daily_top', which is the game's position in the ordered list of daily most streamed games on average.

In [73]:
twitch_stats_df['daily_top']=twitch_stats_df.sort_values(['day', 'month', 'year', 'avgViewers'],
                               ascending=[True, True, True, False]).groupby(['day', 'month', 'year']).cumcount()
twitch_stats_df['daily_top'] = twitch_stats_df['daily_top'] + 1

In [74]:
twitch_stats_df.sort_values(['day', 'month', 'year', 'avgViewers'], ascending=[True, True, True, False]).head(20)

Unnamed: 0,name,norm_name,day,month,year,avgViewers,daily_top
295,League of Legends,league of legends,13,6,2019,139492.283737,1
197,Fortnite,fortnite,13,6,2019,117869.685121,2
225,Grand Theft Auto V,grand theft auto v,13,6,2019,97704.619377,3
267,Just Chatting,just chatting,13,6,2019,97585.868512,4
167,Dota 2,dota ii,13,6,2019,69030.17301,5
636,World of Warcraft,world of warcraft,13,6,2019,46690.747405,6
430,PLAYERUNKNOWN'S BATTLEGROUNDS,playerunknown's battlegrounds,13,6,2019,40519.429066,7
137,Counter-Strike: Global Offensive,counter-strike global offensive,13,6,2019,39686.190311,8
416,Overwatch,overwatch,13,6,2019,24216.384083,9
346,Minecraft,minecraft,13,6,2019,22040.913495,10


# Twitter

In [27]:
db.list_collection_names()

['system.indexes', 'twitch', 'twitter']

Daily total retweets and likes for each game:

In [53]:
pipeline = [{ "$group": {
                "_id": {"name": "$query",
                        "day": {"$dayOfMonth": "$date" },
                        "month": { "$month": "$date" },
                        "year": { "$year": "$date" }},
                "tot_retweets": { "$sum": "$retweets" },
                "tot_likes": { "$sum" : "$likes"},
                "tweet_count" : { "$sum": 1 }
            }},
            {"$sort": {"_id": 1} }]

In [54]:
result2 = list(db.twitter.aggregate(pipeline))

df2 = json_normalize(result2)
df2 = df2.rename(index=str, columns={"_id.name": "name", "_id.day": "day","_id.month": "month","_id.year": "year"})
df2 = df2[['name', 'day', 'month', 'year', 'tot_retweets', 'tot_likes', 'tweet_count']]
df2.head(20)

Unnamed: 0,name,day,month,year,tot_retweets,tot_likes,tweet_count
0,apex legends,13,6,2019,54,349,96
1,apex legends,14,6,2019,140,1022,200
2,apex legends,15,6,2019,160,1154,200
3,apex legends,16,6,2019,173,1491,169
4,apex legends,17,6,2019,112,692,16
5,apex legends,18,6,2019,3517,15684,200
6,apex legends,19,6,2019,493,1366,79
7,apex legends,20,6,2019,615,3137,200
8,apex legends,21,6,2019,421,2873,200
9,apex legends,22,6,2019,228,1229,200


In [55]:
df2[df2['name'] == 'tetris' ]

Unnamed: 0,name,day,month,year,tot_retweets,tot_likes,tweet_count
526,tetris,13,6,2019,28,282,200
527,tetris,14,6,2019,726,4310,76
528,tetris,15,6,2019,60,597,200
529,tetris,16,6,2019,143,903,200
530,tetris,17,6,2019,175,830,200
531,tetris,18,6,2019,360,1830,110
532,tetris,19,6,2019,317,1547,200
533,tetris,20,6,2019,764,11352,200
534,tetris,21,6,2019,188,1452,200
535,tetris,22,6,2019,38,305,29


We add the column 'daily_top', which is the game's position in the ordered list of daily most liked tweets.

In [56]:
df2['daily_top'] = df2.sort_values(['day', 'month', 'year', 'tot_likes'],
                                   ascending=[True, True, True, False]).groupby(['day', 'month', 'year']).cumcount()
df2['daily_top'] = df2['daily_top'] + 1

In [57]:
df2.sort_values(['day', 'month', 'year', 'tot_likes'], ascending=[True, True, True, False]).head(20)

Unnamed: 0,name,day,month,year,tot_retweets,tot_likes,tweet_count,daily_top
186,fortnite,13,6,2019,419,7344,200,1
246,just chatting,13,6,2019,170,4403,179,2
310,minecraft,13,6,2019,290,2815,200,3
437,slots,13,6,2019,379,2018,200,4
379,overwatch,13,6,2019,144,696,200,5
12,auto chess,13,6,2019,113,671,56,6
234,hearthstone,13,6,2019,37,588,115,7
262,league of legends,13,6,2019,70,548,200,8
610,world of warcraft,13,6,2019,80,501,161,9
0,apex legends,13,6,2019,54,349,96,10


## Which is the  game with the highest retweet value on a given day?

### Pandas solution

In [64]:
df2.loc[df2.daily_top==1]

Unnamed: 0,name,day,month,year,tot_retweets,tot_likes,tweet_count,daily_top
186,fortnite,13,6,2019,419,7344,200,1
187,fortnite,14,6,2019,1878,34285,200,1
189,fortnite,16,6,2019,2268,33085,200,1
190,fortnite,17,6,2019,1056,35726,200,1
191,fortnite,18,6,2019,8851,53061,200,1
192,fortnite,19,6,2019,1592,15202,200,1
193,fortnite,20,6,2019,2154,30358,200,1
194,fortnite,21,6,2019,2977,22664,200,1
197,fortnite,24,6,2019,9168,39009,200,1
319,minecraft,22,6,2019,1303,43675,200,1


### MongoDB solution

In [73]:
pipeline_highest_retweet_per_day = [{
		"$group": {
			"_id": {
				"name": "$query",
				"day": {
					"$dayOfMonth": "$date"
				},
				"month": {
					"$month": "$date"
				},
				"year": {
					"$year": "$date"
				}
			},
			"tot_retweets": {
				"$sum": "$retweets"
			}
		}
	},
	{
		"$sort": {
			"tot_retweets": -1
		}
	},
	{
		"$group": {
			"_id": {
				"day": "$_id.day",
				"month": "$_id.month",
				"year": "$_id.year"
			},
			"max_retweets": {
				"$first": "$tot_retweets"
			},
			"game": {
				"$first": "$_id.name"
			}
		}
	},
	{
		"$project": {
			"_id": 0,
			"day": "$_id.day",
			"month": "$_id.month",
			"year": "$_id.year",
			"max_retweets": 1,
			"game": 1

		}
	},
	{
		"$sort": {
			"day": 1
		}
	}
]
highest_retweet_per_day = list(db.twitter.aggregate(pipeline_highest_retweet_per_day))
highest_retweet_per_day

[{'max_retweets': 419,
  'game': 'fortnite',
  'day': 13,
  'month': 6,
  'year': 2019},
 {'max_retweets': 1878,
  'game': 'fortnite',
  'day': 14,
  'month': 6,
  'year': 2019},
 {'max_retweets': 9617,
  'game': 'pokémon gold/silver',
  'day': 15,
  'month': 6,
  'year': 2019},
 {'max_retweets': 3365, 'game': 'slots', 'day': 16, 'month': 6, 'year': 2019},
 {'max_retweets': 6082,
  'game': 'overwatch',
  'day': 17,
  'month': 6,
  'year': 2019},
 {'max_retweets': 8851,
  'game': 'fortnite',
  'day': 18,
  'month': 6,
  'year': 2019},
 {'max_retweets': 1592,
  'game': 'fortnite',
  'day': 19,
  'month': 6,
  'year': 2019},
 {'max_retweets': 4211,
  'game': 'dead by daylight',
  'day': 20,
  'month': 6,
  'year': 2019},
 {'max_retweets': 3442,
  'game': 'dead by daylight',
  'day': 21,
  'month': 6,
  'year': 2019},
 {'max_retweets': 1303,
  'game': 'minecraft',
  'day': 22,
  'month': 6,
  'year': 2019},
 {'max_retweets': 2149, 'game': 'slots', 'day': 23, 'month': 6, 'year': 2019},
 {'m

### Which is the game with the highest likes for each day?

In [72]:
pipeline_highest_likes_per_day = [{
		"$group": {
			"_id": {
				"name": "$query",
				"day": {
					"$dayOfMonth": "$date"
				},
				"month": {
					"$month": "$date"
				},
				"year": {
					"$year": "$date"
				}
			},
			"tot_likes": {
				"$sum": "$likes"
			}
		}
	},
	{
		"$sort": {
			"tot_likes": -1
		}
	},
	{
		"$group": {
			"_id": {
				"day": "$_id.day",
				"month": "$_id.month",
				"year": "$_id.year"
			},
			"max_likes": {
				"$first": "$tot_likes"
			},
			"game": {
				"$first": "$_id.name"
			}
		}
	},
	{
		"$project": {
			"_id": 0,
			"day": "$_id.day",
			"month": "$_id.month",
			"year": "$_id.year",
			"max_likes": 1,
			"game": 1

		}
	},
	{
		"$sort": {
			"day": 1
		}
	}
]
highest_likes_per_day = list(db.twitter.aggregate(pipeline_highest_likes_per_day))
highest_likes_per_day

[{'max_likes': 7344, 'game': 'fortnite', 'day': 13, 'month': 6, 'year': 2019},
 {'max_likes': 34285, 'game': 'fortnite', 'day': 14, 'month': 6, 'year': 2019},
 {'max_likes': 48678,
  'game': 'pokémon gold/silver',
  'day': 15,
  'month': 6,
  'year': 2019},
 {'max_likes': 33085, 'game': 'fortnite', 'day': 16, 'month': 6, 'year': 2019},
 {'max_likes': 35726, 'game': 'fortnite', 'day': 17, 'month': 6, 'year': 2019},
 {'max_likes': 53061, 'game': 'fortnite', 'day': 18, 'month': 6, 'year': 2019},
 {'max_likes': 15202, 'game': 'fortnite', 'day': 19, 'month': 6, 'year': 2019},
 {'max_likes': 30358, 'game': 'fortnite', 'day': 20, 'month': 6, 'year': 2019},
 {'max_likes': 22664, 'game': 'fortnite', 'day': 21, 'month': 6, 'year': 2019},
 {'max_likes': 43675,
  'game': 'minecraft',
  'day': 22,
  'month': 6,
  'year': 2019},
 {'max_likes': 25375,
  'game': 'minecraft',
  'day': 23,
  'month': 6,
  'year': 2019},
 {'max_likes': 39009, 'game': 'fortnite', 'day': 24, 'month': 6, 'year': 2019}]

## Which is the day with the highest views/likes/retweets of all time? And which is the one with the lowest values?

In [17]:
pipeline_highest_likes_of_all_times = [{
		"$group": {
			"_id": {
				"day": {
					"$dayOfMonth": "$date"
				},
				"month": {
					"$month": "$date"
				},
				"year": {
					"$year": "$date"
				}
			},
			"tot_likes": {
				"$sum": "$likes"
			}
		}
	},
	{
		"$sort": {
			"tot_likes": -1
		}
	},
	{
		"$group": {
			"_id": {
				"day": "$_id.day",
				"month": "$_id.month",
				"year": "$_id.year"
			},
			"max_likes": {
				"$first": "$tot_likes"
			}
		}
	},
	{
		"$project": {
			"_id": 0,
			"day": "$_id.day",
			"month": "$_id.month",
			"year": "$_id.year",
			"max_likes": 1
		}
	},
	{
		"$sort": {
			"max_likes": -1
		}
	},
    {
        "$limit": 1
    }
]
highest_likes_of_all_times = db.twitter.aggregate(pipeline_highest_likes_of_all_times)
highest_likes_of_all_times.next()

{'max_likes': 129262, 'day': 15, 'month': 6, 'year': 2019}

In [19]:
pipeline_lowest_likes_of_all_times = [{
		"$group": {
			"_id": {
				"day": {
					"$dayOfMonth": "$date"
				},
				"month": {
					"$month": "$date"
				},
				"year": {
					"$year": "$date"
				}
			},
			"tot_likes": {
				"$sum": "$likes"
			}
		}
	},
	{
		"$sort": {
			"tot_likes": -1
		}
	},
	{
		"$group": {
			"_id": {
				"day": "$_id.day",
				"month": "$_id.month",
				"year": "$_id.year"
			},
			"max_likes": {
				"$first": "$tot_likes"
			}
		}
	},
	{
		"$project": {
			"_id": 0,
			"day": "$_id.day",
			"month": "$_id.month",
			"year": "$_id.year",
			"max_likes": 1
		}
	},
	{
		"$sort": {
			"max_likes": 1
		}
	},
    {
        "$limit": 1
    }
]
lowest_likes_of_all_times = db.twitter.aggregate(pipeline_lowest_likes_of_all_times)
lowest_likes_of_all_times.next()

{'max_likes': 22493, 'day': 13, 'month': 6, 'year': 2019}

# Total "buzz"

We are interested in using both Twitch and Twitter data for determing which game has the highest "buzz", that is for us the sum of the average views on Twitch and the total retweets and likes on Twitter.

In [76]:
pip = [{
		"$group": {
			"_id": {
				"name": "$query",
				"day": {
					"$dayOfMonth": "$date"
				},
				"month": {
					"$month": "$date"
				},
				"year": {
					"$year": "$date"
				}
			},
			"total_retweets": {
				"$sum": "$retweets"
			},
            "total_likes": {
                "$sum": "$likes"
            }            
		}
	},
    {
        "$project": {
            "_id": 0,
            "norm_name": "$_id.name",
            "day": "$_id.day",
            "month": "$_id.month",
            "year": "$_id.year",
            "total_retweets": 1,
            "total_likes": 1
        }
    },
	{
		"$sort": {
			"day": 1
		}
	}
]
tweet_stats_query_results = list(db.twitter.aggregate(pip))
tweet_stats_df = pd.DataFrame.from_records(tweet_stats_query_results)
tweet_stats_df.head()
twitch_stats_df.head()

Unnamed: 0,name,norm_name,day,month,year,avgViewers,daily_top
0,Apex Legends,apex legends,13,6,2019,17491.647059,14
1,Apex Legends,apex legends,14,6,2019,15606.502083,13
2,Apex Legends,apex legends,15,6,2019,15781.329167,14
3,Apex Legends,apex legends,16,6,2019,18600.824635,13
4,Apex Legends,apex legends,17,6,2019,19137.74375,11


**Let's join *tweet_stats_df* and *twitch_stats_df***

In [81]:
tweet_stats_df.dtypes

day                int64
month              int64
norm_name         object
total_likes        int64
total_retweets     int64
year               int64
dtype: object

In [82]:
twitch_stats_df.dtypes

name           object
norm_name      object
day            object
month          object
year           object
avgViewers    float64
daily_top       int64
dtype: object

Day, month and year are represented in different ways in the two dataframe. Let's convert the values on twitch_stats_df

In [86]:
twitch_stats_df.day= twitch_stats_df.day.astype(int)
twitch_stats_df.month= twitch_stats_df.month.astype(int)
twitch_stats_df.year= twitch_stats_df.year.astype(int)

In [87]:
combined_stats = pd.merge(tweet_stats_df, twitch_stats_df, on = ['norm_name', 'day', 'month', 'year'], how="inner")
combined_stats.head()

Unnamed: 0,day,month,norm_name,total_likes,total_retweets,year,name,avgViewers,daily_top
0,13,6,pokémon gold/silver,16,1,2019,Pokémon Gold/Silver,8.733813,40
1,13,6,hearthstone,588,37,2019,Hearthstone,21380.67128,11
2,13,6,grand theft auto san andreas,1,2,2019,Grand Theft Auto: San Andreas,455.910035,25
3,13,6,fortnite,7344,419,2019,Fortnite,117869.685121,2
4,13,6,playerunknown's battlegrounds,20,4,2019,PLAYERUNKNOWN'S BATTLEGROUNDS,40519.429066,7


In [91]:
combined_stats['buzz'] = combined_stats.total_likes + combined_stats.total_retweets + combined_stats.avgViewers
combined_stats.head()

Unnamed: 0,day,month,norm_name,total_likes,total_retweets,year,name,avgViewers,daily_top,buzz
0,13,6,pokémon gold/silver,16,1,2019,Pokémon Gold/Silver,8.733813,40,25.733813
1,13,6,hearthstone,588,37,2019,Hearthstone,21380.67128,11,22005.67128
2,13,6,grand theft auto san andreas,1,2,2019,Grand Theft Auto: San Andreas,455.910035,25,458.910035
3,13,6,fortnite,7344,419,2019,Fortnite,117869.685121,2,125632.685121
4,13,6,playerunknown's battlegrounds,20,4,2019,PLAYERUNKNOWN'S BATTLEGROUNDS,40519.429066,7,40543.429066


## In which day a game obtained the highest "buzz"?

In [92]:
combined_stats.loc[combined_stats['buzz'].idxmax()]

day                     21
month                    6
norm_name         fortnite
total_likes          22664
total_retweets        2977
year                  2019
name              Fortnite
avgViewers          181664
daily_top                1
buzz                207305
Name: 362, dtype: object

## In which day a game obtained the lowest "buzz"?

In [93]:
combined_stats.loc[combined_stats['buzz'].idxmin()]

day                                      13
month                                     6
norm_name         new super mario bros. wii
total_likes                               3
total_retweets                            0
year                                   2019
name              New Super Mario Bros. Wii
avgViewers                             2.75
daily_top                                43
buzz                                   5.75
Name: 35, dtype: object

In [94]:
server.stop()