In [1]:
from bson.son import SON
import json
from pymongo import MongoClient
from bson.objectid import ObjectId
import pandas as pd
from pandas.io.json import json_normalize
from datetime import datetime

## Connect to the remote MongoDB

In [2]:
from sshtunnel import SSHTunnelForwarder
import pymongo

In [3]:
# Read the connection settings from keys.json so that no credential is
# hardcoded in the notebook; only the "SSH" section is used here.
with open('keys.json') as keys_file:
    keys = json.load(keys_file)

ssh_settings = keys["SSH"]
MONGO_HOST = ssh_settings["MONGO_HOST"]  # e.g. 10.9.13.14
MONGO_DB = ssh_settings["MONGO_DB"]      # e.g. dm_project
MONGO_USER = ssh_settings["MONGO_USER"]  # e.g. studente
MONGO_PASS = ssh_settings["MONGO_PASS"]  # e.g. the VM password

In [4]:
# Describe an SSH tunnel to MONGO_HOST (password authentication). Nothing is
# opened yet: the tunnel only comes up when server.start() is called.
server = SSHTunnelForwarder(
    MONGO_HOST,
    ssh_username=MONGO_USER,
    ssh_password=MONGO_PASS,
    remote_bind_address=('127.0.0.1', 27017)  # address/port reached on the remote side of the tunnel
)

In [5]:
server.start() # opens the tunnel; remember to call server.stop() when done

In [6]:
# Connect to the local end of the tunnel rather than to the remote host directly.
client = pymongo.MongoClient('127.0.0.1', server.local_bind_port) # server.local_bind_port is the local port assigned to the tunnel

In [7]:
# Handle on the project database; its name comes from keys.json (MONGO_DB).
db = client[MONGO_DB]

In [8]:
# Sanity check: the connection works and the expected collections are there.
db.list_collection_names()

['system.indexes', 'twitch', 'twitter']

# Integration of previously downloaded local data
### Script on VM started on 23/06/2019

In [None]:
# One-off backfill, kept commented out for reference only: it inserts every
# line of the local JSON-lines dump into the `twitch` collection and collects
# the ids of the inserted documents.
# NOTE(review): presumably already executed once -- running it again would
# insert the same documents a second time; confirm before uncommenting.
# with open('20190613_0932_data.json', "r", encoding = 'utf8') as json_file:
#     inserted_ids = []
#     for line in json_file:
#         data = json.loads(line)
#         post_id = db.twitch.insert_one(data).inserted_id
#         inserted_ids.append(post_id)

# Querying Twitch

In [10]:
# Keep only the capture timestamp of each Twitch document, exposed as `time`
# (the Mongo `_id` is dropped).
timestamp_projection = {'_id': 0, 'time': '$timestamp'}
pipeline = [{"$project": timestamp_projection}]

In [11]:
# Materialise the aggregation cursor into a list of projected documents.
tsmps = list(db.twitch.aggregate(pipeline))

In [12]:
# Number of documents returned by the timestamp projection.
len(tsmps)

5258

In [13]:
# Time span covered by the collected Twitch samples.
# The extremes are computed over every parsed timestamp instead of being read
# from fixed positions in the cursor order (previously tsmps[3] and tsmps[-1]),
# so the result stays correct when documents are inserted out of order.
TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S.%f"
sample_times = [
    datetime.strptime(doc['time'], TIMESTAMP_FORMAT)
    for doc in tsmps
    if doc.get('time')  # $project omits `time` for documents without a timestamp
]
start = min(sample_times)  # oldest sample
end = max(sample_times)    # most recent sample

start, end

(datetime.datetime(2019, 6, 13, 9, 32, 34, 348514),
 datetime.datetime(2019, 6, 24, 8, 34, 34, 77407))

In [14]:
# Length of the observed time span expressed in 180-second units.
# NOTE(review): presumably one sample is stored every 3 minutes, which would
# make this the expected number of samples to compare with len(tsmps) -- confirm.
(end-start).total_seconds() / 180

5260.665160516667

The following is the list of games we are interested in; the queries below can be restricted to these games.

In [15]:
# Game titles of interest; the Twitch aggregation filters on them by matching
# `data.game_name` against this list.
top_games = ['Apex Legends', 'Auto Chess', 'Bio Inc. Redemption', 'Call of Duty: Advanced Warfare',
             'Call of Duty: Black Ops', 'Call of Duty: Black Ops 4',
             'Call of Duty: Black Ops II', 'Call of Duty: Black Ops III',
             'Call of Duty: Ghosts', 'Call of Duty: Modern Warfare 3',
             'Clam Man', 'Counter-Strike: Global Offensive',
             'Dead by Daylight', 'Death Stranding', 'Dota 2', 'Duck Hunt', 'FIFA 19', 'Fortnite',
             'Grand Theft Auto IV', 'Grand Theft Auto V', 'Grand Theft Auto: San Andreas', 'Hearthstone',
             'Just Chatting', 'Layers of Fear 2', 'League of Legends', 'Magic: The Gathering',
             'Mario Kart DS', 'Mario Kart Wii', 'Minecraft', 'Modern Warfare 2',
             'Music & Performing Arts', 'New Super Mario Bros.',
             'New Super Mario Bros. Wii', 'Overwatch',
             "PLAYERUNKNOWN'S BATTLEGROUNDS", 'Pokémon Gold/Silver',
             'Pokémon Red/Blue', 'Slots', 'Splitgate: Arena Warfare', 'Super Mario Bros.',
             'Super Mario World', 'Talk Shows & Podcasts', 'Tetris', 'The Elder Scrolls V: Skyrim',
             "Tom Clancy's Rainbow Six: Siege", 'Total War: Three Kingdoms',
             'Wii Play', 'Wii Sports', 'Wii Sports Resort', 'World of Warcraft']

Average daily viewers for each game (mean of the sampled viewer counts over each calendar day):

In [16]:
# Grouping key: one bucket per game and calendar day. The date parts are cut
# out of the timestamp string by position (year, month, day).
daily_game_key = {
    "name": "$data.game_name",
    "year": {"$substr": ["$timestamp", 0, 4]},
    "month": {"$substr": ["$timestamp", 5, 2]},
    "day": {"$substr": ["$timestamp", 8, 2]},
}

pipeline = [
    # one output document per element of the `data` array
    {"$unwind": "$data"},
    # keep only the games we are interested in
    {"$match": {"data.game_name": {"$in": top_games}}},
    # average the viewer counts inside each (game, day) bucket
    {"$group": {"_id": daily_game_key, "avgViewers": {"$avg": '$data.viewers'}}},
    # order by the grouping key
    {"$sort": {"_id": 1}},
]

In [17]:
# this is pretty slow
# Materialise the aggregation cursor into a list of result documents.
result = list(db.twitch.aggregate(pipeline))

The following games were initially chosen as top games but never appear in the collected Twitch data.

In [18]:
# Game names that actually came back from the Twitch aggregation.
result_set = {row['_id']['name'] for row in result}
top_set = set(top_games)

# Requested games for which no aggregated row exists.
top_set.difference(result_set)

{'Clam Man'}

In [19]:
# Flatten the nested `_id` grouping key into plain columns and keep the
# columns in a fixed, readable order.
twitch_key_columns = {"_id.name": "name", "_id.day": "day", "_id.month": "month", "_id.year": "year"}
df = (
    json_normalize(result)
    .rename(index=str, columns=twitch_key_columns)
    [['name', 'day', 'month', 'year', 'avgViewers']]
)

df.head(20)

Unnamed: 0,name,day,month,year,avgViewers
0,Apex Legends,13,6,2019,17491.647059
1,Apex Legends,14,6,2019,15606.502083
2,Apex Legends,15,6,2019,15781.329167
3,Apex Legends,16,6,2019,18600.824635
4,Apex Legends,17,6,2019,19137.74375
5,Apex Legends,18,6,2019,19605.116667
6,Apex Legends,19,6,2019,20157.311065
7,Apex Legends,20,6,2019,15973.658333
8,Apex Legends,21,6,2019,15322.641667
9,Apex Legends,22,6,2019,19100.837161


In [20]:
# Release the MongoDB client before tearing the tunnel down: otherwise its
# connections and background monitoring are left pointing at a forwarded port
# that no longer exists. A fresh client is created when the tunnel is reopened.
client.close()
server.stop()

In [21]:
# Inspect the daily averages of a single game.
is_tetris = df['name'] == 'Tetris'
df[is_tetris]

Unnamed: 0,name,day,month,year,avgViewers
464,Tetris,13,6,2019,49.73913
465,Tetris,14,6,2019,35.179104
466,Tetris,15,6,2019,31.95207
467,Tetris,16,6,2019,51.653595
468,Tetris,17,6,2019,59.634409
469,Tetris,18,6,2019,54.09628
470,Tetris,19,6,2019,66.323913
471,Tetris,20,6,2019,29.947712
472,Tetris,21,6,2019,53.063596
473,Tetris,22,6,2019,44.438053


We add the column 'daily_top': the game's rank for that day (1 = highest) when the games are ordered by average viewers.

In [22]:
# Rank the games inside each day: order every day's rows by decreasing
# average viewers, number them from 0 within the day, then shift to 1-based.
day_columns = ['day', 'month', 'year']
ordered_by_viewers = df.sort_values(day_columns + ['avgViewers'],
                                    ascending=[True, True, True, False])
df['daily_top'] = ordered_by_viewers.groupby(day_columns).cumcount() + 1

In [23]:
# Show the ranking day by day, best-ranked game first within each day.
# Sort keys go year -> month -> day so the days come out in chronological
# order even when the data spans more than one month (day-first ordering
# would interleave the months). The date parts are zero-padded strings cut
# from the timestamp, so sorting them as text is consistent.
df.sort_values(['year', 'month', 'day', 'avgViewers'], ascending=[True, True, True, False]).head(20)

Unnamed: 0,name,day,month,year,avgViewers,daily_top
252,League of Legends,13,6,2019,139492.283737,1
168,Fortnite,13,6,2019,117869.685121,2
192,Grand Theft Auto V,13,6,2019,97704.619377,3
228,Just Chatting,13,6,2019,97585.868512,4
142,Dota 2,13,6,2019,69030.17301,5
544,World of Warcraft,13,6,2019,46690.747405,6
368,PLAYERUNKNOWN'S BATTLEGROUNDS,13,6,2019,40519.429066,7
116,Counter-Strike: Global Offensive,13,6,2019,39686.190311,8
356,Overwatch,13,6,2019,24216.384083,9
296,Minecraft,13,6,2019,22040.913495,10


# Twitter

In [24]:
# Reopen the SSH tunnel for the Twitter queries; remember to call server.stop() when done.
server.start()

In [25]:
# New client on the local end of the reopened tunnel; the port is read again from the tunnel.
client = pymongo.MongoClient('127.0.0.1', server.local_bind_port) # server.local_bind_port is the local port assigned to the tunnel

In [26]:
# Handle on the project database through the new client.
db = client[MONGO_DB]

In [27]:
# Sanity check: the new connection works and the collections are visible.
db.list_collection_names()

['system.indexes', 'twitch', 'twitter']

Daily total retweets and likes for each game:

In [28]:
# Grouping key: one bucket per search query (the game) and calendar day,
# with the date parts extracted from the `date` field.
tweet_day_key = {
    "name": "$query",
    "day": {"$dayOfMonth": "$date"},
    "month": {"$month": "$date"},
    "year": {"$year": "$date"},
}

# Per bucket: total retweets, total likes and number of tweets.
tweet_totals = {
    "_id": tweet_day_key,
    "tot_retweets": {"$sum": "$retweets"},
    "tot_likes": {"$sum": "$likes"},
    "tweet_count": {"$sum": 1},
}

pipeline = [
    {"$group": tweet_totals},
    {"$sort": {"_id": 1}},  # order by the grouping key
]

In [30]:
# Run the Twitter aggregation, then flatten the nested `_id` grouping key
# into plain columns and keep the columns in a fixed order.
result2 = list(db.twitter.aggregate(pipeline))

twitter_key_columns = {"_id.name": "name", "_id.day": "day", "_id.month": "month", "_id.year": "year"}
df2 = (
    json_normalize(result2)
    .rename(index=str, columns=twitter_key_columns)
    [['name', 'day', 'month', 'year', 'tot_retweets', 'tot_likes', 'tweet_count']]
)
df2.head(20)

Unnamed: 0,name,day,month,year,tot_retweets,tot_likes,tweet_count
0,apex legends,13,6,2019,54,349,96
1,apex legends,14,6,2019,140,1022,200
2,apex legends,15,6,2019,160,1154,200
3,apex legends,16,6,2019,173,1491,169
4,auto chess,13,6,2019,113,671,56
5,auto chess,14,6,2019,37,635,200
6,auto chess,15,6,2019,29,1320,200
7,auto chess,16,6,2019,49,630,200
8,bio inc. redemption,13,6,2019,0,0,1
9,bio inc. redemption,14,6,2019,0,0,1


In [31]:
# Inspect the daily totals of a single game (query strings are lowercase here).
is_tetris = df2['name'] == 'tetris'
df2[is_tetris]

Unnamed: 0,name,day,month,year,tot_retweets,tot_likes,tweet_count
170,tetris,13,6,2019,28,282,200
171,tetris,14,6,2019,726,4310,76
172,tetris,15,6,2019,60,597,200
173,tetris,16,6,2019,143,903,200
174,tetris,17,6,2019,175,830,200


In [32]:
# Release the MongoDB client before tearing the tunnel down: otherwise its
# connections and background monitoring are left pointing at a forwarded port
# that no longer exists.
client.close()
server.stop()

We add the column 'daily_top': the game's rank for that day (1 = highest) when the games are ordered by the total likes of their tweets.

In [33]:
# Rank the games inside each day: order every day's rows by decreasing total
# likes, number them from 0 within the day, then shift to 1-based.
day_columns = ['day', 'month', 'year']
ordered_by_likes = df2.sort_values(day_columns + ['tot_likes'],
                                   ascending=[True, True, True, False])
df2['daily_top'] = ordered_by_likes.groupby(day_columns).cumcount() + 1

In [34]:
# Show the ranking day by day, most-liked game first within each day.
# Sort keys go year -> month -> day so the days come out in chronological
# order even when the data spans more than one month (day-first ordering
# would interleave the months).
df2.sort_values(['year', 'month', 'day', 'tot_likes'], ascending=[True, True, True, False]).head(20)

Unnamed: 0,name,day,month,year,tot_retweets,tot_likes,tweet_count,daily_top
60,fortnite,13,6,2019,419,7344,200,1
81,just chatting,13,6,2019,170,4403,179,2
107,minecraft,13,6,2019,290,2815,200,3
147,slots,13,6,2019,379,2018,200,4
129,overwatch,13,6,2019,144,696,200,5
4,auto chess,13,6,2019,113,671,56,6
77,hearthstone,13,6,2019,37,588,115,7
88,league of legends,13,6,2019,70,548,200,8
204,world of warcraft,13,6,2019,80,501,161,9
0,apex legends,13,6,2019,54,349,96,10
