In [1]:
import pymongo
import json
from sshtunnel import SSHTunnelForwarder
import os
from bson.objectid import ObjectId
import pandas as pd
import numpy as np

All the data downloaded from Twitch and Twitter for this project is stored in a MongoDB instance running on a virtual machine (VM).

A copy of the data from this db can be found in the 'dataset' folder, so that it can be uploaded to a local MongoDB instance and accessed by someone without access to the VM.

This notebook contains the code used to download a copy of the data stored in the VM collections, and the code needed to insert that data into another, local instance.
The recommended way to export and import data in MongoDB is through the "mongoexport" and "mongoimport" utilities. We chose a programmatic approach instead because it lets us filter the data we export: in particular, we only wanted to export data stored before the 28th of June, because that was the week in which we wrote the project report.

The copy of the data for Twitch and Twitter stored in the MongoDB of the VM can be found as zipped files in the "dataset" folder. The VGChartz data is stored in the same folder as the csv file 'vgsales.csv' created from scraping the VGChartz database.

# Insert data on a MongoDB Instance

## Important: please change all IPs and paths according to your system configuration. Do not change the DB and collection names, otherwise the other notebooks won't work

In [2]:
# Connect to the MongoDB instance on this machine and select the project
# database; the last line lists the collections it currently holds.
client = pymongo.MongoClient(host="localhost", port=27017)
db = client.get_database("dm_project")
db.list_collection_names()

[]

In [3]:
from bson.json_util import loads

In [4]:
def load_json_dump(fn):
    """Read a JSON dump file and decode it with ``bson.json_util.loads``.

    Args:
        fn: Path of the JSON file to read.

    Returns:
        The decoded content of the file. For the dumps used in this
        notebook this is the list of documents passed to ``insert_many``.
    """
    # JSON text is UTF-8 by specification; pin the encoding so the result does
    # not depend on the platform default (for example cp1252 on Windows).
    with open(fn, encoding="utf-8") as f:
        return loads(f.read())


### Inserting Twitch Data

Please unzip the files inside the 'dataset' folder before proceeding.

The JSON files that we are now going to upload to the local MongoDB collection contain 5GB of data, so the process may take a few minutes. This is only a part of the full database on the VM!

In [5]:
# Twitch dump files, one per exported time window, in chronological order.
twitch_files = [
    "../dataset/twitch_before_06_15.json",
    "../dataset/twitch_06_15_20.json",
    "../dataset/twitch_06_20_24.json",
    "../dataset/twitch_06_24_28.json",
]

In [6]:
# Load and insert the Twitch dumps one file at a time. The files add up to
# several GB, so each decoded file is released before the next one is read.
for dump_path in twitch_files:
    print(dump_path)
    documents = load_json_dump(dump_path)
    print("Loaded")
    if documents:
        db.twitch.insert_many(documents)
        print("Inserted!")
    else:
        # insert_many() refuses an empty list, so report and move on instead
        # of failing half-way through the import.
        print("No documents to insert")
    # Drop the reference now: otherwise the previous file's documents would
    # still be alive while the next file is being decoded.
    del documents

../dataset/twitch_before_06_15.json
Loaded
Inserted!
../dataset/twitch_06_15_20.json
Loaded
Inserted!
../dataset/twitch_06_20_24.json
Loaded
Inserted!
../dataset/twitch_06_24_28.json
Loaded
Inserted!


Check if data was loaded correctly

In [12]:
# Fetch one document collected before June 2019. The bound is a date string,
# matching the string timestamps stored in the documents.
before_june = {"timestamp": {"$lt": "2019-06-01"}}
test_doc = db.twitch.find_one(before_june)

Each document uploaded to the db has two fields apart from the '_id': a timestamp and a data field.

In [13]:
test_doc.keys()

dict_keys(['_id', 'timestamp', 'data'])

The timestamp is the time of collection:

In [14]:
test_doc['timestamp']

'2019-05-27 18:46:09.148466'

The data is an array of documents, each related to a game:

In [15]:
test_doc['data'][:5]

[{'game': {'name': 'Fortnite',
   'popularity': 171268,
   '_id': 33214,
   'giantbomb_id': 37030,
   'box': {'large': 'https://static-cdn.jtvnw.net/ttv-boxart/Fortnite-272x380.jpg'},
   'logo': {'large': 'https://static-cdn.jtvnw.net/ttv-logoart/Fortnite-240x144.jpg'},
   'localized_name': 'Fortnite',
   'norm_name': 'fortnite',
   'unary_name': 'fortnite'},
  'viewers': 181860,
  'channels': 14657},
 {'game': {'name': 'Grand Theft Auto V',
   'popularity': 121282,
   '_id': 32982,
   'giantbomb_id': 36765,
   'box': {'large': 'https://static-cdn.jtvnw.net/ttv-boxart/Grand%20Theft%20Auto%20V-272x380.jpg'},
   'logo': {'large': 'https://static-cdn.jtvnw.net/ttv-logoart/Grand%20Theft%20Auto%20V-240x144.jpg'},
   'localized_name': 'Grand Theft Auto V',
   'norm_name': 'grand theft auto v',
   'unary_name': 'grand theft auto v'},
  'viewers': 134132,
  'channels': 1571},
 {'game': {'name': 'League of Legends',
   'popularity': 128373,
   '_id': 21779,
   'giantbomb_id': 24024,
   'box': {

Note that at some point the structure of the data was slightly changed, so that the 'game' field in the data collected in June has been flattened for better readability, and some fields have been added or deleted.

In [16]:
# Fetch one document collected after June 13th 2019, i.e. one that uses the
# newer, flattened layout of the 'data' entries.
after_june_13 = {"timestamp": {"$gt": "2019-06-13"}}
test_doc2 = db.twitch.find_one(after_june_13)

In [17]:
test_doc2['data'][:3]

[{'channels': 1789,
  'game__id': 21779,
  'game_box_large': 'https://static-cdn.jtvnw.net/ttv-boxart/League%20of%20Legends-272x380.jpg',
  'game_giantbomb_id': 24024,
  'game_logo_large': 'https://static-cdn.jtvnw.net/ttv-logoart/League%20of%20Legends-240x144.jpg',
  'game_name': 'League of Legends',
  'game_norm_name': 'league of legends',
  'game_popularity': 111662,
  'viewers': 137213},
 {'channels': 834,
  'game__id': 509658,
  'game_box_large': 'https://static-cdn.jtvnw.net/ttv-boxart/Just%20Chatting-272x380.jpg',
  'game_giantbomb_id': 0,
  'game_logo_large': 'https://static-cdn.jtvnw.net/ttv-logoart/Just%20Chatting-240x144.jpg',
  'game_name': 'Just Chatting',
  'game_norm_name': 'just chatting',
  'game_popularity': 61477,
  'viewers': 66157},
 {'channels': 3472,
  'game__id': 33214,
  'game_box_large': 'https://static-cdn.jtvnw.net/ttv-boxart/Fortnite-272x380.jpg',
  'game_giantbomb_id': 37030,
  'game_logo_large': 'https://static-cdn.jtvnw.net/ttv-logoart/Fortnite-240x144.j

The following is the count of documents in the whole collection: keep in mind that each document contains the data field, which is an array of more than 1000 documents, sometimes more than 2000.

In [18]:
db.twitch.estimated_document_count()

8204

In [19]:
len(test_doc2['data'])

1281

We create an index on the collection that should speed up some queries:

In [20]:
# Single-field ascending index on the collection time; background=True asks
# the server to build it as a background operation.
db.twitch.create_index([("timestamp", pymongo.ASCENDING)], background=True)

'timestamp_1'

### Inserting Twitter Data

In [21]:
# Decode the whole Twitter dump into memory with load_json_dump (defined earlier).
twitter_collection = load_json_dump("../dataset/twitter.json")

In [22]:
# Bulk-insert the decoded tweets into the `twitter` collection; the returned
# result object is echoed as the cell output.
db.twitter.insert_many(twitter_collection)

<pymongo.results.InsertManyResult at 0x1e83925db08>

In [23]:
db.twitter.find_one()

{'_id': ObjectId('5d0fa455b2b670923ac1437d'),
 'retweets': 0,
 'text': "@Didact343 I spent far too long considering this. Went with Cyberpunk as it's a totally new IP and Death Stranding… https://t.co/jN5UmcffS0",
 'user_followers': 450,
 'language': 'en',
 'query': 'death stranding',
 'username': 'Biggus Bennus',
 'likes': 1,
 'date': datetime.datetime(2019, 6, 13, 23, 58, 57),
 'user_location': 'Cardiff, Wales'}

In [24]:
db.twitter.estimated_document_count()

87412

### Inserting VGChartz Data

In [25]:
# Load the VGChartz sales table from the CSV in the dataset folder and
# preview its first rows.
vgsales = pd.read_csv("../dataset/vgsales.csv")
vgsales.head()

Unnamed: 0,Rank,Name,Platform,Year,href_Genre,Publisher,Developer,Critic_Score,User_Score,NA_Sales,PAL_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,http://www.vgchartz.com/game/2667/wii-sports/?...,Nintendo,Nintendo EAD,7.7,,41.36,29.02,3.77,8.51,82.65
1,2,Super Mario Bros.,NES,1985.0,http://www.vgchartz.com/game/6455/super-mario-...,Nintendo,Nintendo EAD,10.0,8.2,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,http://www.vgchartz.com/game/6968/mario-kart-w...,Nintendo,Nintendo EAD,8.2,9.1,15.91,12.92,3.8,3.35,35.98
3,4,PLAYERUNKNOWN'S BATTLEGROUNDS,PC,2017.0,http://www.vgchartz.com/game/215988/playerunkn...,PUBG Corporation,PUBG Corporation,,,,,,,
4,5,Wii Sports Resort,Wii,2009.0,http://www.vgchartz.com/game/24656/wii-sports-...,Nintendo,Nintendo EAD,8.0,8.8,15.61,10.99,3.29,3.02,32.9


In [26]:
# Keep only the rows whose global sales figure is a finite number; rows where
# it is missing (NaN) are dropped.
has_global_sales = np.isfinite(vgsales['Global_Sales'])
vgsales_without_na = vgsales[has_global_sales]
vgsales_without_na.head()

Unnamed: 0,Rank,Name,Platform,Year,href_Genre,Publisher,Developer,Critic_Score,User_Score,NA_Sales,PAL_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,http://www.vgchartz.com/game/2667/wii-sports/?...,Nintendo,Nintendo EAD,7.7,,41.36,29.02,3.77,8.51,82.65
1,2,Super Mario Bros.,NES,1985.0,http://www.vgchartz.com/game/6455/super-mario-...,Nintendo,Nintendo EAD,10.0,8.2,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,http://www.vgchartz.com/game/6968/mario-kart-w...,Nintendo,Nintendo EAD,8.2,9.1,15.91,12.92,3.8,3.35,35.98
4,5,Wii Sports Resort,Wii,2009.0,http://www.vgchartz.com/game/24656/wii-sports-...,Nintendo,Nintendo EAD,8.0,8.8,15.61,10.99,3.29,3.02,32.9
5,6,Pokémon Red / Green / Blue Version,GB,1998.0,http://www.vgchartz.com/game/4030/pokemon-red-...,Nintendo,Game Freak,9.4,,11.27,8.89,10.22,1.0,31.37


In [27]:
# Turn the frame into a list with one dict per row, ready for insert_many().
df_dict = vgsales_without_na.to_dict(orient='records')

In [28]:
# Bulk-insert the row dicts into the `vgchartz` collection.
db.vgchartz.insert_many(df_dict)

<pymongo.results.InsertManyResult at 0x1e850df8c88>

In [29]:
db.vgchartz.find_one()

{'_id': ObjectId('5d206c00f0bd0629f4a0722c'),
 'Rank': 1,
 'Name': 'Wii Sports',
 'Platform': 'Wii',
 'Year': 2006.0,
 'href_Genre': 'http://www.vgchartz.com/game/2667/wii-sports/?region=All',
 'Publisher': 'Nintendo  ',
 'Developer': 'Nintendo EAD  ',
 'Critic_Score': 7.7,
 'User_Score': nan,
 'NA_Sales': 41.36,
 'PAL_Sales': 29.02,
 'JP_Sales': 3.77,
 'Other_Sales': 8.51,
 'Global_Sales': 82.65}

Now all the collections are loaded on the db:

In [30]:
db.list_collection_names()

['vgchartz', 'twitch', 'twitter']

# Download data from VM collections

The following code was run to download a copy of the data from the database on the VM. It should not be run without access to the VM.

### SSH Tunnel

In [None]:
# The connection settings are read from '../keys.json' rather than being
# hard-coded in the notebook.
with open('../keys.json') as k:
    keys = json.load(k)

ssh_settings = keys["SSH"]
MONGO_HOST = ssh_settings["MONGO_HOST"]  # e.g. 10.9.13.14
MONGO_DB = ssh_settings["MONGO_DB"]      # e.g. dm_project
MONGO_USER = ssh_settings["MONGO_USER"]  # e.g. studente
MONGO_PASS = ssh_settings["MONGO_PASS"]  # the password of the VM

# SSH tunnel to the VM, forwarding to the MongoDB port (27017) on its loopback
# interface. The tunnel is only configured here; it is started in the next cell.
server = SSHTunnelForwarder(
    MONGO_HOST,
    ssh_username=MONGO_USER,
    ssh_password=MONGO_PASS,
    remote_bind_address=('127.0.0.1', 27017),
)

In [None]:
server.start() # remember to stop

In [None]:
# Reach the VM's MongoDB through the tunnel: connect to the local end of the
# forward, whose port is the one assigned to server.local_bind_port.
client = pymongo.MongoClient(host='127.0.0.1', port=server.local_bind_port)
db = client.get_database(MONGO_DB)
db.list_collection_names()

The `dumps` function imported below converts ObjectId and datetime values into JSON-compatible representations.

In [None]:
from bson.json_util import dumps

### Export twitch data

The Twitch collection is by far the biggest one in our project. For this reason we will work on smaller subsets, so that it is possible to download the collection files without running into memory errors.

In [None]:
# First slice: every document with a timestamp before June 15th 2019.
pipeline = [
    {"$match": {"timestamp": {"$lt": "2019-06-15"}}},
]
twitch_cursor = db.twitch.aggregate(pipeline)


In [None]:
# Serialise every document returned by the cursor into a single JSON string
# held in memory.
twitch_dump_before_06_15 = dumps(twitch_cursor)

In [None]:
# Write the serialised slice to a file in the current working directory.
with open("twitch_before_06_15.json", mode="w") as out_file:
    out_file.write(twitch_dump_before_06_15)

In [None]:
# Second slice: timestamps from June 15th (inclusive) to June 20th (exclusive).
pipeline_15_20 = [{"$match": {"$and": [
    {"timestamp": {"$gte": "2019-06-15"}},
    {"timestamp": {"$lt": "2019-06-20"}},
]}}]
twitch_cursor_15_20 = db.twitch.aggregate(pipeline_15_20)
twitch_dump_15_20 = dumps(twitch_cursor_15_20)
with open("twitch_06_15_20.json", "w") as f:
    f.write(twitch_dump_15_20)

# Release the serialised dump once it is on disk, as is done for the later
# slices, so it does not stay in memory while the next slice is exported.
del twitch_dump_15_20

In [None]:
# Third slice: timestamps from June 20th (inclusive) to June 24th (exclusive).
pipeline_20_24 = [{"$match": {"$and": [
    {"timestamp": {"$gte": "2019-06-20"}},
    {"timestamp": {"$lt": "2019-06-24"}},
]}}]
twitch_cursor_20_24 = db.twitch.aggregate(pipeline_20_24)
twitch_dump_20_24 = dumps(twitch_cursor_20_24)
with open("twitch_06_20_24.json", mode="w") as out_file:
    out_file.write(twitch_dump_20_24)

In [None]:
del twitch_dump_20_24

In [None]:
# Fourth slice: timestamps from June 24th (inclusive) to June 28th (exclusive).
pipeline_24_28 = [{"$match": {"$and": [
    {"timestamp": {"$gte": "2019-06-24"}},
    {"timestamp": {"$lt": "2019-06-28"}},
]}}]
twitch_cursor_24_28 = db.twitch.aggregate(pipeline_24_28)
twitch_dump_24_28 = dumps(twitch_cursor_24_28)
with open("twitch_06_24_28.json", mode="w") as out_file:
    out_file.write(twitch_dump_24_28)

In [None]:
del twitch_dump_24_28

### Export twitter collection

In [None]:
db.twitter.find_one()

In [None]:
from datetime import datetime

# Upper bound for the Twitter export: midnight at the start of June 28th 2019.
end = datetime(year=2019, month=6, day=28)

In [None]:
# Export every tweet whose 'date' is strictly before `end` and write the
# serialised result to a file in the current working directory.
pipeline = [
    {"$match": {"date": {"$lt": end}}},
]
twitter = db.twitter.aggregate(pipeline)
twitter_dump = dumps(twitter)
with open("twitter.json", mode="w") as out_file:
    out_file.write(twitter_dump)

**Remember to close the connection!**

In [None]:
server.stop()