In [None]:
import pymongo
import json
from sshtunnel import SSHTunnelForwarder

This notebook contains the code used to download the data stored in the VM collections and the code needed to insert that data into another MongoDB instance. The recommended way to export and import MongoDB data is through the `mongoexport` and `mongoimport` utilities. We chose a programmatic approach instead because it makes it easy to filter the data we want to export: in particular, we only wanted to export data stored before the 28th of June, because that was the week in which we wrote the project report.

# Download data from VM collections

### SSH Tunnel

In [None]:
# Read the connection settings from the keys file kept one directory up.
with open('../keys.json') as key_file:
    keys = json.load(key_file)

ssh_settings = keys["SSH"]
MONGO_HOST = ssh_settings["MONGO_HOST"]  # e.g. 10.9.13.14 (passed below as the SSH host)
MONGO_DB = ssh_settings["MONGO_DB"]      # e.g. dm_project
MONGO_USER = ssh_settings["MONGO_USER"]  # e.g. studente (passed below as the SSH username)
MONGO_PASS = ssh_settings["MONGO_PASS"]  # the VM password (passed below as the SSH password)

# Configure an SSH tunnel whose remote end is port 27017 on the remote host's
# loopback interface. Nothing is opened yet: the tunnel is started separately
# with server.start().
server = SSHTunnelForwarder(
    MONGO_HOST,
    ssh_username=MONGO_USER,
    ssh_password=MONGO_PASS,
    remote_bind_address=('127.0.0.1', 27017),
)

In [None]:
# Open the SSH tunnel. It has to be shut down explicitly with server.stop()
# once the export is finished.
server.start() # remember to stop

In [None]:
# Connect to MongoDB through the tunnel: the client talks to the local end of
# the forwarder, whose port number is assigned when the tunnel starts.
client = pymongo.MongoClient(host='127.0.0.1', port=server.local_bind_port)
db = client[MONGO_DB]
# Displayed as the cell output: quick check that the connection works.
db.list_collection_names()

The `dumps` function from `bson.json_util` is needed to convert BSON-specific types, such as ObjectId and datetime values, into JSON-compatible representations.

In [None]:
from bson.json_util import dumps

### Export Twitch data

The Twitch collection is by far the biggest one in our project. For this reason we work on smaller subsets, so that the collection can be downloaded without running into memory errors.

In [None]:
# First Twitch chunk: every document whose timestamp is below '2019-06-15'.
# NOTE(review): the bound is a string, so this presumably relies on the
# timestamps being stored as ISO-formatted strings rather than dates - confirm.
before_06_15 = {'timestamp' : {'$lt' : '2019-06-15'}}
pipeline = [{ "$match" : before_06_15}]
twitch_cursor = db.twitch.aggregate(pipeline)


In [None]:
# Serialize the documents returned by the cursor into one JSON string
# (bson.json_util.dumps handles the BSON-specific types).
# NOTE(review): unlike twitch_dump_20_24 and twitch_dump_24_28, this string is
# never `del`-ed in the notebook - consider freeing it once it has been written.
twitch_dump_before_06_15 = dumps(twitch_cursor)

In [None]:
# Write the first Twitch chunk to a JSON file in the working directory.
with open("twitch_before_06_15.json", "w") as out_file:
    out_file.write(twitch_dump_before_06_15)

In [None]:
# Second Twitch chunk: '2019-06-15' <= timestamp < '2019-06-20'.
# NOTE(review): the bounds are strings, so this presumably relies on the
# timestamps being stored as ISO-formatted strings rather than dates - confirm.
pipeline_15_20 = [{"$match": {'$and': [
    {'timestamp': {'$gte': '2019-06-15'}},
    {'timestamp': {'$lt': '2019-06-20'}},
]}}]
twitch_cursor_15_20 = db.twitch.aggregate(pipeline_15_20)
twitch_dump_15_20 = dumps(twitch_cursor_15_20)
with open("twitch_06_15_20.json", "w") as f:
    f.write(twitch_dump_15_20)

# Release the serialized chunk as soon as it is on disk, as is done for the
# 20-24 and 24-28 chunks; otherwise the whole string stays in memory while the
# following chunks are exported.
del twitch_dump_15_20

In [None]:
# Third Twitch chunk: '2019-06-20' <= timestamp < '2019-06-24'.
from_06_20 = {'timestamp' : {'$gte' : '2019-06-20'}}
until_06_24 = {'timestamp' : {'$lt' : '2019-06-24'}}
pipeline_20_24 = [{"$match": {'$and': [from_06_20, until_06_24]}}]

# Run the query, serialize the result and store it in the working directory.
twitch_cursor_20_24 = db.twitch.aggregate(pipeline_20_24)
twitch_dump_20_24 = dumps(twitch_cursor_20_24)
with open("twitch_06_20_24.json", "w") as out_file:
    out_file.write(twitch_dump_20_24)

In [None]:
# Drop the reference to the serialized 20-24 June chunk so its memory can be
# reclaimed before the next chunk is exported.
del twitch_dump_20_24

In [None]:
# Fourth Twitch chunk: '2019-06-24' <= timestamp < '2019-06-28'.
from_06_24 = {'timestamp' : {'$gte' : '2019-06-24'}}
until_06_28 = {'timestamp' : {'$lt' : '2019-06-28'}}
pipeline_24_28 = [{"$match": {'$and': [from_06_24, until_06_28]}}]

# Run the query, serialize the result and store it in the working directory.
twitch_cursor_24_28 = db.twitch.aggregate(pipeline_24_28)
twitch_dump_24_28 = dumps(twitch_cursor_24_28)
with open("twitch_06_24_28.json", "w") as out_file:
    out_file.write(twitch_dump_24_28)

In [None]:
# Drop the reference to the serialized 24-28 June chunk so its memory can be
# reclaimed.
del twitch_dump_24_28

### Export twitter collection

In [None]:
# Show a single document from the twitter collection to inspect its structure.
db.twitter.find_one()

In [None]:
from datetime import datetime

# Exclusive upper bound for the Twitter export: midnight at the start of
# 28 June 2019 (naive datetime, no timezone attached).
end = datetime(year=2019, month=6, day=28)

In [None]:
# Export every document of the twitter collection whose `date` is before `end`.
before_end = {'date' : {'$lt' : end}}
pipeline = [{ "$match" : before_end}]
twitter = db.twitter.aggregate(pipeline)

# Serialize the result and store it in the working directory.
twitter_dump = dumps(twitter)
with open("twitter.json", "w") as out_file:
    out_file.write(twitter_dump)

**Remember to close the connection!**

In [None]:
# Close the MongoDB client first so that no connection is left open through
# the tunnel, then shut the SSH tunnel down.
client.close()
server.stop()

# Insert the downloaded data into another MongoDB instance

## Important: change all IPs and paths according to your system configuration. Do not change the DB and collection names, otherwise the other notebooks won't work

In [None]:
# Connect to the target MongoDB instance and select the project database.
client = pymongo.MongoClient(host="localhost", port=27017)
db = client["dm_project"]
# Displayed as the cell output: the collections currently present in the database.
db.list_collection_names()

In [None]:
from bson.json_util import loads

In [None]:
def load_json_dump(fn):
    """Read a JSON dump file and parse it with ``loads``.

    Args:
        fn: Path of the JSON file to read.

    Returns:
        The object that ``loads`` (imported from ``bson.json_util``) builds
        from the file content. For the files written by the export cells,
        which serialize a whole cursor, that is a list of documents.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, so the same dump loads identically on every machine.
    with open(fn, encoding="utf-8") as f:
        return loads(f.read())


### Twitch

In [None]:
# Twitch dump files to import, one per exported date range.
twitch_files = [
    "twitch_before_06_15.json",
    "twitch_06_15_20.json",
    "twitch_06_20_24.json",
    "twitch_06_24_28.json",
]

In [None]:
# Import the Twitch dumps one file at a time, so that only one chunk is held
# in memory at once.
for dump_file in twitch_files:
    print(dump_file)
    collection = load_json_dump(dump_file)
    print("Loaded")
    if not collection:
        # insert_many() raises TypeError when given an empty list; an empty
        # dump (no documents in that date range) must not abort the import.
        print("Nothing to insert")
        continue
    db.twitch.insert_many(collection)
    print("Inserted!")

Check if data was loaded correctly

In [None]:
# Show a single document from the twitch collection as a sanity check on the import.
db.twitch.find_one()

In [None]:
# Number of documents now in the twitch collection. Collection.count() is
# deprecated since PyMongo 3.7 and removed in PyMongo 4, so use
# count_documents() with an empty filter, which matches every document.
db.twitch.count_documents({})

# Insert Twitter Data

In [None]:
# Parse the Twitter dump file into memory.
twitter_collection = load_json_dump("twitter.json")

In [None]:
# Bulk-insert the loaded documents into the twitter collection of the target database.
db.twitter.insert_many(twitter_collection)

In [None]:
# Show a single document from the twitter collection as a sanity check on the import.
db.twitter.find_one()