# Snapshotting

## IML snapshots

### Functions

In [1]:
import requests
import jsonlines
from smart_open import open
import os
import json
import boto3
from impresso_commons.path.path_s3 import IMPRESSO_STORAGEOPT
from tqdm import tqdm

In [2]:
IML_baseuri = "https://impresso-project.ch/api/"
IML_articles_endpoint = os.path.join(IML_baseuri, "articles/")
IML_pages_endpoint = os.path.join(IML_baseuri, "pages/")

In [3]:
def get_total_contentitems(token):
    """Get total number of content items from IML."""
    response = requests.get(
        IML_articles_endpoint,
        headers={'Authorization': f'Bearer {jwt}'}
    )
    try:
        assert response.status_code == 200
    except Exception as e:
        raise e
        
    data = response.json()
    return data['total']
        

In [4]:
def get_contentitem_ids(skip, limit, token):
    """Get a bunch of content item IDs from IML."""
    payload = {
        "skip": skip,
        "limit": limit
    }
    response = requests.get(
        IML_articles_endpoint,
        headers={'Authorization': f'Bearer {jwt}'},
        params=payload
    )
    try:
        assert response.status_code == 200
    except Exception as e:
        print(response.status_code)
        raise e
        
    data = response.json()['data']
    return [
        {         
            "id": ci["uid"],
            "np": ci['newspaper']["uid"],
            "tp": ci["type"],
            "t": ci["title"]
        }
        for ci in data
    ]

In [5]:
def get_total_pages(token):
    """Get total number of pages from IML."""
    response = requests.get(
        IML_pages_endpoint,
        headers={'Authorization': f'Bearer {jwt}'}
    )
    print(f'Querying {IML_pages_endpoint}')
    try:
        assert response.status_code == 200
    except Exception as e:
        print(f'Got {response.status_code}')
        raise e
        
    data = response.json()
    return data

In [6]:
def get_page_ids(skip, limit, token):
    """Get a bunch of pages from IML."""
    payload = {
        "skip": skip,
        "limit": limit
    }
    response = requests.get(
        IML_pages_endpoint,
        headers={'Authorization': f'Bearer {jwt}'},
        params=payload
    )
    try:
        assert response.status_code == 200
    except Exception as e:
        print(IML_pages_endpoint)
        print(response.status_code)
        raise e
        
    data = response.json()['data']
    return [
        page
        for page in data
    ]

In [18]:
def write_snapshot_to_s3(data, path):
    
    session = boto3.Session(
        aws_access_key_id=IMPRESSO_STORAGEOPT['key'],
        aws_secret_access_key=IMPRESSO_STORAGEOPT['secret'],
    )
    
    transport_params = {
        'session': session,
        'resource_kwargs': {
            'endpoint_url': IMPRESSO_STORAGEOPT['client_kwargs']['endpoint_url'],
        }
    }
    
    with open(path, 'w', transport_params=transport_params) as outfile:
        
        with jsonlines.Writer(outfile) as writer:
            writer.write_all(data)
    
    return
    

### Snapshot of Content Item IDs

In [14]:
# once logged in the impresso interface, the token can be found
# in the browser localstorage. For Chrome/Chromium go to: developer tools > application > localstorage
# and copy the value of variable `feathers_jwt`

jwt = "eyJhbGciOiJIUzI1NiIsInR5cCI6ImFjY2VzcyJ9.eyJ1c2VySWQiOiJsb2NhbC1tciIsImlzU3RhZmYiOnRydWUsImciOlsibmRhIl0sImlhdCI6MTU1NzQ4NjAwNywiZXhwIjoxNTU4MDkwODA3LCJhdWQiOiJodHRwOi8vbG9jYWxob3N0OjMwMzAiLCJpc3MiOiJmZWF0aGVycyIsInN1YiI6ImFub255bW91cyIsImp0aSI6IjU5NDAzNTQ5LTg4MGQtNDE5OC1iYmUzLTMyNGUxMjcyZGE4MSJ9.7VSuaJ8IpGDp3YWjFNMcQK1j0feK5Q2kWU76nNMA3pg"

In [15]:
req_limit = 100

In [16]:
total = get_total_contentitems(token=jwt)

In [11]:
offsets = list(range(0, total, req_limit))

In [12]:
len(offsets)

273086

In [13]:
offsets[:1000]

[0,
 100,
 200,
 300,
 400,
 500,
 600,
 700,
 800,
 900,
 1000,
 1100,
 1200,
 1300,
 1400,
 1500,
 1600,
 1700,
 1800,
 1900,
 2000,
 2100,
 2200,
 2300,
 2400,
 2500,
 2600,
 2700,
 2800,
 2900,
 3000,
 3100,
 3200,
 3300,
 3400,
 3500,
 3600,
 3700,
 3800,
 3900,
 4000,
 4100,
 4200,
 4300,
 4400,
 4500,
 4600,
 4700,
 4800,
 4900,
 5000,
 5100,
 5200,
 5300,
 5400,
 5500,
 5600,
 5700,
 5800,
 5900,
 6000,
 6100,
 6200,
 6300,
 6400,
 6500,
 6600,
 6700,
 6800,
 6900,
 7000,
 7100,
 7200,
 7300,
 7400,
 7500,
 7600,
 7700,
 7800,
 7900,
 8000,
 8100,
 8200,
 8300,
 8400,
 8500,
 8600,
 8700,
 8800,
 8900,
 9000,
 9100,
 9200,
 9300,
 9400,
 9500,
 9600,
 9700,
 9800,
 9900,
 10000,
 10100,
 10200,
 10300,
 10400,
 10500,
 10600,
 10700,
 10800,
 10900,
 11000,
 11100,
 11200,
 11300,
 11400,
 11500,
 11600,
 11700,
 11800,
 11900,
 12000,
 12100,
 12200,
 12300,
 12400,
 12500,
 12600,
 12700,
 12800,
 12900,
 13000,
 13100,
 13200,
 13300,
 13400,
 13500,
 13600,
 13700,
 13800,


In [21]:
ci_ids = []

for offset in tqdm(offsets):
     ci_ids += get_contentitem_ids(offset, limit=req_limit, token=jwt)

  4%|▎         | 9749/273086 [2:54:15<107:51:04,  1.47s/it]

KeyboardInterrupt: 

In [50]:
ci_ids

[{'id': 'IMP-2002-04-23-a-i0172', 'np': 'IMP', 'tp': 'ad', 't': ''},
 {'id': 'IMP-2002-04-23-a-i0198', 'np': 'IMP', 'tp': 'ad', 't': ''},
 {'id': 'IMP-2002-04-23-a-i0255', 'np': 'IMP', 'tp': 'ad', 't': ''},
 {'id': 'IMP-2002-04-23-a-i0298', 'np': 'IMP', 'tp': 'ad', 't': ''},
 {'id': 'IMP-2002-04-23-a-i0037', 'np': 'IMP', 'tp': 'ad', 't': ''},
 {'id': 'IMP-2002-04-23-a-i0084', 'np': 'IMP', 'tp': 'ad', 't': ''},
 {'id': 'IMP-2002-04-23-a-i0105', 'np': 'IMP', 'tp': 'ad', 't': ''},
 {'id': 'IMP-2002-04-23-a-i0123', 'np': 'IMP', 'tp': 'ad', 't': ''},
 {'id': 'IMP-2002-04-23-a-i0173', 'np': 'IMP', 'tp': 'ad', 't': ''},
 {'id': 'IMP-2002-04-23-a-i0199', 'np': 'IMP', 'tp': 'ad', 't': ''},
 {'id': 'IMP-2002-04-23-a-i0217', 'np': 'IMP', 'tp': 'ad', 't': ''},
 {'id': 'IMP-2002-04-23-a-i0256', 'np': 'IMP', 'tp': 'ad', 't': ''},
 {'id': 'IMP-2002-04-23-a-i0299', 'np': 'IMP', 'tp': 'ad', 't': ''},
 {'id': 'IMP-2002-04-23-a-i0020', 'np': 'IMP', 'tp': 'ad', 't': ''},
 {'id': 'IMP-2002-04-23-a-i0038', 

In [96]:
write_snapshot_to_s3(
    ci_ids,
    "s3://snapshots/alpha/iml.jsonl.bz2"
)

### Snapshot of pages

In [None]:
total_pages = get_total_pages(token=jwt)

In [25]:
total_pages

27308519

In [26]:
pages_offsets = list(range(0, total, req_limit))

In [37]:
page_ids = []

for offset in tqdm(pages_offsets[:1]):
    page_ids += get_page_ids(offset, limit=req_limit, token=jwt)


  0%|          | 0/1 [00:00<?, ?it/s][A

405


AssertionError: 

#### Snapshot of Page IDs

## MySQL snapshots

In [8]:
import impresso_db
from impresso_db.base import Session, engine

In [9]:
os.environ["IMPRESSO_DB_CONFIG"] = "prod"

In [10]:
%%time
with engine.connect() as db_conn:
    q = "SELECT id, newspaper_id, title, type FROM impresso.content_items;"
    ids = db_conn.execute(q)

CPU times: user 4min 14s, sys: 12.9 s, total: 4min 27s
Wall time: 4min 27s


In [11]:
mysql_ci_ids = [
    {
        "id": ci_id,
        "np": np_id,
        "tp": ci_type,
        "t": ci_title
    }
    for ci_id, np_id, ci_title, ci_type in ids
]

In [12]:
len(mysql_ci_ids)

26521317

In [19]:
write_snapshot_to_s3(
    mysql_ci_ids,
    "s3://snapshots/alpha/mysql.jsonl.bz2"
)

In [21]:
!s3cmd ls s3://snapshots/alpha/

2019-05-10 10:43      2828   s3://snapshots/alpha/iml.jsonl.bz2
2019-05-10 07:49      2803   s3://snapshots/alpha/iml.txt.bz2
2019-05-13 10:22 190696389   s3://snapshots/alpha/mysql.jsonl.bz2
