Create Journal Metadata Sqlite
===

Using the most refined journal metadata, create a sqlite database.

`journal_updated_metadata.df` (a feather file) is the most recent version and the last created in the notebook that does so: `caringbridge_core/raw_data_conversion/JournalMetadataExtractionUpdated.ipynb`

In [2]:
import numpy as np
import pandas as pd

import os
import sqlite3
from datetime import datetime

In [11]:
input_filepath = "/home/srivbane/shared/caringbridge/data/derived/journal_metadata/journal_updated_metadata.df"
output_filepath = "/home/lana/shared/caringbridge/data/projects/recsys-peer-match/sqlite/journal_metadata.sqlite"

In [4]:
!du -h {input_filepath}

2.6G	/home/srivbane/shared/caringbridge/data/derived/journal_metadata/journal_updated_metadata.df


In [5]:
df = pd.read_feather(input_filepath)
len(df)

17893390

In [6]:
df.sample(n=5)

Unnamed: 0,site_id,journal_oid,user_id,userId_repr,created_at,updated_at,createdAt_repr,updatedAt_repr,published_at,amp_count,photos_count,videos_count,platform,lastEdit,site_index,is_nontrivial
4346681,102178,51be12c56ca004fb3000dd0d,4654152,4654152.0,1237671900000,1237675500000,1237672000000.0,1237676000000.0,0,0,0,0,,,60,True
878292,17044,51bdf7f56ca004335b00bae0,688679,688679.0,1152188220000,1152188220000,1152188000000.0,1152188000000.0,0,0,0,0,,,13,True
4277203,100196,51be12186ca004ed2e007e6c,4347944,4347944.0,1244301660000,0,1244302000000.0,0.0,0,0,0,0,,,20,True
5017547,121376,51be17ce6ca004c84000e7c0,5560119,5560119.0,1244363460000,0,1244363000000.0,0.0,0,0,0,0,,,90,True
3962986,90774,51be0f5e6ca004602600d24e,4126958,4126958.0,1239828240000,0,1239828000000.0,0.0,0,0,0,0,,,54,True


In [7]:
df.isna().sum()

site_id                  0
journal_oid              0
user_id                  0
userId_repr          63323
created_at               0
updated_at               0
createdAt_repr          44
updatedAt_repr          94
published_at             0
amp_count                0
photos_count             0
videos_count             0
platform          16026882
lastEdit          13850907
site_index               0
is_nontrivial            0
dtype: int64

In [8]:
df = df[['site_id', 'journal_oid', 'user_id', 'created_at', 'updated_at', 'published_at', 'amp_count', 'photos_count', 'videos_count', 'platform', 'lastEdit', 'site_index', 'is_nontrivial']]
df.head()

Unnamed: 0,site_id,journal_oid,user_id,created_at,updated_at,published_at,amp_count,photos_count,videos_count,platform,lastEdit,site_index,is_nontrivial
0,1,51bdf3e56ca0048f4e00ced4,1,1103856900000,1103856900000,0,1,0,0,,,0,True
1,1,51bdf3e56ca0048f4e00ced2,1,1103943240000,1103943240000,0,0,0,0,,,1,True
2,1,51bdf3e56ca0048f4e00ced0,1,1104288840000,1104288840000,0,0,0,0,,,2,True
3,1,51bdf3e56ca0048f4e00cece,1,1104461640000,1104461640000,0,0,0,0,,,3,True
4,1,51bdf3e56ca0048f4e00cecc,1,1104807180000,1104807180000,0,0,0,0,,,4,True


In [10]:
df.dtypes

site_id            int64
journal_oid       object
user_id            int64
created_at         int64
updated_at         int64
published_at       int64
amp_count          int64
photos_count       int64
videos_count       int64
platform          object
lastEdit         float64
site_index         int64
is_nontrivial       bool
dtype: object

In [12]:
def get_db(db_filename):
    db = sqlite3.connect(
            db_filename,
            detect_types=sqlite3.PARSE_DECLTYPES
        )
    db.row_factory = sqlite3.Row
    return db

conn = get_db(output_filepath)
s = datetime.now()
df.to_sql('journal', conn, index=False, chunksize=1000000)
print(datetime.now() - s)

0:01:34.905085


I created the following indices:

    CREATE INDEX journal_journalOid ON journal (journal_oid);
    CREATE INDEX journal_siteId ON journal (site_id);
    CREATE INDEX journal_userId ON journal (user_id);