In [4]:
import io
import os

In [5]:
import pandas as pd
import numpy as np
import pickle

In [7]:
import boto3
from botocore.client import Config
import s3fs

In [8]:
ACCESS_KEY_ID = os.environ.get('STEMSEARCH_AWS_ACCESS_KEY_ID')
SECRET_ACCESS_KEY = os.environ.get('STEMSEARCH_AWS_SECRET_ACCESS_KEY')
BUCKET_NAME = os.environ.get('STEMSEARCH_AWS_BUCKET_NAME')

In [43]:
with open('../model/arxiv_model.pkl', 'rb') as f:
    model = pickle.load(f)

model

MiniBatchKMeans(batch_size=1500, compute_labels=True, init='k-means++',
                init_size=None, max_iter=100, max_no_improvement=10,
                n_clusters=1000, n_init=3, random_state=None,
                reassignment_ratio=0.01, tol=0.0, verbose=0)

# Using boto3

In [9]:
# S3 Connect
s3 = boto3.resource(
    's3',
    aws_access_key_id=ACCESS_KEY_ID,
    aws_secret_access_key=SECRET_ACCESS_KEY,
    config=Config(signature_version='s3v4')
)

In [10]:
bucket = s3.Bucket(BUCKET_NAME)

In [11]:
for o in bucket.objects.all():
    print(o.key)

arxiv_model.pkl
arxiv_papers.parquet
cord_papers.parquet
covid_model.pkl
covid_tfidf_vectorizer.pkl
tfidf_vectorizer.pkl


In [30]:
obj = s3.Object(BUCKET_NAME, 'tfidf_vectorizer.pkl')

In [31]:
body = obj.get()['Body']

In [32]:
vec = body.read()

In [33]:
pickle.loads(vec)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [12]:
obj2 = s3.Object(BUCKET_NAME, 'arxiv_model.pkl')

In [13]:
body2 = obj2.get()['Body']

In [14]:
model = body2.read()

In [15]:
pickle.loads(model)

MiniBatchKMeans(batch_size=1500, compute_labels=True, init='k-means++',
                init_size=None, max_iter=100, max_no_improvement=10,
                n_clusters=1000, n_init=3, random_state=None,
                reassignment_ratio=0.01, tol=0.0, verbose=0)

In [19]:
obj3 = s3.Object(BUCKET_NAME, 'cord_papers.parquet')

In [21]:
pd.read_parquet(obj3.get()['Body']).head()

AttributeError: 'StreamingBody' object has no attribute 'closed'


ValueError: I/O operation on closed file

# Using s3fs

In [22]:
fs = s3fs.S3FileSystem(anon=False)

In [23]:
path = 's3://stemsearch/cord_papers.parquet'

In [24]:
d = pd.read_parquet(path)

In [25]:
d.head()

Unnamed: 0,title,authors,abstract,publish_time,journal
0,Intrauterine virus infections and congenital h...,"Overall, James C.",Abstract The etiologic basis for the vast majo...,1972-12-31,American Heart Journal
1,Clinical and immunologic studies in identical ...,"Brunner, Carolyn M.; Horwitz, David A.; Shann,...","Abstract Middle-aged female identical twins, o...",1973-08-31,The American Journal of Medicine
2,Epidemiology of community-acquired respiratory...,"Garibaldi, Richard A.",Abstract Upper respiratory tract infections ar...,1985-06-28,The American Journal of Medicine
3,Infectious diarrhea: Pathogenesis and risk fac...,"Cantey, J.Robert",Abstract Our understanding of the pathogenesis...,1985-06-28,The American Journal of Medicine
4,New perspectives on the pathogenesis of rheuma...,"Zvaifler, Nathan J.",Abstract In the pathogenesis of rheumatoid art...,1988-10-14,The American Journal of Medicine


In [55]:
fs.ls(path)

['stemsearch/cord_papers.parquet']

In [58]:
with fs.open(path) as f:
    print(pd.read_parquet(f).head())

                                               title  \
0  Intrauterine virus infections and congenital h...   
1  Clinical and immunologic studies in identical ...   
2  Epidemiology of community-acquired respiratory...   
3  Infectious diarrhea: Pathogenesis and risk fac...   
4  New perspectives on the pathogenesis of rheuma...   

                                             authors  \
0                                  Overall, James C.   
1  Brunner, Carolyn M.; Horwitz, David A.; Shann,...   
2                              Garibaldi, Richard A.   
3                                   Cantey, J.Robert   
4                                Zvaifler, Nathan J.   

                                            abstract publish_time  \
0  Abstract The etiologic basis for the vast majo...   1972-12-31   
1  Abstract Middle-aged female identical twins, o...   1973-08-31   
2  Abstract Upper respiratory tract infections ar...   1985-06-28   
3  Abstract Our understanding of the pathogenesis.

In [59]:
vec_path = 's3://stemsearch/covid_tfidf_vectorizer.pkl'

In [64]:
with fs.open(vec_path) as f:
    covid_vectorizer = pickle.load(f)

In [65]:
covid_vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [82]:
model_path = 's3://stemsearch/covid_model.pkl'
with fs.open(model_path) as f:
    covid_model = pickle.load(f)

KeyboardInterrupt: 

In [71]:
covid_model

MiniBatchKMeans(batch_size=1000, compute_labels=True, init='k-means++',
                init_size=None, max_iter=100, max_no_improvement=10,
                n_clusters=800, n_init=3, random_state=None,
                reassignment_ratio=0.01, tol=0.0, verbose=3)

In [27]:
import time

In [29]:
ts = time.time()

model_path = 's3://stemsearch/arxiv_model.pkl'
with fs.open(model_path) as f:
    covid_model = pickle.load(f)

te = time.time()
print('time elapsed: {}s', te-ts)

time elapsed: {}s 893.7476079463959


In [30]:
covid_model

MiniBatchKMeans(batch_size=1500, compute_labels=True, init='k-means++',
                init_size=None, max_iter=100, max_no_improvement=10,
                n_clusters=1000, n_init=3, random_state=None,
                reassignment_ratio=0.01, tol=0.0, verbose=0)