# Dask and Castra

**This cell shows the fix for the `metadata.json` file:** its lines are not valid JSON (they use single quotes), so we parse each line with `ast.literal_eval` from the standard-library `ast` module instead of a JSON parser. Parsing is slower this way, but it works.

In [1]:
import gzip
import ast

metadata_gz = "/home/ec2-user/amazon_dataset/metadata.json.gz"
fivecore = '/home/ec2-user/amazon_dataset/kcore_5.json.gz'

with gzip.open(metadata_gz, 'rb') as f:
    for i,l in enumerate(f):
        try:
            k = ast.literal_eval(l)
        except ValueError, e: 
            print '\nERR\n',e,'\n' , l
            break
        if i % 1e5 == 0 and i>0:
            print i,
        
        #Just not to process the whole file, we stop the computation after 1.000.000 lines
        if i % 1e6 == 0 and i>0:
            break

print i
print "Done"

100000 200000 300000 400000 500000 600000 700000 800000 900000 1000000 1000000
Done


Now, the final version of the working implementation, step by step:

All the **imports**:

In [1]:
import ujson
import gzip
import ast
from pandas import DataFrame
from toolz import dissoc
from toolz import dissoc, partition_all
from castra import Castra
import time
import datetime
import dask.dataframe as dd
import dask.bag as db
from dask.diagnostics import ProgressBar

Initializing variables like **paths**, **column names** and **chunk size**:

In [18]:
#Directory that holds the gzipped inputs; the castra outputs are written next to them
path_to = "/home/ec2-user/amazon_dataset/"

#File names of the two gzipped inputs, relative to path_to
reviews = "kcore_5.json.gz"
metadata = "metadata.json.gz"

#Columns kept from each review record
reviews_columns = [
    'asin',
    'reviewerID',
    'reviewerName',
    'overall',
    'summary',
    'reviewText',
    'reviewTime',
    'unixReviewTime',
]

#Columns kept from each metadata record
metadata_columns = [
    'asin',
    'title',
    'price',
    'imUrl',
    'related',
    'also_bought',
    'also_viewed',
    'bought_together',
    'salesRank',
    'brand',
    'categories',
]

#Number of input lines converted into one DataFrame at a time
chunksize = 5000

Implementation of the user-defined **functions** used in the script:

In [19]:
def to_json(line):
    """Decode one line of valid JSON text with ujson and return the resulting object."""
    record = ujson.loads(line)
    return record

def fix_json(line):
    """Parse one line that is not valid JSON (single-quoted) as a Python literal.

    The text is evaluated with ast.literal_eval and the resulting object is returned.
    """
    record = ast.literal_eval(line)
    return record

def to_df(batch,filename):
    """Turn a batch of raw text lines into a DataFrame.

    batch:    iterable of lines, one record per line.
    filename: 'metadata' selects fix_json and metadata_columns; any other
              value selects to_json and reviews_columns.
    """
    is_metadata = filename == 'metadata'
    parse = fix_json if is_metadata else to_json
    columns = metadata_columns if is_metadata else reviews_columns
    records = [parse(line) for line in batch]
    return DataFrame.from_records(records, columns=columns)

#Create the castra dataset for improved I/O operations with Dask DataFrames
def create_castra(fullpath,chunksize):
    """Convert a gzipped, line-delimited file into a castra dataset, chunk by chunk.

    fullpath:  path of the gzipped input. The part of its base name before the
               first dot names the output and is handed to to_df to select the parser.
    chunksize: number of lines converted into one DataFrame per iteration.

    The dataset is written to path_to + <name> + '.castra', where path_to is the
    module-level variable. Nothing is returned, and no Castra object is created
    when the input has no lines.
    """
    filename = fullpath.split('/')[-1].split('.')[0]
    #gzip.open lets us iterate over the lines of the compressed file directly
    with gzip.open(fullpath,'rb') as f:
        batches = partition_all(chunksize, f)
        castra = None
        for batch in batches:
            df = to_df(batch,filename)
            #The Castra object is created lazily, with the first chunk passed as its template.
            #Identity test rather than "== None": None is a singleton and "is" never calls a custom __eq__
            if castra is None:
                castra = Castra(path_to+filename+'.castra', template=df)
            castra.extend(df)

**IMPORTANT!!! DON'T RUN TWICE!** | **Execution of the script**: this may take a while... 

In [5]:
print 'Starting the creation of the Castra files...'

#Creating the castra file for metadata
print 'Processing compressed metadata...'
start = time.time()
create_castra(path_to+metadata,chunksize)
end = time.time()
print "Done! Metadata processed in:",datetime.timedelta(seconds=(end-start))

Starting the creation of the Castra files...
Processing compressed metadata...
Done! Metadata processed in: 0:29:01.749161


In [6]:
#Creating the castra file for the 5_cores
print 'Processing compressed 5_cores...'
start = time.time()
create_castra(path_to+reviews,chunksize)
end = time.time()
print "Done! Reviews data processed in:",datetime.timedelta(seconds=(end-start))

Processing compressed 5_cores...
Done! Reviews data processed in: 0:26:57.887684


After the creation of the castra files, we can start playing with the **Dask DataFrames**:

In [2]:
# Location of the castra dataset created from the reviews file
path_to_castra = '/home/ec2-user/amazon_dataset/kcore_5.castra'

# Register a global progress bar: shown for computations lasting at least 3 seconds, refreshed every 0.5 seconds
pbar = ProgressBar(dt=0.5, minimum=3.0)
pbar.register()

# Open the castra dataset as a dask dataframe
df = dd.from_castra(path_to_castra)

In [3]:
# Preview the first five rows of the dask dataframe
df.head(n=5)

Unnamed: 0,asin,reviewerID,reviewerName,overall,summary,reviewText,reviewTime,unixReviewTime
0,13714,ACNGUPJ3A3TM9,GCM,4.0,Nice Hymnal,We use this type of hymnal at church. I was l...,"12 3, 2013",1386028800
1,13714,A2SUAM1J3GNN3B,J. McDonald,5.0,Heavenly Highway Hymns,I bought this for my husband who plays the pia...,"09 13, 2009",1252800000
2,13714,APOZ15IEYQRRR,maewest64,5.0,Awesome Hymn Book,This is a large size hymn book which is great ...,"03 9, 2013",1362787200
3,13714,AYEDW3BFK53XK,Missb,5.0,Hand Clapping Toe Tapping Oldies,We use this hymn book at the mission. It has ...,"01 2, 2012",1325462400
4,13714,A1KLCGLCXYP1U1,"Paul L ""Paul Lytle""",3.0,Misleading,"One review advised this book was large print, ...","08 10, 2013",1376092800


In [4]:
# Count the non-null values of every column
non_null_counts = df.count()
non_null_counts.compute()

[########################################] | 100% Completed | 10min 49.6s


asin              41135700
overall           41135700
reviewText        41135700
reviewTime        41135700
reviewerID        41135700
reviewerName      40203149
summary           41135700
unixReviewTime    41135696
dtype: int64

In [5]:
# The ten asin values that occur most often, with their number of rows
rows_per_asin = df.asin.value_counts()
rows_per_asin.nlargest(10).compute()

[########################################] | 100% Completed |  1min 28.2s


B00FAPF5U0    13550
B0051VVOB2    11981
B0074BW614    10836
030758836X    10552
0439023483    10404
B00DR0PDNE    10139
B007WTAJTO     9771
B006GWO5WK     9008
B005SUHPO6     8963
B0064X7B4A     8808
Name: asin, dtype: int64

## Reading the `.json` file directly with a Dask bag, without DataFrames

In [7]:
# Read the file as a bag of text lines, decode each line with ujson, then count the records
raw_lines = db.read_text("/home/ec2-user/amazon_dataset/reviews_Books_5.json")
js = raw_lines.map(ujson.loads)
js.count().compute()

[########################################] | 100% Completed |  2min 27.1s


8898041