In [1]:
import lxml.html
import ujson
import gzip
import csv

In [2]:
from pdb import set_trace as debug
from IPython.parallel import Client,require
from collections import Counter
from pandas.io.parsers import read_csv

In [3]:
# User configuration: edit the values in this cell before running the notebook.
#--> DUMP_DATE timestamp string embedded in every output file name (see OUT_BASE)
#--> RAW_FILE_DIR the directory that contains the raw test_appannie__*.json.gz input files
#--> OUT_DIR the directory where the processed CSV files are written
#--> DEBUG when truthy, each parser cell below runs its parser and loads the
#    resulting CSV into a DataFrame for display; when falsy those cells do nothing
#--> COMPRESS_LEVEL gzip compression level for the output files. warning: severely impacts runtime.
#--> LINE_LIMIT how many lines to iterate through in each raw file. For debugging.
#--> PARALLEL when truthy, connect to an IPython.parallel cluster (see the cell that checks PARALLEL)
# NOTE(review): the two directories are absolute paths on one developer's
# machine; anyone else must change them before running.
DUMP_DATE = '2015_03_07_23_12'
RAW_FILE_DIR = '/Users/kushagrasharma/Coding/data'
OUT_DIR = '/Users/kushagrasharma/Coding/data/out'
DEBUG = 1
COMPRESS_LEVEL = 1
LINE_LIMIT = 50 # -1 (or any value <= 0) means run for the entire file
PARALLEL = 0

In [4]:
# Don't change these!
# Paths derived from the configuration cell above.
# One gzip-compressed raw input file per store, all located in RAW_FILE_DIR.
RAW_FILE_IPHONE = '{}/test_appannie__iphone.json.gz'.format(RAW_FILE_DIR)
RAW_FILE_IPAD = '{}/test_appannie__ipad.json.gz'.format(RAW_FILE_DIR)
RAW_FILE_GOOGLE = '{}/test_appannie__google.json.gz'.format(RAW_FILE_DIR)

# Output path template: the remaining '{}' is filled in by each parser with its
# own label via OUT_BASE.format(label), giving OUT_DIR/<label>__<DUMP_DATE>.csv.gz
OUT_BASE = OUT_DIR+'/{}__'+DUMP_DATE+'.csv.gz'

In [5]:
# ! zcat "$RAW_FILE_IPHONE" | head -n1
# ! zcat "$RAW_FILE_IPAD" | head -n1
# ! zcat "$RAW_FILE_GOOGLE" | head -n1

In [6]:
def iter_json_gzip(filename,LINE_LIMIT=LINE_LIMIT,no_verify_appid_time=1):
    """Yield the JSON objects stored one-per-line in a gzip-compressed text file.

    Parameters
    ----------
    filename : str
        Path of the gzip file; it is opened in text mode.
    LINE_LIMIT : int
        Maximum number of physical lines to read. A value <= 0 disables the
        limit. The default is the module-level LINE_LIMIT as it was when this
        function was defined.
    no_verify_appid_time : bool-like
        When truthy (the default) every decoded dict is yielded. When falsy,
        only dicts containing both 'app_id' and 'timestamp' keys are yielded.

    Yields
    ------
    dict
        One decoded object per non-blank line. Lines that decode to something
        other than a dict are skipped.

    Raises
    ------
    Whatever ``ujson.loads`` raises for a non-blank line that is not valid JSON.
    """
    with gzip.open(filename,'rt') as file_iter:
        # Blank lines still count toward LINE_LIMIT, as every physical line does.
        for line_number, line in enumerate(file_iter, start=1):
            if LINE_LIMIT > 0 and line_number > LINE_LIMIT:
                break
            # Lines read from a file keep their trailing newline, so a length
            # check never detects a blank line. Skip whitespace-only lines
            # explicitly instead of handing them to the JSON parser, which
            # rejects empty input.
            if not line.strip():
                continue
            out = ujson.loads(line)
            if not isinstance(out, dict):
                continue
            if no_verify_appid_time or ('app_id' in out and 'timestamp' in out):
                yield out

In [7]:
if PARALLEL:
    # Connect to the IPython.parallel cluster and report how many engines it has.
    ipython_parallel = Client()
    print("{} active computing engines".format(len(ipython_parallel.ids)))

    lbv = ipython_parallel.load_balanced_view()

    # NOTE: deliberately replaces the builtin `map` with a non-blocking,
    # unordered map over the load-balanced view.
    def map(f, itertable):
        return lbv.map(f, itertable, block=False, ordered=False)

    @require('socket')
    def host(dummy):
        """Return the host name of the engine this call runs on."""
        return socket.gethostname()

    # Ask once per engine id for a host name, then keep only the integer that
    # sits between the text 'equity' and the next '.' in each name
    # (presumably names look like '...equity<N>.<domain>' -- confirm).
    nodes = [
        int(name.split('equity')[1].split(".")[0])
        for name in list(lbv.map(host, ipython_parallel.ids))
    ]

    # (node number, count) pairs, sorted by node number.
    print(sorted(Counter(nodes).items()))

# RANKINGSIPHONE

In [8]:
@require(iter_json_gzip,'ujson','gzip','csv','lxml.html')
def RANKPARSER_IOS(file_in,out_base,compress=1):
    """Convert the raw iPhone ranking dump into a gzip-compressed CSV.

    Each record read from `file_in` must provide an 'html' entry (the page
    markup) and a 'date' entry. Every ``<span style='display:none'>`` element
    in the markup contributes one CSV row whose app_id is the span's text.

    Parameters
    ----------
    file_in : str
        Path of the gzip-compressed, one-JSON-object-per-line input file.
    out_base : str
        Output path template containing one '{}' placeholder, which is filled
        with the label 'RANKINGSIPHONE'.
    compress : int
        gzip compression level for the output file.

    Returns
    -------
    str
        The path of the CSV file that was written.
    """
    headers = ['app_id', 'chart', 'date', 'country', 'rank']
    # Assumes the hidden spans are laid out row by row: one span per chart for
    # rank 1, then one per chart for rank 2, and so on -- TODO confirm against
    # the raw HTML.
    charts = ['free', 'paid', 'grossing']
    out_filename = out_base.format('RANKINGSIPHONE')
    # newline='' is what the csv module requires of the file object it writes
    # to; without it the writer's '\r\n' terminators are translated again on
    # platforms whose line separator is not '\n'.
    with gzip.open(out_filename,'wt',compresslevel=compress,newline='') as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        # `record` rather than `json`, so the loop variable does not shadow the
        # name of the standard-library module.
        for record in iter_json_gzip(file_in):
            tree = lxml.html.fromstring(record['html'])
            app_ids = tree.xpath("//span[@style='display:none']")
            for i,app in enumerate(app_ids):
                # Integer arithmetic: row index -> 1-based rank, column -> chart.
                row, column = divmod(i, len(charts))
                # The country is hard-coded; the record is not consulted for it.
                app_info = [app.text_content(), charts[column], record['date'], 'United States', row + 1]
                csv_writer.writerow(app_info)
    return out_filename

In [9]:
# In DEBUG mode, run the iPhone parser and read its CSV back for inspection;
# otherwise leave df_debug as None.
df_debug = (
    read_csv(
        RANKPARSER_IOS(RAW_FILE_IPHONE, OUT_BASE, COMPRESS_LEVEL),
        compression='gzip',
        header=0,
    )
    if DEBUG
    else None
)
df_debug

Unnamed: 0,app_id,chart,date,country,rank
0,826459257,free,2014-03-01,United States,1
1,503190232,paid,2014-03-01,United States,1
2,529479190,grossing,2014-03-01,United States,1
3,808176012,free,2014-03-01,United States,2
4,479516143,paid,2014-03-01,United States,2
5,553834731,grossing,2014-03-01,United States,2
6,766894692,free,2014-03-01,United States,3
7,623592465,paid,2014-03-01,United States,3
8,667728512,grossing,2014-03-01,United States,3
9,825289439,free,2014-03-01,United States,4


# RANKINGSGOOGLE

In [10]:
@require(iter_json_gzip,'ujson','gzip','csv','lxml.html')
def RANKINGSGOOGLE(file_in,out_base,compress=1):
    """Convert the raw Google Play ranking dump into a gzip-compressed CSV.

    Each record read from `file_in` must provide an 'html' entry (the page
    markup) and a 'date' entry. Every ``<span style='display:none'>`` element
    in the markup contributes one CSV row whose app_id is the span's text.

    Parameters
    ----------
    file_in : str
        Path of the gzip-compressed, one-JSON-object-per-line input file.
    out_base : str
        Output path template containing one '{}' placeholder, which is filled
        with the label 'RANKINGSGOOGLE'.
    compress : int
        gzip compression level for the output file.

    Returns
    -------
    str
        The path of the CSV file that was written.
    """
    headers = ['app_id', 'chart', 'date', 'country', 'rank']
    # Assumes the hidden spans are laid out row by row: one span per chart for
    # rank 1, then one per chart for rank 2, and so on -- TODO confirm against
    # the raw HTML.
    charts = ['free', 'paid', 'grossing', 'new_free', 'new_paid']
    # Label aligned with the iPhone/iPad outputs ('RANKINGS<STORE>'); this
    # function previously wrote to a file labelled 'RANKPARSER_GOOGLE'.
    out_filename = out_base.format('RANKINGSGOOGLE')
    # newline='' is what the csv module requires of the file object it writes
    # to; without it the writer's '\r\n' terminators are translated again on
    # platforms whose line separator is not '\n'.
    with gzip.open(out_filename,'wt',compresslevel=compress,newline='') as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        # `record` rather than `json`, so the loop variable does not shadow the
        # name of the standard-library module.
        for record in iter_json_gzip(file_in):
            tree = lxml.html.fromstring(record['html'])
            app_ids = tree.xpath("//span[@style='display:none']")
            for i,app in enumerate(app_ids):
                # Integer arithmetic: row index -> 1-based rank, column -> chart.
                row, column = divmod(i, len(charts))
                # The country is hard-coded; the record is not consulted for it.
                app_info = [app.text_content(), charts[column], record['date'], 'United States', row + 1]
                csv_writer.writerow(app_info)
    return out_filename

In [11]:
# In DEBUG mode, run the Google Play parser and read its CSV back for
# inspection; otherwise leave df_debug as None.
df_debug = (
    read_csv(
        RANKINGSGOOGLE(RAW_FILE_GOOGLE, OUT_BASE, COMPRESS_LEVEL),
        compression='gzip',
        header=0,
    )
    if DEBUG
    else None
)
df_debug

Unnamed: 0,app_id,chart,date,country,rank
0,com.facebook.katana,free,2014-03-01,United States,1
1,com.mojang.minecraftpe,paid,2014-03-01,United States,1
2,com.supercell.clashofclans,grossing,2014-03-01,United States,1
3,it.junglestudios.splashyfish,new_free,2014-03-01,United States,1
4,com.FireproofStudios.TheRoom2,new_paid,2014-03-01,United States,1
5,com.pandora.android,free,2014-03-01,United States,2
6,com.touchtype.swiftkey,paid,2014-03-01,United States,2
7,com.king.candycrushsaga,grossing,2014-03-01,United States,2
8,com.square_enix.android_googleplay.deadmanscro...,new_free,2014-03-01,United States,2
9,com.mtvn.sbmigoogleplay,new_paid,2014-03-01,United States,2


# RANKINGSIPAD

In [12]:
@require(iter_json_gzip,'ujson','gzip','csv','lxml.html')
def RANKINGSIPAD(file_in,out_base,compress=1):
    """Convert the raw iPad ranking dump into a gzip-compressed CSV.

    Each record read from `file_in` must provide an 'html' entry (the page
    markup) and a 'date' entry. Every ``<span style='display:none'>`` element
    in the markup contributes one CSV row whose app_id is the span's text.

    Parameters
    ----------
    file_in : str
        Path of the gzip-compressed, one-JSON-object-per-line input file.
    out_base : str
        Output path template containing one '{}' placeholder, which is filled
        with the label 'RANKINGSIPAD'.
    compress : int
        gzip compression level for the output file.

    Returns
    -------
    str
        The path of the CSV file that was written.
    """
    headers = ['app_id', 'chart', 'date', 'country', 'rank']
    # Assumes the hidden spans are laid out row by row: one span per chart for
    # rank 1, then one per chart for rank 2, and so on -- TODO confirm against
    # the raw HTML.
    charts = ['free', 'paid', 'grossing']
    out_filename = out_base.format('RANKINGSIPAD')
    # newline='' is what the csv module requires of the file object it writes
    # to; without it the writer's '\r\n' terminators are translated again on
    # platforms whose line separator is not '\n'.
    with gzip.open(out_filename,'wt',compresslevel=compress,newline='') as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        # `record` rather than `json`, so the loop variable does not shadow the
        # name of the standard-library module.
        for record in iter_json_gzip(file_in):
            tree = lxml.html.fromstring(record['html'])
            app_ids = tree.xpath("//span[@style='display:none']")
            for i,app in enumerate(app_ids):
                # Integer arithmetic: row index -> 1-based rank, column -> chart.
                row, column = divmod(i, len(charts))
                # The country is hard-coded; the record is not consulted for it.
                app_info = [app.text_content(), charts[column], record['date'], 'United States', row + 1]
                csv_writer.writerow(app_info)
    return out_filename

In [13]:
# In DEBUG mode, run the iPad parser and read its CSV back for inspection;
# otherwise leave df_debug as None.
df_debug = (
    read_csv(
        RANKINGSIPAD(RAW_FILE_IPAD, OUT_BASE, COMPRESS_LEVEL),
        compression='gzip',
        header=0,
    )
    if DEBUG
    else None
)
df_debug

Unnamed: 0,app_id,chart,date,country,rank
0,766894692,free,2014-03-01,United States,1
1,479516143,paid,2014-03-01,United States,1
2,529479190,grossing,2014-03-01,United States,1
3,826459257,free,2014-03-01,United States,2
4,667362389,paid,2014-03-01,United States,2
5,553834731,grossing,2014-03-01,United States,2
6,808176012,free,2014-03-01,United States,3
7,791341471,paid,2014-03-01,United States,3
8,667728512,grossing,2014-03-01,United States,3
9,808032599,free,2014-03-01,United States,4
