# Initialize Environment

### Compatibility notes

The following code is only compatible with a Python 3.4 kernel, and the notebook must be opened with IPython 3.0+.

The following libraries must be installed to run this code:

```pip install ujson```

In [30]:
from pdb import set_trace as debug
from pandas.io.parsers import read_csv
from IPython.parallel import Client,require
from collections import Counter

In [31]:
import ujson
import datetime
import re
import gzip
import csv

In [32]:
# User settings -- edit these before running.
#--> DUMP_DATE tag embedded in the name of every output file
#--> RAW_FILE_DIR directory that contains google_play__main.json.gz
#--> OUT_DIR directory where the processed files are written
#--> DEBUG when truthy, each extractor cell runs immediately and previews its CSV
#--> COMPRESS_LEVEL gzip compression level of the outputs. Warning: severely impacts runtime.
#--> LINE_LIMIT how many lines of the raw file to read (for debugging); <= 0 reads everything
#--> PARALLEL when truthy, `map` is rebound to an IPython.parallel load-balanced view
DUMP_DATE = '2015_02_26_23_12'
RAW_FILE_DIR = '/home/cgaray/data'
OUT_DIR = '/home/cgaray/data/out'
DEBUG = 1
COMPRESS_LEVEL = 1
LINE_LIMIT = 100
PARALLEL = 0

In [33]:
# Don't change these! They are derived from the user settings.
# RAW_FILE: full path of the gzip-compressed raw dump.
RAW_FILE = '{}/google_play__main.json.gz'.format(RAW_FILE_DIR)
# OUT_BASE: output path template; the '{}' placeholder is later filled with
# the extractor name through str.format().
OUT_BASE = OUT_DIR+'/{}__google_play__main__'+DUMP_DATE+'.csv.gz'

In [34]:
def iter_json_gzip(filename,LINE_LIMIT=LINE_LIMIT):
    """Lazily yield one decoded JSON value per line of a gzip-compressed file.

    filename   -- path of the gzip file; it is opened in text mode.
    LINE_LIMIT -- maximum number of lines to decode. A value <= 0 disables
                  the cap. The default is the notebook-level LINE_LIMIT as
                  it was when this function was defined.
    """
    with gzip.open(filename,'rt') as lines:
        for line_number, raw_line in enumerate(lines, start=1):
            # A non-positive cap never triggers; otherwise stop after the
            # first LINE_LIMIT lines have been yielded.
            if LINE_LIMIT > 0 and line_number > LINE_LIMIT:
                break
            yield ujson.loads(raw_line)

In [35]:
# Print a few observations to inspect the raw data (the last 5 of the first 1000 lines).
!zcat "$RAW_FILE"|head -n 1000 | tail -n5

{"app_id":"com.custom.lwp.BridgeLights","ratings":[0,0,0,0,1],"month":7,"store_name":"google_play","size":"190k","content":"Everyone","hostname":"equity4.mit.edu","requires":"2.1 and up","website":"http:\/\/manumandroidapp.blogspot.com\/&sa=D&usg=AFQjCNGPFmMP-3N1t7XLfyI8TTF2lgZj9A","version":"1.0","year":2013,"category":"PERSONALIZATION","developer_name":"manum app","email":"manu_m.91@hotmail.it","screenshots":2,"timestamp":1374095742,"description":"<div>Bridge Lights is a Live Wallpaper.<p>After you have downloaded and installed it,  You need in order to set it  as a Live Wallpaper:<p>- go to your home screen;<br>- press the menu button on your device;<br>- select &quot;Wallpaper&quot;;<br>- select &quot;Live Wallpapers&quot;;<br>- search and select for this Live Wallpaper&quot;<br>- select &quot;Settings&quot; to see the settings or select &quot;Set wallpaper&quot; to activate it.<p>System Requirement:<p>any phone that came with Android OS 2.0 or later should work fine. Anything that

# Initialize Parallel Processing

In [36]:
if PARALLEL:
    # Connect to the IPython.parallel cluster and report how many engines it exposes.
    ipython_parallel = Client()
    print("{} active computing engines".format(len(ipython_parallel.ids)))

    lbv = ipython_parallel.load_balanced_view()

    # NOTE: this rebinds the name `map` for the rest of the notebook, so later
    # map(...) calls are dispatched through the load-balanced view without
    # blocking and without preserving submission order. When PARALLEL is falsy
    # this assignment never runs and `map` keeps its previous meaning.
    map = lambda f,itertable:lbv.map(f,itertable,\
    block =False,\
    ordered =False)

    # Returns the hostname of whichever machine executes it. `dummy` is ignored;
    # it exists only because map() passes one argument per call.
    @require('socket')
    def host(dummy):
        return socket.gethostname()
    
    # Submit one host() task per engine id and collect the reported hostnames.
    nodes = list(lbv.map(host,ipython_parallel.ids))
    # Keep the integer found between 'equity' and the first '.', e.g.
    # 'equity4.mit.edu' -> 4. Hostnames that do not follow this pattern raise
    # IndexError or ValueError here.
    nodes = [int(x.split('equity')[1].split(".")[0]) for x in nodes]
    nodes = sorted(nodes)

    # Print (node number, how many tasks reported that node) pairs.
    print(list(Counter(nodes).items()))

# Daily Ratings -- DAILYRATINGS

In [37]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def DAILYRATINGS(file_in,out_base,compress=1):
    """Write one CSV row per distinct (app_id, day, ratings) observation.

    file_in  -- path of the gzip-compressed JSON-lines dump.
    out_base -- path template; its '{}' placeholder receives 'DAILYRATINGS'.
    compress -- gzip compression level of the output file.

    Returns the path of the CSV that was written.

    The day is derived from the record's 'timestamp' using the local time
    zone of the executing machine. Records whose 'ratings' entry is missing
    or empty are skipped. The ratings list is appended as-is under the
    rating5..rating1 headers -- assumes the list is ordered from 5 stars
    down to 1 star; TODO confirm against the scraper.

    De-duplication keeps only the hash of each observation, not the
    observation itself, so two different observations with colliding hashes
    would be treated as duplicates.
    """
    out_filename = out_base.format('DAILYRATINGS')
    columns = ['app_id','date','rating5','rating4','rating3','rating2','rating1']
    seen_hashes = set()
    with gzip.open(out_filename,'wt',compress) as out_file:
        writer = csv.writer(out_file)
        writer.writerow(columns)
        for record in iter_json_gzip(file_in):
            stamp = datetime.datetime.fromtimestamp(int(record['timestamp']))
            day = stamp.strftime('%Y-%m-%d')
            ratings = record.get('ratings',None)
            if not ratings:
                continue
            app_id = record['app_id']
            key = hash((app_id,day,tuple(ratings),))
            if key in seen_hashes:
                continue
            seen_hashes.add(key)
            writer.writerow([app_id,day]+ratings)
    return out_filename

In [38]:
# Run the extractor and preview its CSV only when DEBUG is on; otherwise show nothing.
df_debug = (
    read_csv(DAILYRATINGS(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
    if DEBUG else None
)
df_debug

Unnamed: 0,app_id,date,rating5,rating4,rating3,rating2,rating1
0,com.touchtype.swiftkey,2013-07-17,162422,33001,6799,2879,4429
1,com.gameloft.android.ANMP.GloftDMHM,2013-07-17,291722,45460,13064,4439,18714
2,com.ibm.events.android.usga,2013-07-17,420,97,65,46,87
3,com.dama.camera2,2013-07-17,152,49,23,10,9
4,com.disney.scribblefree_goo,2013-07-17,1209,219,178,139,713
5,com.gau.go.launcherex,2013-07-17,1011807,233232,79681,22214,44348
6,com.teslacoilsw.launcher,2013-07-17,59107,11001,2450,764,1113
7,com.levelup.beautifulwidgets,2013-07-17,53501,15796,5546,3248,6737
8,com.socialnmobile.dictdata.chinese.stardict,2013-07-17,481,253,129,36,58
9,com.mdb.android.xiangqi,2013-07-17,40,20,8,5,23


# Developer - app_id -- NEWDEVELOPER

In [39]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def NEWDEVELOPER(file_in,out_base,compress=1):
    """Write a CSV row each time an app's 'developer_id' differs from the last one seen.

    file_in  -- path of the gzip-compressed JSON-lines dump.
    out_base -- path template; its '{}' placeholder receives 'NEWDEVELOPER'.
    compress -- gzip compression level of the output file.

    Returns the path of the CSV that was written.

    Records whose 'developer_id' is None are skipped. The value is stripped
    of surrounding whitespace before comparison; an app that has never been
    seen is compared against '', so an empty first value produces no row.
    The date column is the record's 'timestamp' rendered in the local time
    zone of the executing machine.
    """
    out_filename = out_base.format('NEWDEVELOPER')
    last_seen = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        writer = csv.writer(out_file)
        writer.writerow(['app_id','date','dev_id'])
        for record in iter_json_gzip(file_in):
            dev_id = record.get('developer_id','')
            if dev_id is None:
                continue
            dev_id = dev_id.strip()
            app_id = record['app_id']
            if last_seen.get(app_id,'') == dev_id:
                continue
            last_seen[app_id] = dev_id
            stamp = datetime.datetime.fromtimestamp(int(record['timestamp']))
            writer.writerow((app_id,stamp.strftime('%Y-%m-%d'),dev_id))
    return out_filename

In [40]:
# Run the extractor and preview its CSV only when DEBUG is on; otherwise show nothing.
df_debug = (
    read_csv(NEWDEVELOPER(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
    if DEBUG else None
)
df_debug

Unnamed: 0,app_id,date,dev_id
0,com.touchtype.swiftkey,2013-07-17,SwiftKey
1,com.gameloft.android.ANMP.GloftDMHM,2013-07-17,Gameloft
2,com.ibm.events.android.usga,2013-07-17,"United+States+Golf+Association,+USGA"
3,com.dama.camera2,2013-07-17,JFDP+Labs
4,com.disney.scribblefree_goo,2013-07-17,Disney
5,com.gau.go.launcherex,2013-07-17,GO+Launcher+Dev+Team
6,com.teslacoilsw.launcher,2013-07-17,TeslaCoil+Software
7,com.levelup.beautifulwidgets,2013-07-17,LevelUp+Studio
8,com.socialnmobile.dictdata.chinese.stardict,2013-07-17,Notes
9,com.mdb.android.xiangqi,2013-07-17,MDB+Softwares


# Category - app_id -- NEWCATEGORY

In [41]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def NEWCATEGORY(file_in,out_base,compress=1):
    """Write a CSV row each time an app's 'category' differs from the last one seen.

    file_in  -- path of the gzip-compressed JSON-lines dump.
    out_base -- path template; its '{}' placeholder receives 'NEWCATEGORY'.
    compress -- gzip compression level of the output file.

    Returns the path of the CSV that was written.

    Records whose 'category' is None are skipped. The value is stripped of
    surrounding whitespace before comparison; an app that has never been
    seen is compared against '', so an empty first value produces no row.
    The date column is the record's 'timestamp' rendered in the local time
    zone of the executing machine.
    """
    out_filename = out_base.format('NEWCATEGORY')
    current_category = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        writer = csv.writer(out_file)
        writer.writerow(['app_id','date','category'])
        for record in iter_json_gzip(file_in):
            category = record.get('category','')
            if category is None:
                continue
            category = category.strip()
            app_id = record['app_id']
            unchanged = current_category.get(app_id,'') == category
            if not unchanged:
                current_category[app_id] = category
                observed = datetime.datetime.fromtimestamp(int(record['timestamp']))
                day = observed.strftime('%Y-%m-%d')
                writer.writerow((app_id,day,category))
    return out_filename

In [42]:
# Run the extractor and preview its CSV only when DEBUG is on; otherwise show nothing.
df_debug = (
    read_csv(NEWCATEGORY(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
    if DEBUG else None
)
df_debug

Unnamed: 0,app_id,date,category
0,com.touchtype.swiftkey,2013-07-17,PRODUCTIVITY
1,com.gameloft.android.ANMP.GloftDMHM,2013-07-17,CASUAL
2,com.ibm.events.android.usga,2013-07-17,SPORTS
3,com.dama.camera2,2013-07-17,PHOTOGRAPHY
4,com.disney.scribblefree_goo,2013-07-17,BRAIN
5,com.gau.go.launcherex,2013-07-17,PERSONALIZATION
6,com.teslacoilsw.launcher,2013-07-17,PERSONALIZATION
7,com.levelup.beautifulwidgets,2013-07-17,PERSONALIZATION
8,com.socialnmobile.dictdata.chinese.stardict,2013-07-17,BOOKS_AND_REFERENCE
9,com.mdb.android.xiangqi,2013-07-17,BRAIN


# Price changes -- NEWPRICE

In [43]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def NEWPRICE(file_in,out_base,compress=1):
    """Write a CSV row each time an app's 'price' differs from the last one seen.

    file_in  -- path of the gzip-compressed JSON-lines dump.
    out_base -- path template; its '{}' placeholder receives 'NEWPRICE'.
    compress -- gzip compression level of the output file.

    Returns the path of the CSV that was written.

    The price is compared and written exactly as found in the record, with
    no normalisation. A missing or None price is not skipped: an app never
    seen before is compared against None (so a None first value produces no
    row), but a later switch to None counts as a change and is written with
    an empty price cell. The date column is the record's 'timestamp'
    rendered in the local time zone of the executing machine.
    """
    out_filename = out_base.format('NEWPRICE')
    last_price = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        writer = csv.writer(out_file)
        writer.writerow(['app_id','date','price'])
        for record in iter_json_gzip(file_in):
            price = record.get('price',None)
            app_id = record['app_id']
            if last_price.get(app_id,None) == price:
                continue
            last_price[app_id] = price
            stamp = datetime.datetime.fromtimestamp(int(record['timestamp']))
            writer.writerow((app_id,stamp.strftime('%Y-%m-%d'),price))
    return out_filename

In [44]:
# Run the extractor and preview its CSV only when DEBUG is on; otherwise show nothing.
df_debug = (
    read_csv(NEWPRICE(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
    if DEBUG else None
)
df_debug

Unnamed: 0,app_id,date,price
0,com.touchtype.swiftkey,2013-07-17,3.99
1,com.gameloft.android.ANMP.GloftDMHM,2013-07-17,0.00
2,com.ibm.events.android.usga,2013-07-17,0.00
3,com.dama.camera2,2013-07-17,2.99
4,com.disney.scribblefree_goo,2013-07-17,0.00
5,com.gau.go.launcherex,2013-07-17,0.00
6,com.teslacoilsw.launcher,2013-07-17,0.00
7,com.levelup.beautifulwidgets,2013-07-17,2.59
8,com.socialnmobile.dictdata.chinese.stardict,2013-07-17,0.00
9,com.mdb.android.xiangqi,2013-07-17,0.00


# Version changes  -- NEWVERSION

In [45]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def NEWVERSION(file_in,out_base,compress=1):
    """Write a CSV row each time an app's normalised 'version' changes.

    file_in  -- path of the gzip-compressed JSON-lines dump.
    out_base -- path template; its '{}' placeholder receives 'NEWVERSION'.
    compress -- gzip compression level of the output file.

    Returns the path of the CSV that was written.

    Normalisation: 'Varies with device' becomes 'varies', all whitespace is
    removed and the result is lower-cased. An app that has never been seen
    is compared against '', so an empty first value produces no row. The
    date column is the record's 'timestamp' rendered in the local time zone
    of the executing machine.

    Records whose 'version' is None (an explicit JSON null) are skipped
    instead of raising AttributeError, matching how the other extractors in
    this notebook treat None values.
    """
    headers = ['app_id','date','version']
    out_filename = out_base.format('NEWVERSION')
    nowhite = re.compile(r"\s+")
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for record in iter_json_gzip(file_in):
            version = record.get('version','')
            if version is None:
                # dict.get only substitutes '' for a missing key, not for a
                # key that is present with a null value.
                continue
            version = version.replace('Varies with device','varies')
            version = nowhite.sub('',version).lower()
            app_id = record['app_id']
            if running.get(app_id,'') != version:
                running[app_id] = version
                day = datetime.datetime.fromtimestamp(int(record['timestamp']))\
                        .strftime('%Y-%m-%d')
                csv_writer.writerow((app_id,day,version))
    return out_filename

In [46]:
# Run the extractor and preview its CSV only when DEBUG is on; otherwise show nothing.
df_debug = (
    read_csv(NEWVERSION(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
    if DEBUG else None
)
df_debug

Unnamed: 0,app_id,date,version
0,com.touchtype.swiftkey,2013-07-17,varies
1,com.gameloft.android.ANMP.GloftDMHM,2013-07-17,1.0.0
2,com.ibm.events.android.usga,2013-07-17,2.2
3,com.dama.camera2,2013-07-17,1.0.6
4,com.disney.scribblefree_goo,2013-07-17,1.0.2
5,com.gau.go.launcherex,2013-07-17,varies
6,com.teslacoilsw.launcher,2013-07-17,2.1.1
7,com.levelup.beautifulwidgets,2013-07-17,5.3.1
8,com.socialnmobile.dictdata.chinese.stardict,2013-07-17,1.0.4
9,com.mdb.android.xiangqi,2013-07-17,2.6.12


# Updated Changes -- NEWUPDATED

In [47]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def NEWUPDATED(file_in,out_base,compress=1):
    """Write a CSV row each time an app's 'updated' date string changes.

    file_in  -- path of the gzip-compressed JSON-lines dump.
    out_base -- path template; its '{}' placeholder receives 'NEWUPDATED'.
    compress -- gzip compression level of the output file.

    Returns the path of the CSV that was written.

    Records whose 'updated' value is None, or empty after stripping
    surrounding whitespace, are skipped. The remaining value is parsed with
    the format '%B %d, %Y' and written as YYYY-MM-DD; a value that does not
    match that format raises ValueError. Unlike the other extractors, no
    observation date is written -- only app_id and the parsed update date.
    """
    headers = ['app_id','updated',]
    out_filename = out_base.format('NEWUPDATED')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for record in iter_json_gzip(file_in):
            updated = record.get('updated','')
            if updated is None:
                continue
            updated = updated.strip()
            app_id = record['app_id']
            # `updated` is a str from here on, so only emptiness needs checking.
            if updated != '' and running.get(app_id,'') != updated:
                running[app_id] = updated
                date = datetime.datetime.strptime(updated,'%B %d, %Y')
                csv_writer.writerow((app_id,date.strftime('%Y-%m-%d')))
    return out_filename

In [48]:
# Run the extractor and preview its CSV only when DEBUG is on; otherwise show nothing.
df_debug = (
    read_csv(NEWUPDATED(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
    if DEBUG else None
)
df_debug

Unnamed: 0,app_id,updated
0,com.touchtype.swiftkey,2013-07-01
1,com.gameloft.android.ANMP.GloftDMHM,2013-07-16
2,com.ibm.events.android.usga,2013-06-14
3,com.dama.camera2,2013-07-03
4,com.disney.scribblefree_goo,2013-07-03
5,com.gau.go.launcherex,2013-07-17
6,com.teslacoilsw.launcher,2013-06-11
7,com.levelup.beautifulwidgets,2013-07-16
8,com.socialnmobile.dictdata.chinese.stardict,2009-10-14
9,com.mdb.android.xiangqi,2013-05-16


# Requires Version Changes -- NEWREQ

In [49]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def NEWREQ(file_in,out_base,compress=1):
    """Write a CSV row each time an app's normalised 'requires' value changes.

    file_in  -- path of the gzip-compressed JSON-lines dump.
    out_base -- path template; its '{}' placeholder receives 'NEWREQ'.
    compress -- gzip compression level of the output file.

    Returns the path of the CSV that was written.

    Records whose 'requires' is None are skipped. Normalisation, in order:
    lower-case, strip surrounding whitespace, 'varies with device' becomes
    'varies', ' and up' becomes '+', then all remaining whitespace is
    removed. An app that has never been seen is compared against '', so an
    empty first value produces no row. The date column is the record's
    'timestamp' rendered in the local time zone of the executing machine.
    """
    out_filename = out_base.format('NEWREQ')
    whitespace = re.compile(r"\s+")
    and_up = re.compile(r" and up")
    last_seen = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        writer = csv.writer(out_file)
        writer.writerow(['app_id','date','requires'])
        for record in iter_json_gzip(file_in):
            raw = record.get('requires','')
            if raw is None:
                continue
            simplified = raw.lower().strip().replace('varies with device','varies')
            requires = whitespace.sub('',and_up.sub('+',simplified))
            app_id = record['app_id']
            if last_seen.get(app_id,'') == requires:
                continue
            last_seen[app_id] = requires
            stamp = datetime.datetime.fromtimestamp(int(record['timestamp']))
            writer.writerow((app_id,stamp.strftime('%Y-%m-%d'),requires))
    return out_filename

In [50]:
# Run the extractor and preview its CSV only when DEBUG is on; otherwise show nothing.
df_debug = (
    read_csv(NEWREQ(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
    if DEBUG else None
)
df_debug

Unnamed: 0,app_id,date,requires
0,com.touchtype.swiftkey,2013-07-17,varies
1,com.gameloft.android.ANMP.GloftDMHM,2013-07-17,2.3+
2,com.ibm.events.android.usga,2013-07-17,2.2+
3,com.dama.camera2,2013-07-17,4.0.3+
4,com.disney.scribblefree_goo,2013-07-17,2.2+
5,com.gau.go.launcherex,2013-07-17,varies
6,com.teslacoilsw.launcher,2013-07-17,4.0+
7,com.levelup.beautifulwidgets,2013-07-17,2.2+
8,com.socialnmobile.dictdata.chinese.stardict,2013-07-17,1.5+
9,com.mdb.android.xiangqi,2013-07-17,1.5+


# Installs Changes -- NEWINSTALL

In [51]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def NEWINSTALL(file_in,out_base,compress=1):
    """Write a CSV row each time an app's 'installs' value changes.

    file_in  -- path of the gzip-compressed JSON-lines dump.
    out_base -- path template; its '{}' placeholder receives 'NEWINSTALL'.
    compress -- gzip compression level of the output file.

    Returns the path of the CSV that was written.

    All whitespace is removed from the value before comparison. An app that
    has never been seen is compared against '', so an empty first value
    produces no row. The date column is the record's 'timestamp' rendered
    in the local time zone of the executing machine.

    Records whose 'installs' is None (an explicit JSON null) are skipped
    instead of raising TypeError in the regex substitution, matching how
    the other extractors in this notebook treat None values.
    """
    headers = ['app_id','date','installs']
    out_filename = out_base.format('NEWINSTALL')
    nowhite = re.compile(r"\s+")
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for record in iter_json_gzip(file_in):
            installs = record.get('installs','')
            if installs is None:
                # dict.get only substitutes '' for a missing key, not for a
                # key that is present with a null value.
                continue
            installs = nowhite.sub('',installs)
            app_id = record['app_id']
            if running.get(app_id,'') != installs:
                running[app_id] = installs
                day = datetime.datetime.fromtimestamp(int(record['timestamp']))\
                        .strftime('%Y-%m-%d')
                csv_writer.writerow((app_id,day,installs))
    return out_filename

In [52]:
# Run the extractor and preview its CSV only when DEBUG is on; otherwise show nothing.
df_debug = (
    read_csv(NEWINSTALL(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
    if DEBUG else None
)
df_debug

Unnamed: 0,app_id,date,installs
0,com.touchtype.swiftkey,2013-07-17,"1,000,000-5,000,000"
1,com.gameloft.android.ANMP.GloftDMHM,2013-07-17,"10,000,000-50,000,000"
2,com.ibm.events.android.usga,2013-07-17,"100,000-500,000"
3,com.dama.camera2,2013-07-17,"10,000-50,000"
4,com.disney.scribblefree_goo,2013-07-17,"500,000-1,000,000"
5,com.gau.go.launcherex,2013-07-17,"50,000,000-100,000,000"
6,com.teslacoilsw.launcher,2013-07-17,"5,000,000-10,000,000"
7,com.levelup.beautifulwidgets,2013-07-17,"1,000,000-5,000,000"
8,com.socialnmobile.dictdata.chinese.stardict,2013-07-17,"100,000-500,000"
9,com.mdb.android.xiangqi,2013-07-17,"10,000-50,000"


# Run them all!

In [53]:
# Names of the extractor functions to run; each one is looked up in globals()
# below, so it must already be defined in this notebook.
programs = """
NEWDEVELOPER
NEWCATEGORY
NEWPRICE
NEWVERSION
NEWUPDATED
NEWREQ
NEWINSTALL
DAILYRATINGS
""".split()

def run_function(f,RAW_FILE=RAW_FILE,OUT_BASE=OUT_BASE,COMPRESS_LEVEL=COMPRESS_LEVEL):
    """Call extractor `f` with the notebook's settings and return its result.

    The settings are bound as default arguments, so their values are fixed
    when this function is defined -- presumably so the function carries
    them along when `map` dispatches it to remote engines (TODO confirm).
    """
    return f(RAW_FILE ,OUT_BASE,COMPRESS_LEVEL)

# `map` is whatever the name is bound to at this point: the parallel-setup cell
# rebinds it to a load-balanced, unordered map when PARALLEL is truthy, in which
# case the file names may print in a different order than `programs`.
for file in map(run_function,[globals()[x] for x in programs]):
    print(file)
print("DONE")

/home/cgaray/data/out/NEWDEVELOPER__google_play__main__2015_02_26_23_12.csv.gz
/home/cgaray/data/out/NEWCATEGORY__google_play__main__2015_02_26_23_12.csv.gz
/home/cgaray/data/out/NEWPRICE__google_play__main__2015_02_26_23_12.csv.gz
/home/cgaray/data/out/NEWVERSION__google_play__main__2015_02_26_23_12.csv.gz
/home/cgaray/data/out/NEWUPDATED__google_play__main__2015_02_26_23_12.csv.gz
/home/cgaray/data/out/NEWREQ__google_play__main__2015_02_26_23_12.csv.gz
/home/cgaray/data/out/NEWINSTALL__google_play__main__2015_02_26_23_12.csv.gz
/home/cgaray/data/out/DAILYRATINGS__google_play__main__2015_02_26_23_12.csv.gz
DONE
