# Initialize Environment

### Compatibility notes

The following code is only compatible with a python 3.4 kernel and the notebook must be opened with ipython 3.0+.

In [20]:
from pdb import set_trace as debug
from pandas.io.parsers import read_csv
from IPython.parallel import Client,require
from collections import Counter

In [21]:
import ujson
import datetime
import re
import gzip
import csv

In [22]:
#--> RAW_FILE_DIR with the directory that contains google_play__main.json.gz
#--> OUT_DIR with the directory of where you want the processed files to go
#--> DEBUG whether to print out created CSV files or not
#--> COMPRESS_LEVEL gzip compression level. warning: severely impacts runtime.
#--> LINE_LIMIT how many lines to iterate through in the raw file. For debugging.
DUMP_DATE = '2015_02_26_23_12'
RAW_FILE_DIR = '/data'
OUT_DIR = '/raid/out'
DEBUG = 1
COMPRESS_LEVEL = 9
LINE_LIMIT = 1000
PARALLEL = 1
PIGZ = 1
OBS_SKIP = 100
OBS_PRINT = 5

In [23]:
# Don't change these!
RAW_FILE_NEWDATA = '{}/googletail.json.gz'.format(RAW_FILE_DIR)
RAW_FILE = '{}/google_play__main.json.gz'.format(RAW_FILE_DIR)
OUT_BASE = OUT_DIR+'/{}__google_play__main__'+DUMP_DATE+'.csv.gz'

In [24]:
if PIGZ:
    @require(gzip)
    def iter_json_gzip(filename,LINE_LIMIT=LINE_LIMIT):
        return gzip.iter_json_gzip(filename,LINE_LIMIT=LINE_LIMIT)    
else:
    @require(gzip,ujson)
    def iter_json_gzip(filename,LINE_LIMIT=LINE_LIMIT):
        with gzip.open(filename,'rt') as file_iter:
            for c,line in enumerate(file_iter):
                if c > LINE_LIMIT and LINE_LIMIT>0:
                    break
                if isinstance(line,str):
                    if len(line)>0:        
                        out = ujson.loads(line)
                        if isinstance(out,dict):
                            if 'app_id' in out and 'timestamp' in out:
                                yield out

# Print Sample Observations

In [25]:
try:
    from sh import zcat,head,tail
    print(tail(head(zcat(RAW_FILE_NEWDATA, _piped =True),
                    "-n{}".format(OBS_SKIP)
                   ),"-n{}".format(OBS_PRINT)))
except:
    pass

{"app_id":"com.iceberg.KuShow","ratings":[8,4,6,2,2],"updated":"February 25, 2010","installs":"1,000 - 5,000","developer_other":["com.iPhand.SexQuotes","com.iPhand.LifeQuotes","com.iPhand.LoveQuotes","com.iPhand.BizQuotes","com.GOQO.ChineseBook","com.mingfai","com.iPhand.MotivQuotes","com.iPhand.Quotes","com.iceberg.Jokes","com.mingfaiEn","com.poqop.estate","com.poqop.dolcevita","com.poqop.shangmeijia","com.poqop.DoubanGroup","com.poqop.Beauty100","com.iPhand.FirstAid"],"requires":"1.5 and up","size":"3.1M","version":"1.0.1","category":"LIFESTYLE","developer_name":"skyblue.huang","screenshots":0,"timestamp":1424945337,"description":"\u300a\u9177\u79c0\u300b\u53ef\u4ee5\u8ba9\u4f60\u968f\u5fc3\u6240\u6b32\u5730\u521b\u5efa\u4f60\u559c\u6b22\u7684\u5f62\u8c61\uff0c\u53ef\u4ee5\u4fdd\u6301\u6210\u56fe\u7247\u6587\u4ef6\u505a\u5404\u79cd\u7f51\u7edc\u793e\u533a\u7684\u5f62\u8c61\uff0c\u8fd8\u53ef\u4ee5\u8bbe\u7f6e\u4e3a\u6765\u7535\u79c0\uff0cSo Cool!\u6b64\u7248\u672c\u4e3a\u300a\u9177\u5

#Initialize Parallel Processing

In [26]:
# Configured for: equity
if PARALLEL:
    ipython_parallel = Client()
    print("{} active computing engines".format(len(ipython_parallel.ids)))

    lbv = ipython_parallel.load_balanced_view()

    map = lambda f,itertable:lbv.map(f,itertable,\
    block =False,\
    ordered =False)

    @require('socket')
    def host(dummy):
        return socket.gethostname()

    nodes = list(lbv.map(host,ipython_parallel.ids))
    nodes = [int(x.split('equity')[1].split(".")[0]) for x in nodes]
    nodes = nodes

    print(sorted(list(Counter(nodes).items())))

57 active computing engines
[(2, 3), (5, 6), (6, 6), (8, 6), (10, 6), (11, 6), (12, 4), (17, 20)]


# DAILYRATINGS

In [27]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def DAILYRATINGS(file_in,out_base,compress=1):
    headers = ['app_id','date','rating5','rating4','rating3','rating2','rating1']
    out_filename = out_base.format('DAILYRATINGS')
    unique_daily_rating = set()
    with gzip.open(out_filename,'wt',9) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            day = datetime.datetime.fromtimestamp(int(json['timestamp']))\
                                .strftime('%Y-%m-%d')
            rating = json.get('ratings',None)
            if rating:
                obs = (json['app_id'],day,tuple(rating),)
                if hash(obs) not in unique_daily_rating:
                    unique_daily_rating.add(hash(obs))
                    csv_writer.writerow([json['app_id'],day]+rating)
    return out_filename

In [28]:
df_debug = None
if DEBUG:
    df_debug = read_csv(DAILYRATINGS(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,rating5,rating4,rating3,rating2,rating1
0,com.touchtype.swiftkey,2013-07-17,162422,33001,6799,2879,4429
1,com.gameloft.android.ANMP.GloftDMHM,2013-07-17,291722,45460,13064,4439,18714
2,com.ibm.events.android.usga,2013-07-17,420,97,65,46,87
3,com.dama.camera2,2013-07-17,152,49,23,10,9
4,com.disney.scribblefree_goo,2013-07-17,1209,219,178,139,713
5,com.gau.go.launcherex,2013-07-17,1011807,233232,79681,22214,44348
6,com.teslacoilsw.launcher,2013-07-17,59107,11001,2450,764,1113
7,com.levelup.beautifulwidgets,2013-07-17,53501,15796,5546,3248,6737
8,com.socialnmobile.dictdata.chinese.stardict,2013-07-17,481,253,129,36,58
9,com.mdb.android.xiangqi,2013-07-17,40,20,8,5,23


# WEEKLYRATINGS 

In [29]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def WEEKLYRATINGS(file_in,out_base,compress=1):
    headers = ['app_id','year','week','rating5','rating4','rating3','rating2','rating1']
    out_filename = out_base.format('WEEKLYRATINGS')
    unique_daily_rating = set()
    with gzip.open(out_filename,'wt',9) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            year,week = datetime.datetime.fromtimestamp(int(json['timestamp']))                                .isocalendar()[:2]
            rating = json.get('ratings',None)
            if rating:
                obs = (json['app_id'],year,week)
                if hash(obs) not in unique_daily_rating:
                    unique_daily_rating.add(hash(obs))
                    csv_writer.writerow([json['app_id'],year,week]+rating)
    return out_filename

In [30]:
df_debug = None
if DEBUG:
    df_debug = read_csv(WEEKLYRATINGS(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,year,week,rating5,rating4,rating3,rating2,rating1
0,com.touchtype.swiftkey,2013,29,162422,33001,6799,2879,4429
1,com.gameloft.android.ANMP.GloftDMHM,2013,29,291722,45460,13064,4439,18714
2,com.ibm.events.android.usga,2013,29,420,97,65,46,87
3,com.dama.camera2,2013,29,152,49,23,10,9
4,com.disney.scribblefree_goo,2013,29,1209,219,178,139,713
5,com.gau.go.launcherex,2013,29,1011807,233232,79681,22214,44348
6,com.teslacoilsw.launcher,2013,29,59107,11001,2450,764,1113
7,com.levelup.beautifulwidgets,2013,29,53501,15796,5546,3248,6737
8,com.socialnmobile.dictdata.chinese.stardict,2013,29,481,253,129,36,58
9,com.mdb.android.xiangqi,2013,29,40,20,8,5,23


# MONTHLYRATINGS

In [31]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def MONTHLYRATINGS(file_in,out_base,compress=1):
    headers = ['app_id','year','month','rating5','rating4','rating3','rating2','rating1']
    out_filename = out_base.format('MONTHLYRATINGS')
    unique_daily_rating = set()
    with gzip.open(out_filename,'wt',9) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            dateobj = datetime.datetime.fromtimestamp(int(json['timestamp']))
            year,month = dateobj.year,dateobj.month
            rating = json.get('ratings',None)
            if rating:
                obs = (json['app_id'],year,month)
                if hash(obs) not in unique_daily_rating:
                    unique_daily_rating.add(hash(obs))
                    csv_writer.writerow([json['app_id'],year,month]+rating)
    return out_filename

In [32]:
df_debug = None
if DEBUG:
    df_debug = read_csv(MONTHLYRATINGS(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,year,month,rating5,rating4,rating3,rating2,rating1
0,com.touchtype.swiftkey,2013,7,162422,33001,6799,2879,4429
1,com.gameloft.android.ANMP.GloftDMHM,2013,7,291722,45460,13064,4439,18714
2,com.ibm.events.android.usga,2013,7,420,97,65,46,87
3,com.dama.camera2,2013,7,152,49,23,10,9
4,com.disney.scribblefree_goo,2013,7,1209,219,178,139,713
5,com.gau.go.launcherex,2013,7,1011807,233232,79681,22214,44348
6,com.teslacoilsw.launcher,2013,7,59107,11001,2450,764,1113
7,com.levelup.beautifulwidgets,2013,7,53501,15796,5546,3248,6737
8,com.socialnmobile.dictdata.chinese.stardict,2013,7,481,253,129,36,58
9,com.mdb.android.xiangqi,2013,7,40,20,8,5,23


# NEWWEBSITE

In [33]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def NEWWEBSITE(file_in,out_base,compress=1):
    headers = ['app_id','date','url']
    out_filename = out_base.format('NEWWEBSITE')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        http = re.compile(r"(http://)|(https://)")
        www = re.compile(r"^www\.")
        ending = re.compile(r"\.(a-z)(.*?)^")
        bad = re.compile(r"\.[a-z]+(.*?)$")
        for json in iter_json_gzip(file_in):
            website = json.get('website','')
            if isinstance(website,str):
                website = http.sub('',website).lower()
                website = www.sub('',website)
                website = website.split(r"/")[0]
                bad_str = bad.findall(website)
                if bad_str:
                    website = website.replace(bad_str[0],'')
                if running.get(json['app_id'],'')!=website and len(website)>0:
                    running[json['app_id']] = website
                    day = datetime.datetime.\
                    fromtimestamp(int(json['timestamp']))\
                            .strftime('%Y-%m-%d').strip()
                    obs = (json['app_id'],day,website)
                    csv_writer.writerow(obs)
    return out_filename

In [34]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWWEBSITE(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,url
0,com.touchtype.swiftkey,2013-07-17,swiftkey.net
1,com.gameloft.android.ANMP.GloftDMHM,2013-07-17,gameloft.com
2,com.ibm.events.android.usga,2013-07-17,usga.org
3,com.dama.camera2,2013-07-17,jfdplabs.com
4,com.disney.scribblefree_goo,2013-07-17,m.support
5,com.gau.go.launcherex,2013-07-17,golauncher.goforandroid
6,com.teslacoilsw.launcher,2013-07-17,novalauncher.com
7,com.levelup.beautifulwidgets,2013-07-17,support.levelupstudio
8,com.socialnmobile.dictdata.chinese.stardict,2013-07-17,socialnmobile.com
9,com.mdb.android.xiangqi,2013-07-17,mdbsoftwares.blogspot


# NEWPRIVACYDOMAIN

In [35]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def NEWPRIVACYDOMAIN(file_in,out_base,compress=1):
    headers = ['app_id','date','url']
    out_filename = out_base.format('NEWPRIVACYDOMAIN')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        http = re.compile(r"(http://)|(https://)")
        www = re.compile(r"^www\.")
        ending = re.compile(r"\.(a-z)(.*?)^")
        bad = re.compile(r"\.[a-z]+(.*?)$")
        for json in iter_json_gzip(file_in):
            website = json.get('privacy_policy','')
            if isinstance(website,str):
                website = http.sub('',website).lower()
                website = www.sub('',website)
                website = website.split(r"/")[0]
                bad_str = bad.findall(website)
                if bad_str:
                    website = website.replace(bad_str[0],'')
                if running.get(json['app_id'],'')!=website and len(website)>0:
                    running[json['app_id']] = website
                    day = datetime.datetime.\
                    fromtimestamp(int(json['timestamp']))\
                            .strftime('%Y-%m-%d').strip()
                    obs = (json['app_id'],day,website)
                    csv_writer.writerow(obs)
    return out_filename

In [36]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWPRIVACYDOMAIN(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,url
0,com.touchtype.swiftkey,2013-07-17,swiftkey.net
1,com.gameloft.android.ANMP.GloftDMHM,2013-07-17,gameloft.com
2,com.disney.scribblefree_goo,2013-07-17,corporate.disney
3,com.gau.go.launcherex,2013-07-17,goforandroid.com
4,com.levelup.beautifulwidgets,2013-07-17,levelupstudio.com
5,io.avocado.android,2013-07-17,avocado.io
6,flipboard.app,2013-07-17,flipboard.com
7,com.faadooengineers.electricalmeasurementinstu...,2013-07-17,faadooengineers.com
8,com.theofficialsimsmagazine.theofficialsimsmag...,2013-07-17,thesimsofficialmag.com
9,com.flyingword.LLHCen,2013-07-17,lucalashes.com


# NEWSIMILAR5

In [37]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def NEWSIMILAR5(file_in,out_base,compress=1,number_similar = 5):
    headers = ['app_id','date']+ ['related{}'.format(x) \
                                  for x in range(1,number_similar+1)]
    out_filename = out_base.format('NEWSIMILAR{}'.format(number_similar))
    running = {}
    with gzip.open(out_filename,'wt',9) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            related = json.get('similar_apps',[])
            if related:
                related = sorted(map(str, related[:number_similar]))
                joined = ",".join(related)
                if running.get(json['app_id'],'')!=joined:
                    running[json['app_id']] = joined
                    day = datetime.datetime.\
                    fromtimestamp(int(json['timestamp']))\
                            .strftime('%Y-%m-%d').strip()
                    csv_writer.writerow([json['app_id'],day]+ related)
    return out_filename

In [38]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWSIMILAR5(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,related1,related2,related3,related4,related5
0,com.touchtype.swiftkey,2013-07-17,com.aitype.android,com.aitype.android.p,com.google.android.inputmethod.latin,com.nuance.swype.dtc,com.nuance.swype.trial
1,com.gameloft.android.ANMP.GloftDMHM,2013-07-17,com.blockdot.universal.truffulaShuffula,com.disney.TempleRunOz.goo,com.eamobile.life_row_wf,com.nekki.vector,me.pou.app
2,com.ibm.events.android.usga,2013-07-17,com.advancedmobile.android.ghin,com.goldsteintech.android,com.ibm.events.android.usopen,com.myleaderboard.GolfChannel,com.swingbyswing
3,com.dama.camera2,2013-07-17,com.magix.camera_mx,com.neaststudios.procapture,com.oss.goodcamera2,com.superphotofull,com.zeronoiseapps.secretvideorecorderpro
4,com.disney.scribblefree_goo,2013-07-17,com.drawtube.net.angrybirds,com.flexsolution.scratchguess.logo,com.hitcents.stickmanepicfree,com.zynga.draw2.googleplay.free,quess.song.music.pop.quiz
5,com.gau.go.launcherex,2013-07-17,com.BahagHariArts.BlueNeonGoTheme,com.anddoes.launcher,com.gau.go.launcherex.theme.defaultthemethree,com.jiubang.goscreenlock,com.wed.go.launcherex.theme.plate
6,com.teslacoilsw.launcher,2013-07-17,com.anddoes.launcher,com.gau.go.launcherex.key,com.gtp.nextlauncher,com.mobint.hololauncher,org.adwfreak.launcher
7,com.levelup.beautifulwidgets,2013-07-17,cloudtv.hdwidgets,com.anddoes.fancywidgets,com.anddoes.fancywidgets.unlocker,net.hubalek.android.makeyourclock,org.zooper.zwfree
8,com.socialnmobile.dictdata.chinese.stardict,2013-07-17,com.akdevelopment.dict.enchinesetrad.free,com.akdevelopment.dict.enthai.free,com.bravolol.bravolang.englishchinesecdictionary,com.dict.ec,com.pleco.chinesesystem
9,com.mdb.android.xiangqi,2013-07-17,air.com.longo.Xiangqi.android.free,com.DreamFactory.ChineseChess,com.cchess,com.tosmart.chessroad,com.xidea.ChineseDarkChess


# NEWSIMILAR10

In [39]:
@require(NEWSIMILAR5)
def NEWSIMILAR10(RAW_FILE,OUT_BASE,COMPRESS_LEVEL):
    return NEWSIMILAR5(RAW_FILE,OUT_BASE,COMPRESS_LEVEL,10)

In [40]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWSIMILAR10(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,related1,related2,related3,related4,related5,related6,related7,related8,related9,related10
0,com.touchtype.swiftkey,2013-07-17,com.aitype.android,com.aitype.android.p,com.beansoft.keyboardplus,com.dasur.slideit.vt.lite,com.fiberthemax.OpQ2keyboard,com.google.android.inputmethod.latin,com.jb.gokeyboard,com.nuance.swype.dtc,com.nuance.swype.trial,com.thaicomcenter.android.tswipepro
1,com.gameloft.android.ANMP.GloftDMHM,2013-07-17,com.blockdot.universal.truffulaShuffula,com.disney.TempleRunOz.goo,com.eamobile.life_row_wf,com.gameloft.android.ANMP.GloftLVMK,com.imangi.templerun,com.imangi.templerun2,com.nekki.vector,com.noodlecake.happyjump,com.retrodreamer.IceCreamJump.android.free,me.pou.app
2,com.ibm.events.android.usga,2013-07-17,air.WatchESPN,com.advancedmobile.android.ghin,com.dogmac.SportsCenter,com.goldsteintech.android,com.golflogix.ui,com.ibm.events.android.usopen,com.myleaderboard.GolfChannel,com.shotzoom.golfshotgps,com.swingbyswing,com.v1.v1golf2
3,com.dama.camera2,2013-07-17,com.almalence.hdr_plus,com.kth.PuddingCamera,com.magix.camera_mx,com.neaststudios.procapture,com.oss.goodcamera2,com.superphotofull,com.zeronoiseapps.secretvideorecorderpro,jp.naver.linecamera.android,uk.co.neilandtheresa.NewVignette,uk.co.neilandtheresa.VignetteNewDemo
4,com.disney.scribblefree_goo,2013-07-17,com.drawcoolman.net.cars,com.drawmonsters.net.dogs,com.drawtube.net.angrybirds,com.ecapycsw.onetouchdrawing,com.flexsolution.scratchguess.logo,com.hitcents.stickmanepicfree,com.omgpop.dstfree,com.openmygame.games.kr.client,com.zynga.draw2.googleplay.free,quess.song.music.pop.quiz
5,com.gau.go.launcherex,2013-07-17,com.BahagHariArts.BlueNeonGoTheme,com.anddoes.launcher,com.campmobile.launcher,com.gau.go.launcherex.theme.WP7Blue,com.gau.go.launcherex.theme.defaultthemethree,com.jiubang.goscreenlock,com.spb.shell3d,com.teslacoilsw.launcher,com.wed.go.launcherex.theme.plate,org.adw.launcher
6,com.teslacoilsw.launcher,2013-07-17,com.anddoes.launcher,com.chrislacy.actionlauncher.pro,com.czarnomorski.theme.dcikonz,com.gau.go.launcherex.key,com.gtp.nextlauncher,com.mobint.hololauncher,com.mobint.hololauncher.hd,kov.theme.ics,org.adw.launcher,org.adwfreak.launcher
7,com.levelup.beautifulwidgets,2013-07-17,cloudtv.hdthemes.colourform,cloudtv.hdwidgets,com.anddoes.fancywidgets,com.anddoes.fancywidgets.unlocker,com.custom.lwp.LBeautifulSea,jmbc.timeWidget.lite,jp.co.vibe.weatherskylivewallpaper,net.hubalek.android.makeyourclock,org.zooper.zwfree,sk.michalec.SimpleDigiClockWidget
8,com.socialnmobile.dictdata.chinese.stardict,2013-07-17,com.akdevelopment.dict.enchinesesimpl.free,com.akdevelopment.dict.enchinesetrad.free,com.akdevelopment.dict.enthai.free,com.bravolol.bravolang.englishchinesecdictionary,com.dict.ec,com.dictionary,com.dreamob.android.encndict,com.embermitre.hanping.app,com.pleco.chinesesystem,socialnmobile.dictdata.bible.dictionary
9,com.mdb.android.xiangqi,2013-07-17,air.com.longo.Xiangqi.android.free,chschess.chess,com.DreamFactory.ChineseChess,com.aichess.googlechess,com.alonsoruibal.chessdroid.lite,com.cchess,com.google.android.chess,com.tosmart.chessroad,com.vndynapp.cotuong,com.xidea.ChineseDarkChess


# NEWSIMILAR15

In [41]:
@require(NEWSIMILAR5)
def NEWSIMILAR15(RAW_FILE,OUT_BASE,COMPRESS_LEVEL):
    return NEWSIMILAR5(RAW_FILE,OUT_BASE,COMPRESS_LEVEL,15)

In [42]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWSIMILAR15(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,related1,related2,related3,related4,related5,related6,related7,related8,related9,related10,related11,related12,related13,related14,related15
0,com.touchtype.swiftkey,2013-07-17,com.aitype.android,com.aitype.android.p,com.aitype.android.tablet,com.beansoft.keyboardplus,com.cootek.smartinputv5,com.dasur.slideit,com.dasur.slideit.vt.lite,com.fiberthemax.OpQ2keyboard,com.google.android.inputmethod.latin,com.jb.gokeyboard,com.nuance.swype.dtc,com.nuance.swype.trial,com.thaicomcenter.android.tswipepro,inputmethod.latin.perfectkeyboard,net.cdeguet.smartkeyboardpro.ko
1,com.gameloft.android.ANMP.GloftDMHM,2013-07-17,com.blockdot.universal.truffulaShuffula,com.byril.battleship,com.disney.TempleRunOz.goo,com.ea.game.simpsons4_row,com.eamobile.life_row_wf,com.gameloft.android.ANMP.GloftLVMK,com.imangi.templerun,com.imangi.templerun2,com.ludia.familyfeudandfriends.free,com.nekki.vector,com.noodlecake.happyjump,com.retrodreamer.IceCreamJump.android.free,me.pou.app,no.dirtybit.funrun,org.orangenose.games
2,com.ibm.events.android.usga,2013-07-17,air.WatchESPN,com.advancedmobile.android.ghin,com.bskyb.sportnews,com.dogmac.SportsCenter,com.fivemobile.thescore,com.goldsteintech.android,com.golflogix.ui,com.ibm.events.android.masters,com.ibm.events.android.usopen,com.mobilefootie.fotmobpro,com.myleaderboard.GolfChannel,com.protrade.sportacular,com.shotzoom.golfshotgps,com.swingbyswing,com.v1.v1golf2
3,com.dama.camera2,2013-07-17,com.almalence.hdr_plus,com.flavionet.android.camera.lite,com.kth.PuddingCamera,com.magix.camera_mx,com.neaststudios.procapture,com.nhn.android.ncamera,com.oss.goodcamera2,com.oss.mcam,com.superphotofull,com.wheadon.photoenhancepro,com.zeronoiseapps.secretvideorecorderpro,jp.naver.linecamera.android,uk.co.neilandtheresa.NewVignette,uk.co.neilandtheresa.VignetteNewDemo,vStudio.Android.Camera360
4,com.disney.scribblefree_goo,2013-07-17,com.drawcoolman.net.cars,com.drawmonsters.net.dogs,com.drawmonsters.net.zombies,com.drawtube.net.angrybirds,com.ecapycsw.onetouchdrawing,com.flexsolution.scratchguess.logo,com.hitcents.stickmanepic,com.hitcents.stickmanepicfree,com.omgpop.dstfree,com.openmygame.games.kr.client,com.scoompa.dstassist,com.timeplusq.drawnguess,com.zynga.draw2.googleplay.free,quess.song.music.pop.quiz,tipitap.coloring.phones
5,com.gau.go.launcherex,2013-07-17,com.BahagHariArts.BlueNeonGoTheme,com.SolarSpark.DotLockerLite,com.anddoes.launcher,com.campmobile.launcher,com.gau.go.launcherex.key,com.gau.go.launcherex.theme.WP7Blue,com.gau.go.launcherex.theme.defaultthemethree,com.jiubang.goscreenlock,com.mobint.hololauncher,com.spb.shell3d,com.teslacoilsw.launcher,com.wed.go.launcherex.theme.plate,org.adw.launcher,org.adwfreak.launcher,theme.go.launcherex.theme.pink.butterfly
6,com.teslacoilsw.launcher,2013-07-17,cloudtv.hdwidgets,com.anddoes.launcher,com.campmobile.launcher,com.chrislacy.actionlauncher.pro,com.czarnomorski.theme.dcikonz,com.dlto.atom.launcher,com.gau.go.launcherex,com.gau.go.launcherex.key,com.gtp.nextlauncher,com.mobint.hololauncher,com.mobint.hololauncher.hd,com.photox.jbhd,kov.theme.ics,org.adw.launcher,org.adwfreak.launcher
7,com.levelup.beautifulwidgets,2013-07-17,cloudtv.hdthemes.colourform,cloudtv.hdwidgets,com.anddoes.fancywidgets,com.anddoes.fancywidgets.unlocker,com.custom.lwp.LBeautifulSea,com.gtp.nextlauncher,com.joko.paperlandpro,com.tomanyz.lockWatchLight,fishnoodle.silhouette_free,jmbc.timeWidget.lite,jp.co.vibe.weatherskylivewallpaper,net.hubalek.android.makeyourclock,nl.jsource.retroclock.android,org.zooper.zwfree,sk.michalec.SimpleDigiClockWidget
8,com.socialnmobile.dictdata.chinese.stardict,2013-07-17,com.akdevelopment.dict.enchinesesimpl.free,com.akdevelopment.dict.enchinesetrad.free,com.akdevelopment.dict.enthai.free,com.bravolol.bravolang.englishchinesecdictionary,com.dict.ec,com.dictionary,com.dreamob.android.encndict,com.embermitre.hanping.app,com.embermitre.hanping.app.lite,com.movinapp.dict.enzh.free,com.pleco.chinesesystem,fr.nghs.android.dictionnaires,livio.pack.lang.en_US,socialnmobile.dictdata.bible.dictionary,socialnmobile.dictdata.english.irregular
9,com.mdb.android.xiangqi,2013-07-17,air.com.longo.Xiangqi.android.free,chschess.chess,cn.voilet.chinesechess,com.DreamFactory.ChineseChess,com.aichess.googlechess,com.alonsoruibal.chessdroid.lite,com.cchess,com.game.chinesechessfree,com.google.android.chess,com.jns.game.socialchinesechess.free,com.mobirix.chess.wgmf,com.tosmart.chessroad,com.vndynapp.cotuong,com.xidea.ChineseDarkChess,com.zingmagic.chinesechessvfree


# NEWDEVELOPER

In [43]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def NEWDEVELOPER(file_in,out_base,compress=1):
    headers = ['app_id','date','dev_id']
    out_filename = out_base.format('NEWDEVELOPER')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            developer = json.get('developer_id','')
            if isinstance(developer,str):
                developer = developer.strip()
                if running.get(json['app_id'],'')!=developer and len(developer)>0:
                    running[json['app_id']] = developer
                    day = datetime.datetime.fromtimestamp(int(json['timestamp']))\
                            .strftime('%Y-%m-%d')
                    obs = (json['app_id'],day,developer)
                    csv_writer.writerow(obs) 
    return out_filename

In [44]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWDEVELOPER(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,dev_id
0,com.touchtype.swiftkey,2013-07-17,SwiftKey
1,com.gameloft.android.ANMP.GloftDMHM,2013-07-17,Gameloft
2,com.ibm.events.android.usga,2013-07-17,"United+States+Golf+Association,+USGA"
3,com.dama.camera2,2013-07-17,JFDP+Labs
4,com.disney.scribblefree_goo,2013-07-17,Disney
5,com.gau.go.launcherex,2013-07-17,GO+Launcher+Dev+Team
6,com.teslacoilsw.launcher,2013-07-17,TeslaCoil+Software
7,com.levelup.beautifulwidgets,2013-07-17,LevelUp+Studio
8,com.socialnmobile.dictdata.chinese.stardict,2013-07-17,Notes
9,com.mdb.android.xiangqi,2013-07-17,MDB+Softwares


# NEWDEVELOPERNAME

In [45]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def NEWDEVELOPERNAME(file_in,out_base,compress=1):
    headers = ['app_id','date','dev_name']
    out_filename = out_base.format('NEWDEVELOPERNAME')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            developer = json.get('developer_name','')
            if isinstance(developer,str):
                developer = developer.strip()
                if running.get(json['app_id'],'')!=developer and len(developer)>0:
                    running[json['app_id']] = developer
                    day = datetime.datetime.fromtimestamp(int(json['timestamp']))\
                            .strftime('%Y-%m-%d')
                    obs = (json['app_id'],day,developer)
                    csv_writer.writerow(obs) 
    return out_filename

In [46]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWDEVELOPERNAME(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,dev_name
0,com.touchtype.swiftkey,2013-07-17,SwiftKey
1,com.gameloft.android.ANMP.GloftDMHM,2013-07-17,Gameloft
2,com.ibm.events.android.usga,2013-07-17,"United States Golf Association, USGA"
3,com.dama.camera2,2013-07-17,JFDP Labs
4,com.disney.scribblefree_goo,2013-07-17,Disney
5,com.gau.go.launcherex,2013-07-17,GO Launcher Dev Team
6,com.teslacoilsw.launcher,2013-07-17,TeslaCoil Software
7,com.levelup.beautifulwidgets,2013-07-17,LevelUp Studio
8,com.socialnmobile.dictdata.chinese.stardict,2013-07-17,Notes
9,com.mdb.android.xiangqi,2013-07-17,MDB Softwares


# NEWCATEGORY

In [47]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def NEWCATEGORY(file_in,out_base,compress=1):
    headers = ['app_id','date','category']
    out_filename = out_base.format('NEWCATEGORY')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            category = json.get('category','')
            if isinstance(category,str):
                category = category.strip().upper()
                if running.get(json['app_id'],'')!=category and len(category)>0:
                    running[json['app_id']] = category
                    day = datetime.datetime.fromtimestamp(int(json['timestamp']))\
                            .strftime('%Y-%m-%d')
                    obs = (json['app_id'],day,category)
                    csv_writer.writerow(obs)
    return out_filename

In [48]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWCATEGORY(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,category
0,com.touchtype.swiftkey,2013-07-17,PRODUCTIVITY
1,com.gameloft.android.ANMP.GloftDMHM,2013-07-17,CASUAL
2,com.ibm.events.android.usga,2013-07-17,SPORTS
3,com.dama.camera2,2013-07-17,PHOTOGRAPHY
4,com.disney.scribblefree_goo,2013-07-17,BRAIN
5,com.gau.go.launcherex,2013-07-17,PERSONALIZATION
6,com.teslacoilsw.launcher,2013-07-17,PERSONALIZATION
7,com.levelup.beautifulwidgets,2013-07-17,PERSONALIZATION
8,com.socialnmobile.dictdata.chinese.stardict,2013-07-17,BOOKS_AND_REFERENCE
9,com.mdb.android.xiangqi,2013-07-17,BRAIN


# NEWPRICE

In [49]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def NEWPRICE(file_in,out_base,compress=1):
    headers = ['app_id','date','price']
    out_filename = out_base.format('NEWPRICE')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            price = str(json.get('price',''))
            if price =='0.00':
                price = '0'
            if running.get(json['app_id'],'')!=price and price != '.' and len(price)>0:
                running[json['app_id']] = price
                day = datetime.datetime.fromtimestamp(int(json['timestamp']))\
                        .strftime('%Y-%m-%d')
                obs = (json['app_id'],day,price)
                csv_writer.writerow(obs)
    return out_filename

In [50]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWPRICE(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,price
0,com.touchtype.swiftkey,2013-07-17,3.99
1,com.gameloft.android.ANMP.GloftDMHM,2013-07-17,0.00
2,com.ibm.events.android.usga,2013-07-17,0.00
3,com.dama.camera2,2013-07-17,2.99
4,com.disney.scribblefree_goo,2013-07-17,0.00
5,com.gau.go.launcherex,2013-07-17,0.00
6,com.teslacoilsw.launcher,2013-07-17,0.00
7,com.levelup.beautifulwidgets,2013-07-17,2.59
8,com.socialnmobile.dictdata.chinese.stardict,2013-07-17,0.00
9,com.mdb.android.xiangqi,2013-07-17,0.00


# NEWVERSION

In [51]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def NEWVERSION(file_in,out_base,compress=1):
    headers = ['app_id','date','version']
    out_filename = out_base.format('NEWVERSION')
    with gzip.open(out_filename,'wt',compress,) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        running = {}
        nowhite = re.compile(r"\s+")
        for json in iter_json_gzip(file_in):
            version = json.get('version','')
            if isinstance(version,str):
                version = version.strip()
                if len(version)>0:
                    version = version.replace('Varies with device','varies')
                    version = nowhite.sub('',version).lower()
                    if running.get(json['app_id'],'')!=version and len(version)>0:
                        running[json['app_id']] = version
                        day = datetime.datetime.fromtimestamp(int(json['timestamp']))\
                                .strftime('%Y-%m-%d')
                        obs = (json['app_id'],day,version)
                        csv_writer.writerow(obs)
    return out_filename

In [52]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWVERSION(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,version
0,com.touchtype.swiftkey,2013-07-17,varies
1,com.gameloft.android.ANMP.GloftDMHM,2013-07-17,1.0.0
2,com.ibm.events.android.usga,2013-07-17,2.2
3,com.dama.camera2,2013-07-17,1.0.6
4,com.disney.scribblefree_goo,2013-07-17,1.0.2
5,com.gau.go.launcherex,2013-07-17,varies
6,com.teslacoilsw.launcher,2013-07-17,2.1.1
7,com.levelup.beautifulwidgets,2013-07-17,5.3.1
8,com.socialnmobile.dictdata.chinese.stardict,2013-07-17,1.0.4
9,com.mdb.android.xiangqi,2013-07-17,2.6.12


# NEWUPDATED

In [53]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def NEWUPDATED(file_in,out_base,compress=1):
    headers = ['app_id','updated',]
    out_filename = out_base.format('NEWUPDATED')
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        running = {}
        nowhite = re.compile(r"\s+")
        for json in iter_json_gzip(file_in):
            updated = json.get('updated','')
            if isinstance(updated,str):
                updated = updated.strip()
                if running.get(json['app_id'],'')!=updated and len(updated)>0:
                    running[json['app_id']] = updated
                    date = datetime.datetime.strptime(updated,'%B %d, %Y')
                    date = date.strftime('%Y-%m-%d')
                    obs = (json['app_id'],date)
                    csv_writer.writerow(obs)
    return out_filename

In [54]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWUPDATED(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,updated
0,com.touchtype.swiftkey,2013-07-01
1,com.gameloft.android.ANMP.GloftDMHM,2013-07-16
2,com.ibm.events.android.usga,2013-06-14
3,com.dama.camera2,2013-07-03
4,com.disney.scribblefree_goo,2013-07-03
5,com.gau.go.launcherex,2013-07-17
6,com.teslacoilsw.launcher,2013-06-11
7,com.levelup.beautifulwidgets,2013-07-16
8,com.socialnmobile.dictdata.chinese.stardict,2009-10-14
9,com.mdb.android.xiangqi,2013-05-16


# NEWREQ

In [55]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def NEWREQ(file_in,out_base,compress=1):
    headers = ['app_id','date','requires']
    out_filename = out_base.format('NEWREQ')
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        running = {}
        nowhite = re.compile(r"\s+")
        and_up = re.compile(r" and up")
        for json in iter_json_gzip(file_in):
            requires = json.get('requires','')
            if isinstance(requires,str):
                if len(requires)>0:
                    requires = requires.lower().strip()
                    requires = requires.replace('varies with device','varies')
                    requires = and_up.sub('+',requires)
                    requires = nowhite.sub('',requires)
                    if running.get(json['app_id'],'')!=requires and len(requires)>0:
                        running[json['app_id']] = requires
                        day = datetime.datetime.fromtimestamp(int(json['timestamp']))\
                                .strftime('%Y-%m-%d')
                        obs = (json['app_id'],day,requires)
                        csv_writer.writerow(obs)
    return out_filename

In [56]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWREQ(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,requires
0,com.touchtype.swiftkey,2013-07-17,varies
1,com.gameloft.android.ANMP.GloftDMHM,2013-07-17,2.3+
2,com.ibm.events.android.usga,2013-07-17,2.2+
3,com.dama.camera2,2013-07-17,4.0.3+
4,com.disney.scribblefree_goo,2013-07-17,2.2+
5,com.gau.go.launcherex,2013-07-17,varies
6,com.teslacoilsw.launcher,2013-07-17,4.0+
7,com.levelup.beautifulwidgets,2013-07-17,2.2+
8,com.socialnmobile.dictdata.chinese.stardict,2013-07-17,1.5+
9,com.mdb.android.xiangqi,2013-07-17,1.5+


# NEWINSTALL

In [57]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def NEWINSTALL(file_in,out_base,compress=1):
    headers = ['app_id','date','installs']
    out_filename = out_base.format('NEWINSTALL')
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        nowhite = re.compile(r"\s+")
        running = {}
        for json in iter_json_gzip(file_in):
            installs = json.get('installs','')
            if isinstance(installs,str):
                installs = nowhite.sub('',installs)
                if running.get(json['app_id'],'')!=installs and len(installs)>0:
                    running[json['app_id']] = installs
                    day = datetime.datetime.fromtimestamp(int(json['timestamp']))\
                            .strftime('%Y-%m-%d')
                    obs = (json['app_id'],day,installs)
                    csv_writer.writerow(obs)
    return out_filename

In [58]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWINSTALL(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,installs
0,com.touchtype.swiftkey,2013-07-17,"1,000,000-5,000,000"
1,com.gameloft.android.ANMP.GloftDMHM,2013-07-17,"10,000,000-50,000,000"
2,com.ibm.events.android.usga,2013-07-17,"100,000-500,000"
3,com.dama.camera2,2013-07-17,"10,000-50,000"
4,com.disney.scribblefree_goo,2013-07-17,"500,000-1,000,000"
5,com.gau.go.launcherex,2013-07-17,"50,000,000-100,000,000"
6,com.teslacoilsw.launcher,2013-07-17,"5,000,000-10,000,000"
7,com.levelup.beautifulwidgets,2013-07-17,"1,000,000-5,000,000"
8,com.socialnmobile.dictdata.chinese.stardict,2013-07-17,"100,000-500,000"
9,com.mdb.android.xiangqi,2013-07-17,"10,000-50,000"


# NEWNAME

In [59]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def NEWNAME(file_in,out_base,compress=1):
    headers = ['app_id', 'date', 'name'] # list of column names
    out_filename = out_base.format('NEWNAME')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            name = json.get('app_name','')
            if isinstance(name,str):
                name = name.strip()
                if running.get(json['app_id'],'')!=name and len(name)>0:
                    running[json['app_id']] = name
                    obs = [json['app_id'], str(datetime.date.fromtimestamp(int(json.get('timestamp', '')))), name]
                    csv_writer.writerow(obs)
    return out_filename

In [60]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWNAME(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,name
0,com.touchtype.swiftkey,2013-07-17,SwiftKey Keyboard
1,com.gameloft.android.ANMP.GloftDMHM,2013-07-17,Despicable Me
2,com.ibm.events.android.usga,2013-07-17,U.S. Open Golf Championship
3,com.dama.camera2,2013-07-17,Camera 2
4,com.disney.scribblefree_goo,2013-07-17,ScribbleMix
5,com.gau.go.launcherex,2013-07-17,GO Launcher EX
6,com.teslacoilsw.launcher,2013-07-17,Nova Launcher
7,com.levelup.beautifulwidgets,2013-07-17,Beautiful Widgets Pro
8,com.socialnmobile.dictdata.chinese.stardict,2013-07-17,English Chinese Dictionary 辞典
9,com.mdb.android.xiangqi,2013-07-17,中国象棋 : The chinese chess


# NEWINAPP

In [61]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def NEWINAPP(file_in,out_base,compress=1):
    headers = ['app_id', 'date', 'has_inapp'] # list of column names
    out_filename = out_base.format('NEWINAPP')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            if int(json.get('timestamp')) < 1402901976: #June 15 2014 11:59 PM
                continue
            name = str(json.get('in_app', ''))
            if name.isdigit():
                name = str(bool(int(name)))
            else:
                name = ''
            if isinstance(name,str):
                if running.get(json['app_id'],'')!=name and len(name)>0:
                    running[json['app_id']] = name
                    obs = [json['app_id'], str(datetime.date.fromtimestamp(int(json.get('timestamp', '')))), name]
                    csv_writer.writerow(obs)
    return out_filename

In [62]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWINAPP(RAW_FILE_NEWDATA,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,has_inapp
0,com.icbs.catchme.english,2015-02-26,True
1,com.iceflakestudios.piraterun,2015-02-26,True
2,com.iccwct202012.iccworldt202012,2015-02-26,True
3,com.iceapps.animalpictures,2015-02-26,True
4,com.iceberg.flippyjet,2015-02-26,True
5,com.iceberg.testdrivecar2,2015-02-26,True
6,com.iceberg.testdrivecar3,2015-02-26,True
7,com.iceberg.testdrivecar4,2015-02-26,True
8,com.iceberg.testdrivecar6,2015-02-26,True
9,com.iceberg.testdrivecar5,2015-02-26,True


# DAILYSCRAPE

In [63]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def DAILYSCRAPE(file_in,out_base,compress=1):
    headers = ['app_id', 'date'] # list of column names
    out_filename = out_base.format('DAILYSCRAPE')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            day = str(datetime.date.fromtimestamp(json.get('timestamp','')))
            if isinstance(day,str):
                day = day.strip()
                if running.get(json['app_id'],'')!=day and len(day)>0:
                    running[json['app_id']] = day
                    obs = [json['app_id'], day]
                    csv_writer.writerow(obs)
    return out_filename

In [64]:
df_debug = None
if DEBUG:
    df_debug = read_csv(DAILYSCRAPE(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date
0,com.touchtype.swiftkey,2013-07-17
1,com.gameloft.android.ANMP.GloftDMHM,2013-07-17
2,com.ibm.events.android.usga,2013-07-17
3,com.dama.camera2,2013-07-17
4,com.disney.scribblefree_goo,2013-07-17
5,com.gau.go.launcherex,2013-07-17
6,com.teslacoilsw.launcher,2013-07-17
7,com.levelup.beautifulwidgets,2013-07-17
8,com.socialnmobile.dictdata.chinese.stardict,2013-07-17
9,com.mdb.android.xiangqi,2013-07-17


# MONTHLYSCRAPE

In [65]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def MONTHLYSCRAPE(file_in,out_base,compress=1):
    headers = ['app_id', 'year', 'month'] # list of column names
    out_filename = out_base.format('MONTHLYSCRAPE')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            month = (str(datetime.date.fromtimestamp(json.get('timestamp','')).month).strip(), str(datetime.date.fromtimestamp(json.get('timestamp','')).year).strip())
            if isinstance(month,tuple):
                if running.get(json['app_id'],'')!=month and len(month)>0:
                    running[json['app_id']] = month
                    obs = [json['app_id'], month[1], month[0]]
                    csv_writer.writerow(obs)
    return out_filename

In [66]:
df_debug = None
if DEBUG:
    df_debug = read_csv(MONTHLYSCRAPE(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,year,month
0,com.touchtype.swiftkey,2013,7
1,com.gameloft.android.ANMP.GloftDMHM,2013,7
2,com.ibm.events.android.usga,2013,7
3,com.dama.camera2,2013,7
4,com.disney.scribblefree_goo,2013,7
5,com.gau.go.launcherex,2013,7
6,com.teslacoilsw.launcher,2013,7
7,com.levelup.beautifulwidgets,2013,7
8,com.socialnmobile.dictdata.chinese.stardict,2013,7
9,com.mdb.android.xiangqi,2013,7


# NEWSIZE

In [67]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def NEWSIZE(file_in,out_base,compress=1):
    headers = ['app_id', 'date', 'size'] # list of column names
    out_filename = out_base.format('NEWSIZE')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            size = json.get('size','').lower()
            size = size.replace('varies with device','varies')
            if isinstance(size,str):
                size = size.strip()
                if running.get(json['app_id'],'')!=size and len(size)>0:
                    running[json['app_id']] = size
                    obs = [json['app_id'], str(datetime.date.fromtimestamp(int(json.get('timestamp', '')))), size]
                    csv_writer.writerow(obs)
    return out_filename

In [68]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWSIZE(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,size
0,com.touchtype.swiftkey,2013-07-17,varies
1,com.gameloft.android.ANMP.GloftDMHM,2013-07-17,11m
2,com.ibm.events.android.usga,2013-07-17,11m
3,com.dama.camera2,2013-07-17,5.1m
4,com.disney.scribblefree_goo,2013-07-17,16m
5,com.gau.go.launcherex,2013-07-17,varies
6,com.teslacoilsw.launcher,2013-07-17,2.9m
7,com.levelup.beautifulwidgets,2013-07-17,12m
8,com.socialnmobile.dictdata.chinese.stardict,2013-07-17,1.1m
9,com.mdb.android.xiangqi,2013-07-17,1.0m


# WEEKLYSCRAPE

In [69]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def WEEKLYSCRAPE(file_in,out_base,compress=1):
    headers = ['app_id', 'year', 'week'] # list of column names
    out_filename = out_base.format('WEEKLYSCRAPE')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            date = (str(datetime.date.fromtimestamp(json.get('timestamp','')).year), str(datetime.date.fromtimestamp(json.get('timestamp','')).isocalendar()[1]))
            if isinstance(date,tuple):
                if running.get(json['app_id'],'')!=date and len(date)>0:
                    running[json['app_id']] = date
                    obs = [json['app_id'], date[0], date[1]]
                    csv_writer.writerow(obs)
    return out_filename

In [70]:
df_debug = None
if DEBUG:
    df_debug = read_csv(WEEKLYSCRAPE(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,year,week
0,com.touchtype.swiftkey,2013,29
1,com.gameloft.android.ANMP.GloftDMHM,2013,29
2,com.ibm.events.android.usga,2013,29
3,com.dama.camera2,2013,29
4,com.disney.scribblefree_goo,2013,29
5,com.gau.go.launcherex,2013,29
6,com.teslacoilsw.launcher,2013,29
7,com.levelup.beautifulwidgets,2013,29
8,com.socialnmobile.dictdata.chinese.stardict,2013,29
9,com.mdb.android.xiangqi,2013,29


#SNAPSHOTQUARTERLY

In [None]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def SNAPSHOTQUARTERLY(file_in,out_base,compress=1):
    headers = ['app_id', 'quarter'] # list of column names
    out_filename = out_base.format('SNAPSHOTQUARTERLY')
    running = {}
    year = datetime.date.fromtimestamp(json.get('timestamp', '')).year
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            date = (str(datetime.date.fromtimestamp(json.get('timestamp','')).year), str(datetime.date.fromtimestamp(json.get('timestamp','')).isocalendar()[1]))
            quarter = str(date.year)+'-Q'+str(date.month/4+1)
            if isinstance(quarter, str):
                if running.get(json['app_id'],'')!=quarter and len(quarter)>0:
                    running[json['app_id']] = quarter
                    obs = [json['app_id'], quarter]
                    csv_writer.writerow(obs)
    return out_filename

In [None]:
df_debug = None
if DEBUG:
    df_debug = read_csv(SNAPSHOTQUARTERLY(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

# Run them all!

In [71]:
print("starting processing Google Data Request")

starting processing Google Data Request


In [72]:
specific_tables = """
NEWNAME
NEWINAPP
DAILYSCRAPE
MONTHLYSCRAPE
WEEKLYSCRAPE
""".split()

all_tables = [x for x in globals() 
              if callable(globals()[x]) 
              and x.isupper()]
all_tables

['NEWUPDATED',
 'MONTHLYSCRAPE',
 'WEEKLYRATINGS',
 'NEWSIMILAR5',
 'NEWPRIVACYDOMAIN',
 'DAILYSCRAPE',
 'NEWCATEGORY',
 'NEWSIZE',
 'NEWWEBSITE',
 'NEWPRICE',
 'WEEKLYSCRAPE',
 'NEWREQ',
 'NEWINSTALL',
 'NEWDEVELOPERNAME',
 'DAILYRATINGS',
 'MONTHLYRATINGS',
 'NEWINAPP',
 'NEWVERSION',
 'NEWDEVELOPER',
 'NEWNAME',
 'NEWSIMILAR10',
 'NEWSIMILAR15']

In [73]:
def run_function(f,RAW_FILE=RAW_FILE,OUT_BASE=OUT_BASE,COMPRESS_LEVEL=COMPRESS_LEVEL):
    return f(RAW_FILE ,OUT_BASE,COMPRESS_LEVEL)

for i,file in enumerate(map(run_function,[globals()[x] for x in all_tables])):
    print(i+1,file)

1 /home/cgaray/data/out/MONTHLYRATINGS__google_play__main__2015_02_26_23_12.csv.gz
2 /home/cgaray/data/out/NEWDEVELOPERNAME__google_play__main__2015_02_26_23_12.csv.gz
3 /home/cgaray/data/out/NEWINAPP__google_play__main__2015_02_26_23_12.csv.gz
4 /home/cgaray/data/out/NEWSIMILAR5__google_play__main__2015_02_26_23_12.csv.gz
5 /home/cgaray/data/out/NEWCATEGORY__google_play__main__2015_02_26_23_12.csv.gz
6 /home/cgaray/data/out/NEWINSTALL__google_play__main__2015_02_26_23_12.csv.gz
7 /home/cgaray/data/out/WEEKLYSCRAPE__google_play__main__2015_02_26_23_12.csv.gz
8 /home/cgaray/data/out/NEWREQ__google_play__main__2015_02_26_23_12.csv.gz
9 /home/cgaray/data/out/MONTHLYSCRAPE__google_play__main__2015_02_26_23_12.csv.gz
10 /home/cgaray/data/out/NEWPRIVACYDOMAIN__google_play__main__2015_02_26_23_12.csv.gz
11 /home/cgaray/data/out/NEWUPDATED__google_play__main__2015_02_26_23_12.csv.gz
12 /home/cgaray/data/out/NEWDEVELOPER__google_play__main__2015_02_26_23_12.csv.gz
13 /home/cgaray/data/out/NEWVE

In [74]:
print("DONE")

DONE


In [75]:
print(len(all_tables))

22
