# Initialize Environment

### Compatibility notes

The following code is only compatible with a python 3.4 kernel and the notebook must be opened with ipython 3.0+.

In [1]:
from pdb import set_trace as debug
from pandas.io.parsers import read_csv
from IPython.parallel import Client,require
from collections import Counter

In [2]:
import datetime
import re
import csv
import gzip
import ujson

In [3]:
#--> RAW_FILE_DIR with the directory that contains google_play__main.json.gz
#--> OUT_DIR with the directory of where you want the processed files to go
#--> DEBUG whether to print out created CSV files or not
#--> COMPRESS_LEVEL gzip compression level. warning: severely impacts runtime.
#--> LINE_LIMIT how many lines to iterate through in the raw file. For debugging.
DUMP_DATE = '2015_02_26_23_12'
RAW_FILE_DIR = '/data'
OUT_DIR = '/raid/out'
DEBUG = 1
COMPRESS_LEVEL = 1
LINE_LIMIT = 1000
PARALLEL = 1
PIGZ = 0

In [4]:
# Don't change these!
RAW_FILE = '{}/apple_ios__main.json.gz'.format(RAW_FILE_DIR)
OUT_BASE = OUT_DIR+'/{}__apple_ios__main__'+DUMP_DATE+'.csv.gz'

In [5]:
if PIGZ:
    @require(gzip)
    def iter_json_gzip(filename,LINE_LIMIT=LINE_LIMIT):
        return gzip.iter_json_gzip(filename,LINE_LIMIT=LINE_LIMIT)
        
else:
    @require(gzip,ujson)
    def iter_json_gzip(filename,LINE_LIMIT=LINE_LIMIT):
        with gzip.open(filename,'rt') as file_iter:
            for c,line in enumerate(file_iter):
                if c > LINE_LIMIT and LINE_LIMIT>0:
                    break
                if isinstance(line,str):
                    if len(line)>0:        
                        out = ujson.loads(line)
                        if isinstance(out,dict):
                            if 'app_id' in out and 'timestamp' in out:
                                yield out

In [6]:
# Print a couple observations to inspect daw data.
# !zcat "$RAW_FILE"|head -n 100 | tail -n2

# Initialize Parallel Computing

In [None]:
if PARALLEL:
    ipython_parallel = Client()
    print("{} active computing engines".format(len(ipython_parallel.ids)))

    lbv = ipython_parallel.load_balanced_view()

    map = lambda f,itertable:lbv.map(f,itertable,\
    block =False,\
    ordered =False)

    @require('socket')
    def host(dummy):
        return socket.gethostname()
    
    nodes = list(lbv.map(host,ipython_parallel.ids))

    print(dict(Counter(nodes).items()))

# NEWWEBSITE

In [7]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def NEWWEBSITE(file_in,out_base,compress=1):
    headers = ['app_id','date','url']
    out_filename = out_base.format('NEWWEBSITE')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        http = re.compile(r"(http://)|(https://)")
        www = re.compile(r"^www\.")
        ending = re.compile(r"\.(a-z)(.*?)^")
        bad = re.compile(r"\.[a-z]+(.*?)$")
        for json in iter_json_gzip(file_in):
            website = json.get('websiteUrl','')
            if isinstance(website,str):
                website = http.sub('',website).lower()
                website = www.sub('',website)
                website = website.split(r"/")[0]
                bad_str = bad.findall(website)
                if bad_str:
                    website = website.replace(bad_str[0],'')
                if running.get(json['app_id'],'')!=website and len(website)>0:
                    running[json['app_id']] = website
                    day = datetime.datetime.\
                    fromtimestamp(int(json['timestamp']))\
                            .strftime('%Y-%m-%d').strip()
                    obs = (json['app_id'],day,website)
                    csv_writer.writerow(obs)
    return out_filename

In [8]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWWEBSITE(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,url
0,477033794,2013-03-27,mehlau.net
1,465608546,2013-03-27,samratamin.com
2,333208122,2013-03-27,clickerapp.wordpress
3,564550227,2013-03-27,adictum.com
4,366947521,2013-03-27,europhonie.com
5,382827187,2013-03-27,blackstoneaudio.com
6,477679542,2013-03-27,hep-druginteractions.org
7,439268906,2013-03-27,browser-sms.com
8,449609348,2013-03-27,fox3app.com
9,421620053,2013-03-27,pocketbiblia.com


# NEWSUPPORTURL

In [9]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def NEWSUPPORTURL(file_in,out_base,compress=1):
    headers = ['app_id','date','support_url']
    out_filename = out_base.format('NEWSUPPORTURL')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        http = re.compile(r"(http://)|(https://)")
        www = re.compile(r"^www\.")
        ending = re.compile(r"\.(a-z)(.*?)^")
        bad = re.compile(r"\.[a-z]+(.*?)$")
        for json in iter_json_gzip(file_in):
            website = json.get('supportUrl','')
            if isinstance(website,str):
                website = http.sub('',website).lower()
                website = www.sub('',website)
                website = website.split(r"/")[0]
                bad_str = bad.findall(website)
                if bad_str:
                    website = website.replace(bad_str[0],'')
                if running.get(json['app_id'],'')!=website and len(website)>0:
                    running[json['app_id']] = website
                    day = datetime.datetime.\
                    fromtimestamp(int(json['timestamp']))\
                            .strftime('%Y-%m-%d').strip()
                    obs = (json['app_id'],day,website)
                    csv_writer.writerow(obs)
    return out_filename

In [10]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWSUPPORTURL(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,support_url
0,477033794,2013-03-27,mehlau.net
1,494296216,2013-03-27,cardisport.com
2,465608546,2013-03-27,samratamin.com
3,498191169,2013-03-27,depiltech.com
4,333208122,2013-03-27,clickerapp.wordpress
5,447548412,2013-03-27,horizoninc.blogspot
6,568386333,2013-03-27,thedigitalnotebookcom.ning
7,564550227,2013-03-27,adictum.com
8,555915492,2013-03-27,sebast.in
9,480092632,2013-03-27,app.xinmin


# NEWSIMILAR5

In [11]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def NEWSIMILAR5(file_in,out_base,compress=1,number_similar = 5):
    headers = ['app_id','date']+ ['related{}'.format(x) \
                                  for x in range(1,number_similar+1)]
    out_filename = out_base.format('NEWSIMILAR{}'.format(number_similar))
    running = {}
    with gzip.open(out_filename,'wt',9) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            related = json.get('customersAlsoBoughtApps',[])
            if related:
                related = sorted(map(str, related[:number_similar]))
                joined = ",".join(related)
                if running.get(json['app_id'],'')!=joined:
                    running[json['app_id']] = joined
                    day = datetime.datetime.\
                    fromtimestamp(int(json['timestamp']))\
                            .strftime('%Y-%m-%d').strip()
                    csv_writer.writerow([json['app_id'],day]+ related)
    return out_filename

In [12]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWSIMILAR5(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,related1,related2,related3,related4,related5
0,477033794,2013-03-27,463172573,476111536,477118369,480263826,486193487
1,494296216,2013-03-27,366078960,369126115,374898077,375665838,448951342
2,498191169,2013-03-27,437230078,495633812,495634760,497355840,502822501
3,333208122,2013-03-27,299133914,306936948,322326573,346602357,355240912
4,447548412,2013-03-27,416481779,420293260,424369605,447348181,449447049
5,564550227,2013-03-27,347920422,435068192,483708404,516961984,561743558
6,366947521,2013-03-27,284675296,288362576,290853822,364736446,364898723
7,487978078,2013-03-27,399512777,404929463,472339511,489434033,491633903
8,477679542,2013-03-27,451730311,461838768,466516740,466545022,495049076
9,523502051,2013-03-27,532084783,533680443,534949476,536288546,537851997


#NEWSIMILAR10

In [13]:
@require(NEWSIMILAR5)
def NEWSIMILAR10(RAW_FILE,OUT_BASE,COMPRESS_LEVEL):
    return NEWSIMILAR5(RAW_FILE,OUT_BASE,COMPRESS_LEVEL,10)

In [14]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWSIMILAR10(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,related1,related2,related3,related4,related5,related6,related7,related8,related9,related10
0,477033794,2013-03-27,455573319,463172573,469793113,473985418,474107923,476111536,477118369,477798242,480263826,486193487
1,494296216,2013-03-27,318133983,350597352,356928178,364303981,366078960,369126115,374898077,375665838,448951342,
2,498191169,2013-03-27,437230078,450464530,495633812,495634760,497355840,502822501,,,,
3,333208122,2013-03-27,299133914,306936948,322326573,325403123,325507000,330482882,343532880,346602357,348642100,355240912
4,447548412,2013-03-27,364823150,377875279,387434956,416481779,420293260,424369605,447312716,447348181,449447049,449564960
5,564550227,2013-03-27,347920422,401797164,435068192,474768200,483708404,514363426,516961984,535664708,561743558,570615299
6,366947521,2013-03-27,284675296,288362576,290853822,364736446,364898723,,,,,
7,487978078,2013-03-27,386325076,399512777,404929463,433397856,447727693,449141235,472339511,486796173,489434033,491633903
8,477679542,2013-03-27,338597829,371961189,381789014,397417517,451730311,461838768,466516740,466545022,494198191,495049076
9,523502051,2013-03-27,507769562,532084783,532663833,533680443,534949476,536288546,537062589,537851997,538445329,539983826


# NEWSIMILAR15

In [15]:
@require(NEWSIMILAR5)
def NEWSIMILAR15(RAW_FILE,OUT_BASE,COMPRESS_LEVEL):
    return NEWSIMILAR5(RAW_FILE,OUT_BASE,COMPRESS_LEVEL,15)

In [16]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWSIMILAR15(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,related1,related2,related3,related4,related5,related6,related7,related8,related9,related10,related11,related12,related13,related14,related15
0,477033794,2013-03-27,433576141,455573319,463172573,469793113,473237097,473985418,474107923,476111536,476134298,477118369,477798242,477917670,480263826,486193487,488756266
1,494296216,2013-03-27,318133983,350597352,356928178,364303981,366078960,369126115,374898077,375665838,448951342,,,,,,
2,498191169,2013-03-27,437230078,450464530,495633812,495634760,497355840,502822501,,,,,,,,,
3,333208122,2013-03-27,299133914,306936948,322326573,325403123,325507000,330482882,343532880,346602357,348642100,355240912,,,,,
4,447548412,2013-03-27,364197515,364823150,377631532,377875279,387434956,394578286,410521340,416481779,420293260,423778662,424369605,447312716,447348181,449447049,449564960
5,564550227,2013-03-27,347920422,369741034,401797164,403588421,405338085,435068192,456471024,474768200,481898807,483708404,514363426,516961984,535664708,561743558,570615299
6,366947521,2013-03-27,284675296,288362576,290853822,364736446,364898723,,,,,,,,,,
7,487978078,2013-03-27,339711087,386325076,399512777,404929463,411288619,419614880,433397856,438379240,447727693,449141235,455072135,472339511,486796173,489434033,491633903
8,477679542,2013-03-27,338597829,371961189,381789014,397417517,429500130,451730311,461838768,466516740,466545022,474197854,475736262,488876970,494198191,494288509,495049076
9,523502051,2013-03-27,507769562,524144163,532084783,532663833,533680443,534949476,535116158,536288546,537062589,537851997,538445329,539710081,539983826,541840574,542241980


# DAILYRATINGS

In [17]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def DAILYRATINGS(file_in,out_base,compress=1):
    headers = ['app_id','date','rating5','rating4','rating3','rating2','rating1']
    out_filename = out_base.format('DAILYRATINGS')
    unique_daily_rating = set()
    with gzip.open(out_filename,'wt',9) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            day = datetime.datetime.fromtimestamp(int(json['timestamp']))\
                                .strftime('%Y-%m-%d')
            rating = json.get('ratingCountList',None)
            if rating:
                obs = (json['app_id'],day,tuple(rating),)
                if hash(obs) not in unique_daily_rating:
                    unique_daily_rating.add(hash(obs))
                    csv_writer.writerow([json['app_id'],day]+rating)
    return out_filename

In [18]:
df_debug = None
if DEBUG:
    df_debug = read_csv(DAILYRATINGS(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,rating5,rating4,rating3,rating2,rating1
0,333208122,2013-03-27,233,99,112,76,319
1,447548412,2013-03-27,1,1,1,0,9
2,333378940,2013-03-27,3,0,2,0,4
3,494794787,2013-03-27,3,1,1,0,5
4,469990268,2013-03-27,44,3,13,13,74
5,568421135,2013-03-27,15,5,8,10,29
6,288373421,2013-03-27,55,33,25,17,30
7,346672821,2013-03-27,4,3,7,10,44
8,353415141,2013-03-27,17,9,4,3,14
9,327519114,2013-03-27,35,11,12,4,7


# DAILYRATINGSCURRENT

In [19]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def DAILYRATINGSCURRENT(file_in,out_base,compress=1):
    headers = ['app_id','date','rating5','rating4','rating3','rating2','rating1']
    out_filename = out_base.format('DAILYRATINGSCURRENT')
    unique_daily_rating = set()
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            day = datetime.datetime.fromtimestamp(int(json['timestamp']))\
                                .strftime('%Y-%m-%d')
            rating = json.get('ratingCountList_current',None)
            if rating:
                obs = (json['app_id'],day,tuple(rating),)
                if hash(obs) not in unique_daily_rating:
                    unique_daily_rating.add(hash(obs))
                    csv_writer.writerow([json['app_id'],day]+rating)
    return out_filename

In [20]:
# Will be empty because this variable wasn't scraped for the first couple million observations
df_debug = None
if DEBUG:
    df_debug = read_csv(DAILYRATINGSCURRENT(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,rating5,rating4,rating3,rating2,rating1


# WEEKLYRATINGS

In [21]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def WEEKLYRATINGS(file_in,out_base,compress=1):
    headers = ['app_id','year','week','rating5','rating4','rating3','rating2','rating1']
    out_filename = out_base.format('WEEKLYRATINGS')
    unique_daily_rating = set()
    with gzip.open(out_filename,'wt',9) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            year,week = datetime.datetime.fromtimestamp(int(json['timestamp']))\
                                .isocalendar()[:2]
            rating = json.get('ratingCountList',None)
            if rating:
                obs = (json['app_id'],year,week)
                if hash(obs) not in unique_daily_rating:
                    unique_daily_rating.add(hash(obs))
                    csv_writer.writerow([json['app_id'],year,week]+rating)
    return out_filename

In [22]:
df_debug = None
if DEBUG:
    df_debug = read_csv(WEEKLYRATINGS(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,year,week,rating5,rating4,rating3,rating2,rating1
0,333208122,2013,13,233,99,112,76,319
1,447548412,2013,13,1,1,1,0,9
2,333378940,2013,13,3,0,2,0,4
3,494794787,2013,13,3,1,1,0,5
4,469990268,2013,13,44,3,13,13,74
5,568421135,2013,13,15,5,8,10,29
6,288373421,2013,13,55,33,25,17,30
7,346672821,2013,13,4,3,7,10,44
8,353415141,2013,13,17,9,4,3,14
9,327519114,2013,13,35,11,12,4,7


# WEEKLYRATINGSCURRENT

In [23]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def WEEKLYRATINGSCURRENT(file_in,out_base,compress=1):
    headers = ['app_id','year','week','rating5','rating4','rating3','rating2','rating1']
    out_filename = out_base.format('WEEKLYRATINGSCURRENT')
    unique_daily_rating = set()
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            year,week = datetime.datetime.fromtimestamp(int(json['timestamp']))\
                                .isocalendar()[:2]
            rating = json.get('ratingCountList_current',None)
            if rating:
                obs = (json['app_id'],year,week)
                if hash(obs) not in unique_daily_rating:
                    unique_daily_rating.add(hash(obs))
                    csv_writer.writerow([json['app_id'],year,week]+rating)
    return out_filename

In [24]:
df_debug = None
if DEBUG:
    df_debug = read_csv(WEEKLYRATINGSCURRENT(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,year,week,rating5,rating4,rating3,rating2,rating1


# MONTHLYRATINGS

In [25]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def MONTHLYRATINGS(file_in,out_base,compress=1):
    headers = ['app_id','year','month','rating5','rating4','rating3','rating2','rating1']
    out_filename = out_base.format('MONTHLYRATINGS')
    unique_daily_rating = set()
    with gzip.open(out_filename,'wt',9) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            dateobj = datetime.datetime.fromtimestamp(int(json['timestamp']))
            year,month = dateobj.year,dateobj.month
            rating = json.get('ratingCountList',None)
            if rating:
                obs = (json['app_id'],year,month)
                if hash(obs) not in unique_daily_rating:
                    unique_daily_rating.add(hash(obs))
                    csv_writer.writerow([json['app_id'],year,month]+rating)
    return out_filename

In [26]:
df_debug = None
if DEBUG:
    df_debug = read_csv(MONTHLYRATINGS(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,year,month,rating5,rating4,rating3,rating2,rating1
0,333208122,2013,3,233,99,112,76,319
1,447548412,2013,3,1,1,1,0,9
2,333378940,2013,3,3,0,2,0,4
3,494794787,2013,3,3,1,1,0,5
4,469990268,2013,3,44,3,13,13,74
5,568421135,2013,3,15,5,8,10,29
6,288373421,2013,3,55,33,25,17,30
7,346672821,2013,3,4,3,7,10,44
8,353415141,2013,3,17,9,4,3,14
9,327519114,2013,3,35,11,12,4,7


# NEWDEVELOPER

In [27]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def NEWDEVELOPER(file_in,out_base,compress=1):
    headers = ['app_id','date','dev_id']
    out_filename = out_base.format('NEWDEVELOPER')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            developer = str(json.get('artist_id',''))
            developer = developer.strip()
            if running.get(json['app_id'],'')!=developer \
                        and len(developer)>0 \
                        and developer.isdigit():
                running[json['app_id']] = developer
                day = datetime.datetime.\
                fromtimestamp(int(json['timestamp']))\
                        .strftime('%Y-%m-%d').strip()
                obs = (json['app_id'],day,developer)
                csv_writer.writerow(obs)
    return out_filename

In [28]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWDEVELOPER(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,dev_id
0,477033794,2013-03-27,477033797
1,494296216,2013-03-27,494296219
2,465608546,2013-03-27,403287102
3,498191169,2013-03-27,498191172
4,333208122,2013-03-27,328282863
5,447548412,2013-03-27,431586531
6,568386333,2013-03-27,568386336
7,564550227,2013-03-27,542898284
8,555915492,2013-03-27,447033331
9,480092632,2013-03-27,414617230


# NEWCATEGORY

In [29]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def NEWCATEGORY(file_in,out_base,compress=1):
    headers = ['app_id','date']+ ['category{}'.format(x) \
                                  for x in range(1,5)]
    out_filename = out_base.format('NEWCATEGORY')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            category = json.get('categories',[])[:4]
            joined = ",".join(category)
            if running.get(json['app_id'],'')!=joined:
                running[json['app_id']] = joined
                day = datetime.datetime.\
                fromtimestamp(int(json['timestamp']))\
                        .strftime('%Y-%m-%d').strip()
                csv_writer.writerow([json['app_id'],day]+ category)
    return out_filename

In [30]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWCATEGORY(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,category1,category2,category3,category4
0,477033794,2013-03-27,Music,Utilities,,
1,494296216,2013-03-27,Sports,Education,,
2,465608546,2013-03-27,Utilities,Entertainment,,
3,498191169,2013-03-27,Medical,Health & Fitness,,
4,333208122,2013-03-27,Utilities,Education,,
5,447548412,2013-03-27,Education,Educational,Games,Kids
6,568386333,2013-03-27,Social Networking,,,
7,564550227,2013-03-27,Music,Education,,
8,555915492,2013-03-27,Games,Family,Entertainment,Kids
9,480092632,2013-03-27,Games,Kids,Family,


# NEWPRICE

In [31]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def NEWPRICE(file_in,out_base,compress=1):
    headers = ['app_id','date','price']
    out_filename = out_base.format('NEWPRICE')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            price = str(json.get('price','')).replace("$","")
            if running.get(json['app_id'],'')!=price:
                running[json['app_id']] = price
                day = datetime.datetime.\
                fromtimestamp(int(json['timestamp']))\
                        .strftime('%Y-%m-%d').strip()
                obs = (json['app_id'],day,price)
                csv_writer.writerow(obs) 
    return out_filename

In [32]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWPRICE(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,price
0,477033794,2013-03-27,0.00
1,494296216,2013-03-27,3.99
2,465608546,2013-03-27,5.99
3,498191169,2013-03-27,0.00
4,333208122,2013-03-27,0.00
5,447548412,2013-03-27,2.99
6,568386333,2013-03-27,0.00
7,564550227,2013-03-27,0.00
8,555915492,2013-03-27,0.99
9,480092632,2013-03-27,0.99


# ALLVERSIONS

In [33]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def ALLVERSIONS(file_in,out_base,compress=1):
    headers = ['app_id','date','version_string',\
#                'release_notes'\
              ]
    out_filename = out_base.format('ALLVERSIONS')
    unique_ver = set()
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            for each_version in json.get('version',[]):
                # need extra code because Apple changed format
                if 'release-date' in each_version:
                    date = each_version.get('release-date','')
                    date = datetime.datetime.strptime(date,'%b %d, %Y')
                    date = date.strftime('%Y-%m-%d')
                    version_string = each_version.get('version-string','')
                    release_notes = each_version.get('release-notes','')
                elif 'releaseDate' in each_version:
                    date = each_version.get('releaseDate','')
                    date = datetime.datetime.strptime(date,'%Y-%m-%dT%H:%M:%SZ')
                    date = date.strftime('%Y-%m-%d')
                    version_string = each_version.get('versionString','')
                    release_notes = each_version.get('releaseNotes','')
                obs_u = (json['app_id'],date,version_string)
                if hash(obs_u) not in unique_ver:
                    unique_ver.add(hash(obs_u))
                    obs = (json['app_id'],date,
                           version_string,
#                            release_notes,
                          )
                    csv_writer.writerow(obs)
    return out_filename

In [34]:
df_debug = None
if DEBUG:
    df_debug = read_csv(ALLVERSIONS(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,version_string
0,477033794,2011-11-04,1.0
1,494296216,2012-11-06,1.8.3
2,494296216,2012-09-10,1.8.2
3,494296216,2012-02-09,1.7.1
4,494296216,2012-01-26,1.7
5,465608546,2012-06-06,1.1
6,465608546,2011-09-22,1.0
7,498191169,2012-05-23,1.2
8,498191169,2012-03-06,1.1
9,498191169,2012-02-28,1.0


# NEWREQ

In [35]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def NEWREQ(file_in,out_base,compress=1):
    headers = ['app_id','date','requires','optimized']
    out_filename = out_base.format('NEWREQ')
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file, dialect='excel')
        csv_writer.writerow(headers)
        running = {}
        and_re = re.compile(r"and ")
        com_with_re = re.compile(r"Compatible with ")
        optimize_re = re.compile(r"This app is optimized for ")
        for json in iter_json_gzip(file_in):
            requires = json.get('requirements',None)
            if requires == None:
                continue
            re_split = requires.split(".")
            requires = com_with_re.sub('',re_split[0]).strip()
            requires = and_re.sub('',requires)
            optimized = None
            if len(re_split)>1:
                optimized = re_split[1].strip()
                optimized = optimize_re.sub("",optimized)
            if running.get(json['app_id'],'')!=json.get('requirements',''):
                running[json['app_id']] = json.get('requirements','')
                day = datetime.datetime.fromtimestamp(int(json['timestamp']))\
                        .strftime('%Y-%m-%d')
                obs = (json['app_id'],day,requires,optimized)
                csv_writer.writerow(obs)
    return out_filename

In [36]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWREQ(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,requires,optimized
0,477033794,2013-03-27,"iPhone 3GS, iPhone 4, iPhone 4S, iPhone 5, iPo...",
1,494296216,2013-03-27,"iPhone, iPod touch, iPad",
2,465608546,2013-03-27,iPad,
3,498191169,2013-03-27,"iPhone 3GS, iPhone 4, iPhone 4S, iPhone 5, iPo...",
4,333208122,2013-03-27,"iPhone, iPod touch, iPad",
5,447548412,2013-03-27,"iPhone, iPod touch, iPad",
6,568386333,2013-03-27,"iPhone, iPod touch, iPad",iPhone 5
7,564550227,2013-03-27,"iPhone, iPod touch, iPad",
8,555915492,2013-03-27,"iPhone, iPod touch, iPad",
9,480092632,2013-03-27,"iPhone, iPod touch, iPad",


# NEWNAME

In [37]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def NEWNAME(file_in,out_base,compress=1):
    headers = ['app_id', 'date', 'name'] # list of column names
    out_filename = out_base.format('NEWNAME')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            # get some attribute, cast to strting if need be
            # what you're "remmebering" can also be a tuple (not list, they're not hashable)
            name = json.get('app_name','')
            if isinstance(name,str):
                name = name.strip()
                # double check that it's a valid field, non empty and not seen, etc.
                if running.get(json['app_id'],'')!=name and len(name)>0:
                    # SOME CODE HERE
                    running[json['app_id']] = name
                    obs = [json['app_id'], str(datetime.date.fromtimestamp(int(json.get('timestamp', '')))), name]
                    csv_writer.writerow(obs)
    return out_filename

In [38]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWNAME(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,name
0,477033794,2013-03-27,AcousticsCalc
1,494296216,2013-03-27,SportBeeper Interval 3.6s
2,465608546,2013-03-27,Ardumote HD
3,498191169,2013-03-27,Depil Tech
4,333208122,2013-03-27,Clicker Training Lite
5,447548412,2013-03-27,Multiplication !
6,568386333,2013-03-27,My Digital Notebook
7,564550227,2013-03-27,Adictum Piano Lessons Free
8,555915492,2013-03-27,Find Kitty - Hide and Seek Preschool Game
9,480092632,2013-03-27,^o^Spy Tortoise^o^


# NEWINAPP

In [53]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def NEWINAPP(file_in,out_base,compress=1):
    headers = ['app_id', 'date', 'has_inapp'] # list of column names
    out_filename = out_base.format('NEWINAPP')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            # get some attribute, cast to strting if need be
            # what you're "remmebering" can also be a tuple (not list, they're not hashable)
            name = str(json.get('hasInAppPurchases',''))
            if isinstance(name,str):
                # double check that it's a valid field, non empty and not seen, etc.
                if running.get(json['app_id'],'')!=name and len(name)>0:
                    # SOME CODE HERE
                    running[json['app_id']] = name
                    obs = [json['app_id'], str(datetime.date.fromtimestamp(int(json.get('timestamp', '')))), name]
                    csv_writer.writerow(obs)
    return out_filename

In [54]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWINAPP(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,has_inapp
0,477033794,2013-03-27,False
1,494296216,2013-03-27,False
2,465608546,2013-03-27,False
3,498191169,2013-03-27,False
4,333208122,2013-03-27,False
5,447548412,2013-03-27,False
6,568386333,2013-03-27,False
7,564550227,2013-03-27,False
8,555915492,2013-03-27,False
9,480092632,2013-03-27,False


# DAILYSCRAPE

In [41]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def DAILYSCRAPE(file_in,out_base,compress=1):
    headers = ['app_id', 'date'] # list of column names
    out_filename = out_base.format('DAILYSCRAPE')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            # get some attribute, cast to strting if need be
            # what you're "remmebering" can also be a tuple (not list, they're not hashable)
            day = str(datetime.date.fromtimestamp(json.get('timestamp','')))
            if isinstance(day,str):
                day = day.strip()
                # double check that it's a valid field, non empty and not seen, etc.
                if running.get(json['app_id'],'')!=day and len(day)>0:
                    # SOME CODE HERE
                    running[json['app_id']] = day
                    obs = [json['app_id'], day]
                    csv_writer.writerow(obs)
    return out_filename

In [42]:
df_debug = None
if DEBUG:
    df_debug = read_csv(DAILYSCRAPE(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,day
0,477033794,27
1,494296216,27
2,465608546,27
3,498191169,27
4,333208122,27
5,447548412,27
6,568386333,27
7,564550227,27
8,555915492,27
9,480092632,27


# MONTHLYSCRAPE

In [43]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def MONTHLYSCRAPE(file_in,out_base,compress=1):
    headers = ['app_id', 'month'] # list of column names
    out_filename = out_base.format('MONTHLYSCRAPE')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            # get some attribute, cast to strting if need be
            # what you're "remmebering" can also be a tuple (not list, they're not hashable)
            month = str(datetime.date.fromtimestamp(json.get('timestamp','')).month)
            if isinstance(month,str):
                month = month.strip()
                # double check that it's a valid field, non empty and not seen, etc.
                if running.get(json['app_id'],'')!=month and len(month)>0:
                    # SOME CODE HERE
                    running[json['app_id']] = month
                    obs = [json['app_id'], month]
                    csv_writer.writerow(obs)
    return out_filename

In [44]:
df_debug = None
if DEBUG:
    df_debug = read_csv(MONTHLYSCRAPE(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,month
0,477033794,3
1,494296216,3
2,465608546,3
3,498191169,3
4,333208122,3
5,447548412,3
6,568386333,3
7,564550227,3
8,555915492,3
9,480092632,3


# WEEKLYSCRAPE

In [57]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def WEEKLYSCRAPE(file_in,out_base,compress=1):
    headers = ['app_id', 'year', 'week'] # list of column names
    out_filename = out_base.format('WEEKLYSCRAPE')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            # get some attribute, cast to strting if need be
            # what you're "remmebering" can also be a tuple (not list, they're not hashable)
            date = (str(datetime.date.fromtimestamp(json.get('timestamp','')).year), str(datetime.date.fromtimestamp(json.get('timestamp','')).isocalendar()[1]))
            if isinstance(date,tuple):
                # double check that it's a valid field, non empty and not seen, etc.
                if running.get(json['app_id'],'')!=date and len(date)>0:
                    # SOME CODE HERE
                    running[json['app_id']] = date
                    obs = [json['app_id'], date[0], date[1]]
                    csv_writer.writerow(obs)
    return out_filename

In [58]:
df_debug = None
if DEBUG:
    df_debug = read_csv(WEEKLYSCRAPE(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,year,week
0,477033794,2013,13
1,494296216,2013,13
2,465608546,2013,13
3,498191169,2013,13
4,333208122,2013,13
5,447548412,2013,13
6,568386333,2013,13
7,564550227,2013,13
8,555915492,2013,13
9,480092632,2013,13


# NEWLANGUAGES

In [47]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def NEWLANGUAGES(file_in,out_base,compress=1):
    #assumes no more than 6 langs, script will still work if >6 langs but no headers on csv file for langs 7+
    headers = ['app_id', 'date', 'language1', 'language2', 'language3', 'language4', 'language5', 'language6'] # list of column names
    out_filename = out_base.format('NEWNAME')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            # get some attribute, cast to strting if need be
            # what you're "remmebering" can also be a tuple (not list, they're not hashable)
            langs = tuple(sorted(tuple(json.get('languages','').split(','))))
            if isinstance(langs,tuple):
                # double check that it's a valid field, non empty and not seen, etc.
                if running.get(json['app_id'],'')!=langs and langs != (''):
                    # SOME CODE HERE
                    running[json['app_id']] = langs
                    obs = [json['app_id'], str(datetime.date.fromtimestamp(int(json.get('timestamp', ''))))]
                    [obs.append(str(x).strip()) for x in langs]
                    csv_writer.writerow(obs)
    return out_filename

In [48]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWLANGUAGES(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,language1,language2,language3,language4,language5,language6
0,477033794,2013-03-27,,,,,,
1,494296216,2013-03-27,,,,,,
2,465608546,2013-03-27,,,,,,
3,498191169,2013-03-27,,,,,,
4,333208122,2013-03-27,,,,,,
5,447548412,2013-03-27,,,,,,
6,568386333,2013-03-27,,,,,,
7,564550227,2013-03-27,,,,,,
8,555915492,2013-03-27,,,,,,
9,480092632,2013-03-27,,,,,,


# NEWSIZE

In [55]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def NEWSIZE(file_in,out_base,compress=1):
    headers = ['app_id', 'date', 'size'] # list of column names
    out_filename = out_base.format('NEWSIZE')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            # get some attribute, cast to strting if need be
            # what you're "remmebering" can also be a tuple (not list, they're not hashable)
            size = json.get('size','')
            if isinstance(size,int):
                # double check that it's a valid field, non empty and not seen, etc.
                if running.get(json['app_id'],'')!=size and size>0:
                    # SOME CODE HERE
                    running[json['app_id']] = size
                    obs = [json['app_id'], str(datetime.date.fromtimestamp(int(json.get('timestamp', '')))), size]
                    csv_writer.writerow(obs)
    return out_filename

In [56]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWSIZE(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,size
0,477033794,2013-03-27,101240
1,494296216,2013-03-27,2813286
2,465608546,2013-03-27,524597
3,498191169,2013-03-27,1945161
4,333208122,2013-03-27,1538564
5,447548412,2013-03-27,5933548
6,568386333,2013-03-27,7839711
7,564550227,2013-03-27,7648369
8,555915492,2013-03-27,4927664
9,480092632,2013-03-27,5181683


# Run them all!

In [21]:
programs = """
NEWNAME
NEWINAPP
DAILYSCRAPE
MONTHLYSCRAPE
WEEKLYSCRAPE
""".split()

def run_function(f,RAW_FILE=RAW_FILE,OUT_BASE=OUT_BASE,COMPRESS_LEVEL=COMPRESS_LEVEL):
    return f(RAW_FILE ,OUT_BASE,COMPRESS_LEVEL)

for file in map(run_function,[globals()[x] for x in programs]):
    print(file)

/raid/out/NEWINAPP__apple_ios__main__2015_02_26_23_12.csv.gz
/raid/out/NEWINAPP__apple_ios__main__2015_02_26_23_12.csv.gz
/raid/out/NEWNAME__apple_ios__main__2015_02_26_23_12.csv.gz
/raid/out/NEWINAPP__apple_ios__main__2015_02_26_23_12.csv.gz
/raid/out/NEWINAPP__apple_ios__main__2015_02_26_23_12.csv.gz


In [22]:
print("DONE")

DONE
