# Initialize Environment

### Compatibility notes

The following code is only compatible with a python 3.4 kernel and the notebook must be opened with ipython 3.0+.

In [1]:
from pdb import set_trace as debug
from pandas.io.parsers import read_csv
import pandas
from IPython.parallel import Client,require
from collections import Counter

In [2]:
import datetime
import re
import csv
import gzip
import ujson

In [12]:
#--> RAW_FILE_DIR with the directory that contains google_play__main.json.gz
#--> OUT_DIR with the directory of where you want the processed files to go
#--> DEBUG whether to print out created CSV files or not
#--> COMPRESS_LEVEL gzip compression level. warning: severely impacts runtime.
#--> LINE_LIMIT how many lines to iterate through in the raw file. For debugging.
DUMP_DATE = '2015_02_26_23_12'
RAW_FILE_DIR = '/data'
OUT_DIR = '/raid/out'
DEBUG = 1
COMPRESS_LEVEL = 9
LINE_LIMIT = 1000
PARALLEL = 1
PIGZ = 0
OBS_SKIP = 100
OBS_PRINT = 5

In [4]:
# Don't change these!
RAW_FILE_NEWDATA = '{}/appletail.json.gz'.format(RAW_FILE_DIR)
RAW_FILE = '{}/apple_ios__main.json.gz'.format(RAW_FILE_DIR)
OUT_BASE = OUT_DIR+'/{}__apple_ios__main__'+DUMP_DATE+'.csv.gz'

In [13]:
if PIGZ:
    @require(gzip)
    def iter_json_gzip(filename,LINE_LIMIT=LINE_LIMIT):
        return gzip.iter_json_gzip(filename,LINE_LIMIT=LINE_LIMIT)    
else:
    @require(gzip,ujson)
    def iter_json_gzip(filename,LINE_LIMIT=LINE_LIMIT):
        with gzip.open(filename,'rt') as file_iter:
            for c,line in enumerate(file_iter):
                if c > LINE_LIMIT and LINE_LIMIT>0:
                    break
                if isinstance(line,str):
                    if len(line)>0:        
                        out = ujson.loads(line)
                        if isinstance(out,dict):
                            if 'app_id' in out and 'timestamp' in out:
                                yield out

# Print Sample Observations

In [6]:
try:
    from sh import zcat,head,tail
    print(tail(head(zcat(RAW_FILE_NEWDATA, _piped =True),
                    "-n{}".format(OBS_SKIP)
                   ),"-n{}".format(OBS_PRINT)))
except:
    pass

# Initialize Parallel Computing

In [7]:
# Configured for: equity
if PARALLEL:
    ipython_parallel = Client()
    print("{} active computing engines".format(len(ipython_parallel.ids)))

    lbv = ipython_parallel.load_balanced_view()

    map = lambda f,itertable:lbv.map(f,itertable,\
    block =False,\
    ordered =False)

    @require('socket')
    def host(dummy):
        return socket.gethostname()

    nodes = list(lbv.map(host,ipython_parallel.ids))
    nodes = [int(x.split('equity')[1].split(".")[0]) for x in nodes]
    nodes = nodes

    print(sorted(list(Counter(nodes).items())))

57 active computing engines
[(2, 3), (5, 5), (6, 6), (8, 6), (10, 9), (11, 7), (12, 4), (17, 17)]


# NEWWEBSITE

In [8]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def NEWWEBSITE(file_in,out_base,compress=1):
    headers = ['app_id','date','url']
    out_filename = out_base.format('NEWWEBSITE')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        http = re.compile(r"(http://)|(https://)")
        www = re.compile(r"^www\.")
        ending = re.compile(r"\.(a-z)(.*?)^")
        bad = re.compile(r"\.[a-z]+(.*?)$")
        for json in iter_json_gzip(file_in):
            website = json.get('websiteUrl','')
            if isinstance(website,str):
                website = http.sub('',website).lower()
                website = www.sub('',website)
                website = website.split(r"/")[0]
                bad_str = bad.findall(website)
                if bad_str:
                    website = website.replace(bad_str[0],'')
                if running.get(json['app_id'],'')!=website and len(website)>0:
                    running[json['app_id']] = website
                    day = datetime.datetime.\
                    fromtimestamp(int(json['timestamp']))\
                            .strftime('%Y-%m-%d').strip()
                    obs = (json['app_id'],day,website)
                    csv_writer.writerow(obs)
    return out_filename

In [9]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWWEBSITE(RAW_FILE_NEWDATA,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,url
0,554943499,2015-02-19,appstore.liv
1,626090799,2015-02-19,triposo.com
2,554946247,2015-02-19,themurdocks.com
3,554961300,2015-02-19,pocketmags.com
4,554948318,2015-02-19,childrenbibleapp.com
5,554949951,2015-02-19,gig-bar.at
6,626042451,2015-02-19,2billiard.com
7,554948564,2015-02-19,inetnose.com
8,554963282,2015-02-19,palet.dk
9,554950152,2015-02-19,childrenbibleapp.com


# NEWSUPPORTURL

In [10]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def NEWSUPPORTURL(file_in,out_base,compress=1):
    headers = ['app_id','date','support_url']
    out_filename = out_base.format('NEWSUPPORTURL')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        http = re.compile(r"(http://)|(https://)")
        www = re.compile(r"^www\.")
        ending = re.compile(r"\.(a-z)(.*?)^")
        bad = re.compile(r"\.[a-z]+(.*?)$")
        for json in iter_json_gzip(file_in):
            website = json.get('supportUrl','')
            if isinstance(website,str):
                website = http.sub('',website).lower()
                website = www.sub('',website)
                website = website.split(r"/")[0]
                bad_str = bad.findall(website)
                if bad_str:
                    website = website.replace(bad_str[0],'')
                if running.get(json['app_id'],'')!=website and len(website)>0:
                    running[json['app_id']] = website
                    day = datetime.datetime.\
                    fromtimestamp(int(json['timestamp']))\
                            .strftime('%Y-%m-%d').strip()
                    obs = (json['app_id'],day,website)
                    csv_writer.writerow(obs)
    return out_filename

In [11]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWSUPPORTURL(RAW_FILE_NEWDATA,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,support_url
0,554951624,2015-02-19,smc.seoul
1,554950974,2015-02-19,pauluz.com
2,554943499,2015-02-19,appstore.liv
3,554955624,2015-02-19,markspaeth.com
4,626127894,2015-02-19,rmadrid.onfocus
5,554948858,2015-02-19,eon.com
6,626090799,2015-02-19,triposo.com
7,554964569,2015-02-19,zeberka.pl
8,554936718,2015-02-19,treelev.com
9,554946247,2015-02-19,ambar.org


# NEWSIMILAR5

In [12]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def NEWSIMILAR5(file_in,out_base,compress=1,number_similar = 5):
    headers = ['app_id','date']+ ['related{}'.format(x) \
                                  for x in range(1,number_similar+1)]
    out_filename = out_base.format('NEWSIMILAR{}'.format(number_similar))
    running = {}
    with gzip.open(out_filename,'wt',9) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            related = json.get('customersAlsoBoughtApps',[])
            if related:
                related = sorted(map(str, related[:number_similar]))
                joined = ",".join(related)
                if running.get(json['app_id'],'')!=joined:
                    running[json['app_id']] = joined
                    day = datetime.datetime.\
                    fromtimestamp(int(json['timestamp']))\
                            .strftime('%Y-%m-%d').strip()
                    csv_writer.writerow([json['app_id'],day]+ related)
    return out_filename

In [13]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWSIMILAR5(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,related1,related2,related3,related4,related5
0,281656475,2010-09-06,284939601,284947616,353410847,363658120,364298872
1,281736535,2010-09-06,284939601,284947616,361536763,363658120,363998989
2,281861187,2010-09-06,284449213,286948844,293647966,340141710,364907966
3,281889893,2010-09-06,284947616,361536763,363592836,363658120,363998989
4,281893011,2010-09-06,284939601,284947616,361536763,363658120,363799703
5,281962101,2010-09-06,363205965,364763214,364883734,364894339,365149275
6,282737873,2010-09-06,287958963,289039914,363479668,364683781,364821057
7,282738621,2010-09-06,290008445,302498559,363507998,364733950,364904019
8,282750724,2010-09-06,284980812,289814055,294190055,312759829,363205965
9,282758413,2010-09-06,284947616,288720707,296083171,359914600,363448914


#NEWSIMILAR10

In [14]:
@require(NEWSIMILAR5)
def NEWSIMILAR10(RAW_FILE,OUT_BASE,COMPRESS_LEVEL):
    return NEWSIMILAR5(RAW_FILE,OUT_BASE,COMPRESS_LEVEL,10)

In [15]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWSIMILAR10(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,related1,related2,related3,related4,related5,related6,related7,related8,related9,related10
0,281656475,2010-09-06,284939601,284947616,353410847,363658120,364298872,,,,,
1,281736535,2010-09-06,284939601,284947616,361536763,363658120,363998989,,,,,
2,281861187,2010-09-06,284449213,286948844,293647966,340141710,364907966,,,,,
3,281889893,2010-09-06,284947616,361536763,363592836,363658120,363998989,,,,,
4,281893011,2010-09-06,284939601,284947616,361536763,363658120,363799703,,,,,
5,281962101,2010-09-06,363205965,364763214,364883734,364894339,365149275,,,,,
6,282737873,2010-09-06,287958963,289039914,363479668,364683781,364821057,,,,,
7,282738621,2010-09-06,290008445,302498559,363507998,364733950,364904019,,,,,
8,282750724,2010-09-06,284980812,289814055,294190055,312759829,363205965,,,,,
9,282758413,2010-09-06,284947616,288720707,296083171,359914600,363448914,,,,,


# NEWSIMILAR15

In [16]:
@require(NEWSIMILAR5)
def NEWSIMILAR15(RAW_FILE,OUT_BASE,COMPRESS_LEVEL):
    return NEWSIMILAR5(RAW_FILE,OUT_BASE,COMPRESS_LEVEL,15)

In [17]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWSIMILAR15(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,related1,related2,related3,related4,related5,related6,related7,related8,related9,related10,related11,related12,related13,related14,related15
0,281656475,2010-09-06,284939601,284947616,353410847,363658120,364298872,,,,,,,,,,
1,281736535,2010-09-06,284939601,284947616,361536763,363658120,363998989,,,,,,,,,,
2,281861187,2010-09-06,284449213,286948844,293647966,340141710,364907966,,,,,,,,,,
3,281889893,2010-09-06,284947616,361536763,363592836,363658120,363998989,,,,,,,,,,
4,281893011,2010-09-06,284939601,284947616,361536763,363658120,363799703,,,,,,,,,,
5,281962101,2010-09-06,363205965,364763214,364883734,364894339,365149275,,,,,,,,,,
6,282737873,2010-09-06,287958963,289039914,363479668,364683781,364821057,,,,,,,,,,
7,282738621,2010-09-06,290008445,302498559,363507998,364733950,364904019,,,,,,,,,,
8,282750724,2010-09-06,284980812,289814055,294190055,312759829,363205965,,,,,,,,,,
9,282758413,2010-09-06,284947616,288720707,296083171,359914600,363448914,,,,,,,,,,


# DAILYRATINGS

In [18]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def DAILYRATINGS(file_in,out_base,compress=1):
    headers = ['app_id','date','rating5','rating4','rating3','rating2','rating1']
    out_filename = out_base.format('DAILYRATINGS')
    unique_daily_rating = set()
    with gzip.open(out_filename,'wt',9) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            day = datetime.datetime.fromtimestamp(int(json['timestamp']))\
                                .strftime('%Y-%m-%d')
            rating = json.get('ratingCountList',None)
            if rating:
                obs = (json['app_id'],day,tuple(rating),)
                if hash(obs) not in unique_daily_rating:
                    unique_daily_rating.add(hash(obs))
                    csv_writer.writerow([json['app_id'],day]+rating)
    return out_filename

In [19]:
df_debug = None
if DEBUG:
    df_debug = read_csv(DAILYRATINGS(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,rating5,rating4,rating3,rating2,rating1
0,281656475,2010-09-06,725,381,592,594,2099
1,281704574,2010-09-06,74643,33583,62135,59975,193223
2,281736535,2010-09-06,4116,3426,6104,6230,11533
3,281790044,2010-09-06,15693,10780,9632,4290,6877
4,281796108,2010-09-06,12044,7815,9851,7073,11630
5,281816692,2010-09-06,3125,2558,3050,1923,2771
6,281826146,2010-09-06,2657,665,764,599,1384
7,281861187,2010-09-06,18,13,20,21,35
8,281889893,2010-09-06,430,310,427,270,732
9,281893011,2010-09-06,959,1118,2010,1827,2391


# DAILYRATINGSCURRENT

In [20]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def DAILYRATINGSCURRENT(file_in,out_base,compress=1):
    headers = ['app_id','date','rating5','rating4','rating3','rating2','rating1']
    out_filename = out_base.format('DAILYRATINGSCURRENT')
    unique_daily_rating = set()
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            day = datetime.datetime.fromtimestamp(int(json['timestamp']))\
                                .strftime('%Y-%m-%d')
            rating = json.get('ratingCountList_current',None)
            if rating:
                obs = (json['app_id'],day,tuple(rating),)
                if hash(obs) not in unique_daily_rating:
                    unique_daily_rating.add(hash(obs))
                    csv_writer.writerow([json['app_id'],day]+rating)
    return out_filename

In [21]:
# Will be empty because this variable wasn't scraped for the first couple million observations
df_debug = None
if DEBUG:
    df_debug = read_csv(DAILYRATINGSCURRENT(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,rating5,rating4,rating3,rating2,rating1
0,281656475,2010-09-06,136,82,116,139,434
1,281704574,2010-09-06,643,227,419,445,2088
2,281736535,2010-09-06,68,50,93,102,235
3,281790044,2010-09-06,84,41,63,42,76
4,281796108,2010-09-06,47,26,31,25,55
5,281816692,2010-09-06,38,22,45,41,62
6,281826146,2010-09-06,125,36,23,29,61
7,281861187,2010-09-06,2,0,4,6,5
8,281889893,2010-09-06,32,5,14,6,38
9,281893011,2010-09-06,5,3,7,5,16


# WEEKLYRATINGS

In [22]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def WEEKLYRATINGS(file_in,out_base,compress=1):
    headers = ['app_id','year','week','rating5','rating4','rating3','rating2','rating1']
    out_filename = out_base.format('WEEKLYRATINGS')
    unique_daily_rating = set()
    with gzip.open(out_filename,'wt',9) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            year,week = datetime.datetime.fromtimestamp(int(json['timestamp']))\
                                .isocalendar()[:2]
            rating = json.get('ratingCountList',None)
            if rating:
                obs = (json['app_id'],year,week)
                if hash(obs) not in unique_daily_rating:
                    unique_daily_rating.add(hash(obs))
                    csv_writer.writerow([json['app_id'],year,week]+rating)
    return out_filename

In [23]:
df_debug = None
if DEBUG:
    df_debug = read_csv(WEEKLYRATINGS(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,year,week,rating5,rating4,rating3,rating2,rating1
0,281656475,2010,36,725,381,592,594,2099
1,281704574,2010,36,74643,33583,62135,59975,193223
2,281736535,2010,36,4116,3426,6104,6230,11533
3,281790044,2010,36,15693,10780,9632,4290,6877
4,281796108,2010,36,12044,7815,9851,7073,11630
5,281816692,2010,36,3125,2558,3050,1923,2771
6,281826146,2010,36,2657,665,764,599,1384
7,281861187,2010,36,18,13,20,21,35
8,281889893,2010,36,430,310,427,270,732
9,281893011,2010,36,959,1118,2010,1827,2391


# WEEKLYRATINGSCURRENT

In [24]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def WEEKLYRATINGSCURRENT(file_in,out_base,compress=1):
    headers = ['app_id','year','week','rating5','rating4','rating3','rating2','rating1']
    out_filename = out_base.format('WEEKLYRATINGSCURRENT')
    unique_daily_rating = set()
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            year,week = datetime.datetime.fromtimestamp(int(json['timestamp']))\
                                .isocalendar()[:2]
            rating = json.get('ratingCountList_current',None)
            if rating:
                obs = (json['app_id'],year,week)
                if hash(obs) not in unique_daily_rating:
                    unique_daily_rating.add(hash(obs))
                    csv_writer.writerow([json['app_id'],year,week]+rating)
    return out_filename

In [25]:
df_debug = None
if DEBUG:
    df_debug = read_csv(WEEKLYRATINGSCURRENT(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,year,week,rating5,rating4,rating3,rating2,rating1
0,281656475,2010,36,136,82,116,139,434
1,281704574,2010,36,643,227,419,445,2088
2,281736535,2010,36,68,50,93,102,235
3,281790044,2010,36,84,41,63,42,76
4,281796108,2010,36,47,26,31,25,55
5,281816692,2010,36,38,22,45,41,62
6,281826146,2010,36,125,36,23,29,61
7,281861187,2010,36,2,0,4,6,5
8,281889893,2010,36,32,5,14,6,38
9,281893011,2010,36,5,3,7,5,16


# MONTHLYRATINGS

In [26]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def MONTHLYRATINGS(file_in,out_base,compress=1):
    headers = ['app_id','year','month','rating5','rating4','rating3','rating2','rating1']
    out_filename = out_base.format('MONTHLYRATINGS')
    unique_daily_rating = set()
    with gzip.open(out_filename,'wt',9) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            dateobj = datetime.datetime.fromtimestamp(int(json['timestamp']))
            year,month = dateobj.year,dateobj.month
            rating = json.get('ratingCountList',None)
            if rating:
                obs = (json['app_id'],year,month)
                if hash(obs) not in unique_daily_rating:
                    unique_daily_rating.add(hash(obs))
                    csv_writer.writerow([json['app_id'],year,month]+rating)
    return out_filename

In [27]:
df_debug = None
if DEBUG:
    df_debug = read_csv(MONTHLYRATINGS(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,year,month,rating5,rating4,rating3,rating2,rating1
0,281656475,2010,9,725,381,592,594,2099
1,281704574,2010,9,74643,33583,62135,59975,193223
2,281736535,2010,9,4116,3426,6104,6230,11533
3,281790044,2010,9,15693,10780,9632,4290,6877
4,281796108,2010,9,12044,7815,9851,7073,11630
5,281816692,2010,9,3125,2558,3050,1923,2771
6,281826146,2010,9,2657,665,764,599,1384
7,281861187,2010,9,18,13,20,21,35
8,281889893,2010,9,430,310,427,270,732
9,281893011,2010,9,959,1118,2010,1827,2391


# NEWDEVELOPER

In [28]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def NEWDEVELOPER(file_in,out_base,compress=1):
    headers = ['app_id','date','dev_id']
    out_filename = out_base.format('NEWDEVELOPER')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            developer = str(json.get('artist_id',''))
            developer = developer.strip()
            if running.get(json['app_id'],'')!=developer \
                        and len(developer)>0 \
                        and developer.isdigit():
                running[json['app_id']] = developer
                day = datetime.datetime.\
                fromtimestamp(int(json['timestamp']))\
                        .strftime('%Y-%m-%d').strip()
                obs = (json['app_id'],day,developer)
                csv_writer.writerow(obs)
    return out_filename

In [29]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWDEVELOPER(RAW_FILE_NEWDATA,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,dev_id
0,554951624,2015-02-19,444447999
1,554950974,2015-02-19,554950977
2,554943499,2015-02-19,497327246
3,554955624,2015-02-19,865392107
4,626127894,2015-02-19,502370580
5,554948858,2015-02-19,419138166
6,626090799,2015-02-19,415611465
7,554964569,2015-02-19,554964572
8,554936718,2015-02-19,432324275
9,554946247,2015-02-19,367064753


# NEWCATEGORY

In [30]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def NEWCATEGORY(file_in,out_base,compress=1):
    headers = ['app_id','date']+ ['category{}'.format(x) \
                                  for x in range(1,5)]
    out_filename = out_base.format('NEWCATEGORY')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            category = json.get('categories',[])[:4]
            joined = ",".join(category)
            if running.get(json['app_id'],'')!=joined:
                running[json['app_id']] = joined
                day = datetime.datetime.\
                fromtimestamp(int(json['timestamp']))\
                        .strftime('%Y-%m-%d').strip()
                csv_writer.writerow([json['app_id'],day]+ category)
    return out_filename

In [31]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWCATEGORY(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,category1,category2,category3,category4
0,281656475,2010-09-06,Games,,,
1,281704574,2010-09-06,Social Networking,,,
2,281736535,2010-09-06,Games,,,
3,281790044,2010-09-06,Navigation,,,
4,281796108,2010-09-06,Productivity,,,
5,281816692,2010-09-06,Weather,,,
6,281826146,2010-09-06,Business,,,
7,281861187,2010-09-06,Games,,,
8,281889893,2010-09-06,Games,,,
9,281893011,2010-09-06,Games,,,


# NEWPRICE

In [32]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def NEWPRICE(file_in,out_base,compress=1):
    headers = ['app_id','date','price']
    out_filename = out_base.format('NEWPRICE')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            price = str(json.get('price','')).replace("$","")
            if running.get(json['app_id'],'')!=price:
                running[json['app_id']] = price
                day = datetime.datetime.\
                fromtimestamp(int(json['timestamp']))\
                        .strftime('%Y-%m-%d').strip()
                obs = (json['app_id'],day,price)
                csv_writer.writerow(obs) 
    return out_filename

In [33]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWPRICE(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,price
0,281656475,2010-09-06,0.99
1,281704574,2010-09-06,0.00
2,281736535,2010-09-06,2.99
3,281790044,2010-09-06,0.00
4,281796108,2010-09-06,0.00
5,281816692,2010-09-06,0.00
6,281826146,2010-09-06,0.00
7,281861187,2010-09-06,1.99
8,281889893,2010-09-06,7.99
9,281893011,2010-09-06,7.99


# ALLVERSIONS

In [34]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def ALLVERSIONS(file_in,out_base,compress=1):
    headers = ['app_id','date','version_string',\
#                'release_notes'\
              ]
    out_filename = out_base.format('ALLVERSIONS')
    unique_ver = set()
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            for each_version in json.get('version',[]):
                # need extra code because Apple changed format
                if 'release-date' in each_version:
                    date = each_version.get('release-date','')
                    date = datetime.datetime.strptime(date,'%b %d, %Y')
                    date = date.strftime('%Y-%m-%d')
                    version_string = each_version.get('version-string','')
                    release_notes = each_version.get('release-notes','')
                elif 'releaseDate' in each_version:
                    date = each_version.get('releaseDate','')
                    date = datetime.datetime.strptime(date,'%Y-%m-%dT%H:%M:%SZ')
                    date = date.strftime('%Y-%m-%d')
                    version_string = each_version.get('versionString','')
                    release_notes = each_version.get('releaseNotes','')
                obs_u = (json['app_id'],date,version_string)
                if hash(obs_u) not in unique_ver:
                    unique_ver.add(hash(obs_u))
                    obs = (json['app_id'],date,
                           version_string,
#                            release_notes,
                          )
                    csv_writer.writerow(obs)
    return out_filename

In [35]:
df_debug = None
if DEBUG:
    df_debug = read_csv(ALLVERSIONS(RAW_FILE_NEWDATA,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,version_string
0,554951624,2015-01-13,1.1
1,554951624,2013-02-15,1.0
2,554950974,2012-09-18,1.1
3,554943499,2013-08-29,1.3
4,554943499,2013-08-14,1.2
5,554943499,2013-01-21,1.1
6,554943499,2012-09-07,1.0
7,554955624,2013-06-08,1.4
8,554955624,2012-09-08,1.2
9,626127894,2013-04-01,1.0


# NEWREQ

In [36]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def NEWREQ(file_in,out_base,compress=1):
    headers = ['app_id','date','requires','optimized']
    out_filename = out_base.format('NEWREQ')
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file, dialect='excel')
        csv_writer.writerow(headers)
        running = {}
        and_re = re.compile(r"and ")
        com_with_re = re.compile(r"Compatible with ")
        optimize_re = re.compile(r"This app is optimized for ")
        for json in iter_json_gzip(file_in):
            requires = json.get('requirements',None)
            if requires == None:
                continue
            re_split = requires.split(".")
            requires = com_with_re.sub('',re_split[0]).strip()
            requires = and_re.sub('',requires)
            optimized = None
            if len(re_split)>1:
                optimized = re_split[1].strip()
                optimized = optimize_re.sub("",optimized)
            if running.get(json['app_id'],'')!=json.get('requirements',''):
                running[json['app_id']] = json.get('requirements','')
                day = datetime.datetime.fromtimestamp(int(json['timestamp']))\
                        .strftime('%Y-%m-%d')
                obs = (json['app_id'],day,requires,optimized)
                csv_writer.writerow(obs)
    return out_filename

In [37]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWREQ(RAW_FILE_NEWDATA,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,requires,optimized
0,554951624,2015-02-19,Requires iOS 5,1 or later
1,554950974,2015-02-19,Requires iOS 4,2 or later
2,554943499,2015-02-19,Requires iOS 4,3 or later
3,554955624,2015-02-19,Requires iOS 4,3 or later
4,626127894,2015-02-19,Requires iOS 5,0 or later
5,554948858,2015-02-19,Requires iOS 5,0 or later
6,626090799,2015-02-19,Requires iOS 6,0 or later
7,554964569,2015-02-19,Requires iOS 6,0 or later
8,554936718,2015-02-19,Requires iOS 4,3 or later
9,554946247,2015-02-19,Requires iOS 5,1


# NEWNAME

In [38]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def NEWNAME(file_in,out_base,compress=1):
    headers = ['app_id', 'date', 'name'] # list of column names
    out_filename = out_base.format('NEWNAME')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            name = json.get('app_name','')
            if isinstance(name,str):
                name = name.strip()
                if running.get(json['app_id'],'')!=name and len(name)>0:
                    running[json['app_id']] = name
                    obs = [json['app_id'], str(datetime.date.fromtimestamp(int(json.get('timestamp', '')))), name]
                    csv_writer.writerow(obs)
    return out_filename

In [39]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWNAME(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,name
0,281656475,2010-09-06,PAC-MAN
1,281704574,2010-09-06,AIM (Free Edition)
2,281736535,2010-09-06,Enigmo
3,281790044,2010-09-06,WHERE
4,281796108,2010-09-06,Evernote
5,281816692,2010-09-06,Pocket ExpressÂ®
6,281826146,2010-09-06,Salesforce Mobile
7,281861187,2010-09-06,nikoli SUDOKU Vol.01
8,281889893,2010-09-06,Bomberman Touch - The Legend of Mystic Bomb
9,281893011,2010-09-06,Aqua Forest


# NEWINAPP

In [40]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def NEWINAPP(file_in,out_base,compress=1):
    headers = ['app_id', 'date', 'has_inapp'] # list of column names
    out_filename = out_base.format('NEWINAPP')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            name = json.get('addOns',None)
            if name != None and running.get('app_id',0) != name:
                running[json['app_id']] = True
                obs = [json['app_id'], str(datetime.date.fromtimestamp(int(json.get('timestamp', '')))), True]
                csv_writer.writerow(obs)
    return out_filename

In [41]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWINAPP(RAW_FILE_NEWDATA,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,has_inapp
0,626090799,2015-02-19,True
1,554961300,2015-02-19,True
2,554948898,2015-02-19,True
3,554948318,2015-02-19,True
4,554950152,2015-02-19,True
5,554961300,2015-02-19,True
6,554948898,2015-02-19,True
7,625998068,2015-02-19,True
8,625987338,2015-02-19,True
9,626092889,2015-02-19,True


# DAILYSCRAPE

In [42]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def DAILYSCRAPE(file_in,out_base,compress=1):
    headers = ['app_id', 'date'] # list of column names
    out_filename = out_base.format('DAILYSCRAPE')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            day = str(datetime.date.fromtimestamp(json.get('timestamp','')))
            if isinstance(day,str):
                day = day.strip()
                if running.get(json['app_id'],'')!=day and len(day)>0:
                    running[json['app_id']] = day
                    obs = [json['app_id'], day]
                    csv_writer.writerow(obs)
    return out_filename

In [43]:
df_debug = None
if DEBUG:
    df_debug = read_csv(DAILYSCRAPE(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date
0,281656475,2010-09-06
1,281704574,2010-09-06
2,281736535,2010-09-06
3,281790044,2010-09-06
4,281796108,2010-09-06
5,281816692,2010-09-06
6,281826146,2010-09-06
7,281861187,2010-09-06
8,281889893,2010-09-06
9,281893011,2010-09-06


# MONTHLYSCRAPE

In [44]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def MONTHLYSCRAPE(file_in,out_base,compress=1):
    headers = ['app_id', 'year', 'month'] # list of column names
    out_filename = out_base.format('MONTHLYSCRAPE')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            date = (str(datetime.date.fromtimestamp(json.get('timestamp','')).month), str(datetime.date.fromtimestamp(json.get('timestamp','')).year))
            if isinstance(date,tuple):
                if running.get(json['app_id'],'')!=date and len(date)>0:
                    running[json['app_id']] = date
                    obs = [json['app_id'], date[1], date[0]]
                    csv_writer.writerow(obs)
    return out_filename

In [45]:
df_debug = None
if DEBUG:
    df_debug = read_csv(MONTHLYSCRAPE(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,year,month
0,281656475,2010,9
1,281704574,2010,9
2,281736535,2010,9
3,281790044,2010,9
4,281796108,2010,9
5,281816692,2010,9
6,281826146,2010,9
7,281861187,2010,9
8,281889893,2010,9
9,281893011,2010,9


# WEEKLYSCRAPE

In [46]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def WEEKLYSCRAPE(file_in,out_base,compress=1):
    headers = ['app_id', 'year', 'week'] # list of column names
    out_filename = out_base.format('WEEKLYSCRAPE')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            date = (str(datetime.date.fromtimestamp(json.get('timestamp','')).year), str(datetime.date.fromtimestamp(json.get('timestamp','')).isocalendar()[1]))
            if isinstance(date,tuple):
                if running.get(json['app_id'],'')!=date and len(date)>0:
                    running[json['app_id']] = date
                    obs = [json['app_id'], date[0], date[1]]
                    csv_writer.writerow(obs)
    return out_filename

In [47]:
df_debug = None
if DEBUG:
    df_debug = read_csv(WEEKLYSCRAPE(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,year,week
0,281656475,2010,36
1,281704574,2010,36
2,281736535,2010,36
3,281790044,2010,36
4,281796108,2010,36
5,281816692,2010,36
6,281826146,2010,36
7,281861187,2010,36
8,281889893,2010,36
9,281893011,2010,36


# NEWLANGUAGES

In [48]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def NEWLANGUAGES(file_in,out_base,compress=1):
    #assumes no more than 6 langs, script will still work if >6 langs but no headers on csv file for langs 7+
    headers = ['app_id', 'date'] # list of column names
    headers += ["language_{}".format(x) for x in range(1,41)]
    out_filename = out_base.format('NEWLANGUAGES')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            langs = json.get('languages',None)
            if not langs:
                continue
            langs = tuple(sorted(tuple(str(langs).split(','))))
            if isinstance(langs,tuple):
                if running.get(json['app_id'],'')!=langs and langs != ('',) and len(langs) > 0:
                    running[json['app_id']] = langs
                    obs = [json['app_id'], str(datetime.date.fromtimestamp(int(json.get('timestamp', ''))))]
                    [obs.append(str(x).strip()) for x in langs]
                    csv_writer.writerow(obs)
    return out_filename

In [49]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWLANGUAGES(RAW_FILE_NEWDATA,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,language_1,language_2,language_3,language_4,language_5,language_6,language_7,language_8,...,language_31,language_32,language_33,language_34,language_35,language_36,language_37,language_38,language_39,language_40
0,554951624,2015-02-19,Japanese,Korean,English,,,,,,...,,,,,,,,,,
1,554950974,2015-02-19,German,Northern Sami,Spanish,English,,,,,...,,,,,,,,,,
2,554943499,2015-02-19,English,,,,,,,,...,,,,,,,,,,
3,554955624,2015-02-19,English,,,,,,,,...,,,,,,,,,,
4,626127894,2015-02-19,English,,,,,,,,...,,,,,,,,,,
5,554948858,2015-02-19,German,English,,,,,,,...,,,,,,,,,,
6,626090799,2015-02-19,Simplified Chinese,En,,,,,,,...,,,,,,,,,,
7,554964569,2015-02-19,Polish,,,,,,,,...,,,,,,,,,,
8,554936718,2015-02-19,Russian,English,,,,,,,...,,,,,,,,,,
9,554946247,2015-02-19,English,,,,,,,,...,,,,,,,,,,


# NEWSIZE

In [50]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def NEWSIZE(file_in,out_base,compress=1):
    headers = ['app_id', 'date', 'size'] # list of column names
    out_filename = out_base.format('NEWSIZE')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            size = json.get('size','')
            if isinstance(size,int):
                if running.get(json['app_id'],'')!=size and size>0:
                    running[json['app_id']] = size
                    obs = [json['app_id'], str(datetime.date.fromtimestamp(int(json.get('timestamp', '')))), size]
                    csv_writer.writerow(obs)
    return out_filename

In [51]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWSIZE(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,size
0,281656475,2010-09-06,3500000
1,281704574,2010-09-06,5700000
2,281736535,2010-09-06,4700000
3,281790044,2010-09-06,8000000
4,281796108,2010-09-06,7900000
5,281816692,2010-09-06,4200000
6,281826146,2010-09-06,2100000
7,281861187,2010-09-06,2800000
8,281889893,2010-09-06,12600000
9,281893011,2010-09-06,6800000


# NEWARTIST

In [52]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def NEWARTIST(file_in,out_base,compress=1):
    headers = ['app_id','date','artist_name']
    out_filename = out_base.format('NEWARTIST')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            developer = json.get('artistName','')
            if running.get(json['app_id'],'')!=developer:
                running[json['app_id']] = developer
                day = datetime.datetime.\
                fromtimestamp(int(json['timestamp']))\
                        .strftime('%Y-%m-%d').strip()
                obs = (json['app_id'],day,developer)
                csv_writer.writerow(obs)
    return out_filename

In [53]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWARTIST(RAW_FILE_NEWDATA,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,artist_name
0,554951624,2015-02-19,서울 특별시
1,554950974,2015-02-19,Paul Merks
2,554943499,2015-02-19,University of Liverpool
3,554955624,2015-02-19,Lilo Peter
4,626127894,2015-02-19,Yijee
5,554948858,2015-02-19,E.ON Energy Limited
6,626090799,2015-02-19,Triposo Inc
7,554964569,2015-02-19,Marquard Media Polska
8,554936718,2015-02-19,TreeLev LLC
9,554946247,2015-02-19,American Bar Association


# NEWREVIEW 

In [54]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def NEWREVIEW(file_in,out_base,compress=1):
    headers = ['app_id','date','total_reviews']
    out_filename = out_base.format('NEWREVIEW')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            total_reviews = str(json.get('totalNumberOfReviews',''))
            if running.get(json['app_id'],'')!= total_reviews:
                running[json['app_id']] = total_reviews
                day = datetime.datetime.\
                fromtimestamp(int(json['timestamp']))\
                        .strftime('%Y-%m-%d').strip()
                obs = (json['app_id'],day,total_reviews)
                csv_writer.writerow(obs)
    return out_filename

In [55]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWREVIEW(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,total_reviews
0,281656475,2010-09-06,446
1,281704574,2010-09-06,12689
2,281736535,2010-09-06,2862
3,281790044,2010-09-06,1464
4,281796108,2010-09-06,1548
5,281816692,2010-09-06,934
6,281826146,2010-09-06,222
7,281861187,2010-09-06,15
8,281889893,2010-09-06,198
9,281893011,2010-09-06,698


# NEWSELLER 

In [56]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def NEWSELLER(file_in,out_base,compress=1):
    headers = ['app_id','date','seller']
    out_filename = out_base.format('NEWSELLER')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            developer = '{}'.format(json['seller'])
            if running.get(json['app_id'],None)!=developer:
                running[json['app_id']] = developer
                day = datetime.datetime.\
                fromtimestamp(int(json['timestamp']))\
                        .strftime('%Y-%m-%d').strip()
                obs = (json['app_id'],day,developer)
                csv_writer.writerow(obs)
    return out_filename

In [57]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWSELLER(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,seller
0,281656475,2010-09-06,Namco Networks America
1,281704574,2010-09-06,AOL
2,281736535,2010-09-06,Pangea Software Inc.
3,281790044,2010-09-06,WHERE Inc.
4,281796108,2010-09-06,Evernote
5,281816692,2010-09-06,Handmark Inc.
6,281826146,2010-09-06,salesforce.com
7,281861187,2010-09-06,Hudson Entertainment
8,281889893,2010-09-06,Hudson Entertainment
9,281893011,2010-09-06,Hudson Entertainment


# RELEASEDATE

In [58]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def RELEASEDATE(file_in,out_base,compress=1):
    headers = ['app_id', 'release_date'] # list of column names
    out_filename = out_base.format('RELEASEDATE')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            release_date = json.get('releaseDate','')
            if isinstance(release_date,str):
                if (not running.get(json['app_id'],'')) and len(release_date)>0:
                    obs = [json['app_id'], release_date]
                    csv_writer.writerow(obs)
    return out_filename

In [59]:
df_debug = None
if DEBUG:
    df_debug = read_csv(RELEASEDATE(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,release_date
0,282750724,2008-07-15
1,283189656,2008-08-10
2,283277455,2009-08-12
3,283487349,2008-07-09
4,283854851,2008-07-09
5,283916003,2008-07-11
6,284117359,2008-07-09
7,284150420,2008-12-18
8,284151234,2008-07-09
9,284152154,2008-07-09


#SNAPSHOTQUARTERLY

In [16]:
@require(iter_json_gzip,gzip,datetime,re,csv)
def SNAPSHOTQUARTERLY(file_in,out_base,compress=1):
    headers = ['app_id', 'quarter'] # list of column names
    out_filename = out_base.format('SNAPSHOTQUARTERLY')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            date = datetime.date.fromtimestamp(json.get('timestamp',''))
            quarter = str(date.year) + '-Q' + str((int(date.month/4)) + 1)
            if isinstance(quarter,str):
                if running.get(json['app_id'],'')!=quarter and len(quarter)>0:
                    running[json['app_id']] = quarter
                    obs = [json['app_id'], quarter]
                    csv_writer.writerow(obs)
    return out_filename

In [17]:
df_debug = None
if DEBUG:
    df_debug = read_csv(SNAPSHOTQUARTERLY(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,quarter
0,477033794,2013-Q1
1,494296216,2013-Q1
2,465608546,2013-Q1
3,498191169,2013-Q1
4,333208122,2013-Q1
5,447548412,2013-Q1
6,568386333,2013-Q1
7,564550227,2013-Q1
8,555915492,2013-Q1
9,480092632,2013-Q1


# Run them all!

In [60]:
print("starting processing Apple Data Request")

starting processing Apple Data Request


In [61]:
specific_tables = """
NEWNAME
NEWINAPP
DAILYSCRAPE
MONTHLYSCRAPE
WEEKLYSCRAPE
""".split()

all_tables = [x for x in globals() 
              if callable(globals()[x]) 
              and x.isupper()]
all_tables

['NEWREVIEW',
 'NEWSIMILAR10',
 'NEWINAPP',
 'NEWSUPPORTURL',
 'WEEKLYRATINGSCURRENT',
 'NEWCATEGORY',
 'NEWWEBSITE',
 'ALLVERSIONS',
 'DAILYRATINGSCURRENT',
 'MONTHLYRATINGS',
 'NEWREQ',
 'WEEKLYRATINGS',
 'NEWDEVELOPER',
 'NEWSELLER',
 'NEWNAME',
 'DAILYRATINGS',
 'WEEKLYSCRAPE',
 'NEWARTIST',
 'NEWSIMILAR5',
 'RELEASEDATE',
 'NEWPRICE',
 'NEWSIZE',
 'NEWSIMILAR15',
 'NEWLANGUAGES',
 'MONTHLYSCRAPE',
 'DAILYSCRAPE']

In [62]:
def run_function(f,RAW_FILE=RAW_FILE,OUT_BASE=OUT_BASE,COMPRESS_LEVEL=COMPRESS_LEVEL):
    return f(RAW_FILE ,OUT_BASE,COMPRESS_LEVEL)

for i,file in enumerate(map(run_function,[globals()[x] for x in all_tables])):
    print(i+1,file)

1 /home/cgaray/data/out/NEWREVIEW__apple_ios__main__2015_02_26_23_12.csv.gz
2 /home/cgaray/data/out/NEWSIMILAR10__apple_ios__main__2015_02_26_23_12.csv.gz
3 /home/cgaray/data/out/NEWINAPP__apple_ios__main__2015_02_26_23_12.csv.gz
4 /home/cgaray/data/out/NEWSUPPORTURL__apple_ios__main__2015_02_26_23_12.csv.gz
5 /home/cgaray/data/out/WEEKLYRATINGSCURRENT__apple_ios__main__2015_02_26_23_12.csv.gz
6 /home/cgaray/data/out/NEWCATEGORY__apple_ios__main__2015_02_26_23_12.csv.gz
7 /home/cgaray/data/out/NEWWEBSITE__apple_ios__main__2015_02_26_23_12.csv.gz
8 /home/cgaray/data/out/ALLVERSIONS__apple_ios__main__2015_02_26_23_12.csv.gz
9 /home/cgaray/data/out/DAILYRATINGSCURRENT__apple_ios__main__2015_02_26_23_12.csv.gz
10 /home/cgaray/data/out/MONTHLYRATINGS__apple_ios__main__2015_02_26_23_12.csv.gz
11 /home/cgaray/data/out/NEWREQ__apple_ios__main__2015_02_26_23_12.csv.gz
12 /home/cgaray/data/out/WEEKLYRATINGS__apple_ios__main__2015_02_26_23_12.csv.gz
13 /home/cgaray/data/out/NEWDEVELOPER__apple_io

In [63]:
print("DONE")

DONE


In [64]:
len(all_tables)

26