# Initialize Environment

### Compatibility notes

The following code is only compatible with a python 3.4 kernel and the notebook must be opened with ipython 3.0+.

The following libraries must be installed to run this code:

```pip install ujson```

In [2]:
from pdb import set_trace as debug
from pandas.io.parsers import read_csv
from IPython.parallel import Client,require
from collections import Counter

In [3]:
import ujson
import datetime
import re
import gzip
import csv

In [1]:
#--> RAW_FILE_DIR with the directory that contains google_play__main.json.gz
#--> OUT_DIR with the directory of where you want the processed files to go
#--> DEBUG whether to print out created CSV files or not
#--> COMPRESS_LEVEL gzip compression level. warning: severely impacts runtime.
#--> LINE_LIMIT how many lines to iterate through in the raw file. For debugging.
DUMP_DATE = '2015_02_26_23_12'
RAW_FILE_DIR = '/home/cgaray/data'
OUT_DIR = '/home/cgaray/data/out'
DEBUG = 0
COMPRESS_LEVEL = 1
LINE_LIMIT = -1
PARALLEL = 1

In [2]:
# Don't change these!
RAW_FILE = '{}/apple_ios__main.json.gz'.format(RAW_FILE_DIR)
OUT_BASE = OUT_DIR+'/{}__apple_ios__main__'+DUMP_DATE+'.csv.gz'

In [6]:
def iter_json_gzip(filename,LINE_LIMIT=LINE_LIMIT):
    with gzip.open(filename,'rt') as file_iter:
        c = 0
        for line in file_iter:
            c +=1
            if c > LINE_LIMIT and LINE_LIMIT>0:
                break
            yield ujson.loads(line)

In [9]:
# Print a couple observations to inspect daw data.
!zcat "$RAW_FILE"|head -n 1000 | tail -n5

{"ratingCountCurrentVersion":5,"totalNumberOfReviews":11,"copyright":"\u00a9 2011 Fluid Pixel","month":3,"ratingCount":19,"language":"English","artist_id":298530429,"categories":["Games","Board","Dice"],"store_name":"ios","description":"Liars Dice, the classic dice game of Bluff comes to the iPhone, iPod Touch and iPad.  Play against your friends on their iPhones and iPads or if you have more than three devices you can even use the iPad as a central table to control the game of up to 8 players.\n\nYou\u2019ll need to have another person to play against locally on a different device as the game uses Bluetooth to find other games in the area.","hostname":"equity4.mit.edu","kind":"iosSoftware","day":27,"size":10188858,"moreByThisDeveloper":[501002274],"version":[{"release-date":"Feb 07, 2012","version-string":"1.2","release-notes":"Thanks for all the feedback, we have been working hard on a new version that fixes the following issues:\r\n\r\n- Interface Improvements\r\n- New Random Genera

# Initialize Parallel Computing

In [6]:
if PARALLEL:
    ipython_parallel = Client()
    print("{} active computing engines".format(len(ipython_parallel.ids)))

    lbv = ipython_parallel.load_balanced_view()

    map = lambda f,itertable:lbv.map(f,itertable,\
    block =False,\
    ordered =False)

    @require('socket')
    def host(dummy):
        return socket.gethostname()
    
    nodes = list(lbv.map(host,ipython_parallel.ids))
    nodes = [int(x.split('equity')[1].split(".")[0]) for x in nodes]
    nodes = sorted(nodes)

    print(list(Counter(nodes).items()))

# Daily Ratings -- DAILYRATINGS

In [28]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def DAILYRATINGS(file_in,out_base,compress=1):
    headers = ['app_id','date','rating5','rating4','rating3','rating2','rating1']
    out_filename = out_base.format('DAILYRATINGS')
    unique_daily_rating = set()
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            day = datetime.datetime.fromtimestamp(int(json['timestamp']))\
                                .strftime('%Y-%m-%d')
            rating = json.get('ratingCountList',None)
            if rating:
                obs = (json['app_id'],day,tuple(rating),)
                if hash(obs) not in unique_daily_rating:
                    unique_daily_rating.add(hash(obs))
                    csv_writer.writerow([json['app_id'],day]+rating)
    return out_filename

In [29]:
df_debug = None
if DEBUG:
    df_debug = read_csv(DAILYRATINGS(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,rating5,rating4,rating3,rating2,rating1
0,333208122,2013-03-27,233,99,112,76,319
1,447548412,2013-03-27,1,1,1,0,9
2,333378940,2013-03-27,3,0,2,0,4
3,494794787,2013-03-27,3,1,1,0,5
4,469990268,2013-03-27,44,3,13,13,74
5,568421135,2013-03-27,15,5,8,10,29
6,288373421,2013-03-27,55,33,25,17,30
7,346672821,2013-03-27,4,3,7,10,44
8,353415141,2013-03-27,17,9,4,3,14
9,327519114,2013-03-27,35,11,12,4,7


# Artist - app_id -- NEWARTISTID

In [30]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def NEWDEVELOPER(file_in,out_base,compress=1):
    headers = ['app_id','date','artist_id']
    out_filename = out_base.format('NEWARTISTID')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            developer = '{}'.format(json['artist_id'])
            if running.get(json['app_id'],None)!=developer:
                running[json['app_id']] = developer
                day = datetime.datetime.\
                fromtimestamp(int(json['timestamp']))\
                        .strftime('%Y-%m-%d').strip()
                obs = (json['app_id'],day,developer)
                csv_writer.writerow(obs)
    return out_filename

In [31]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWARTISTID(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,artist_id
0,477033794,2013-03-27,477033797
1,494296216,2013-03-27,494296219
2,465608546,2013-03-27,403287102
3,498191169,2013-03-27,498191172
4,333208122,2013-03-27,328282863
5,447548412,2013-03-27,431586531
6,568386333,2013-03-27,568386336
7,564550227,2013-03-27,542898284
8,555915492,2013-03-27,447033331
9,480092632,2013-03-27,414617230


# Category - app_id -- NEWCATEGORY

In [32]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def NEWCATEGORY(file_in,out_base,compress=1):
    headers = ['app_id','date']+ ['category{}'.format(x) \
                                  for x in range(1,5)]
    out_filename = out_base.format('NEWCATEGORY')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            category = json.get('categories',[])[:4]
            joined = ",".join(category)
            if running.get(json['app_id'],'')!=joined:
                running[json['app_id']] = joined
                day = datetime.datetime.\
                fromtimestamp(int(json['timestamp']))\
                        .strftime('%Y-%m-%d').strip()
                csv_writer.writerow([json['app_id'],day]+ category)
    return out_filename

In [33]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWCATEGORY(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,category1,category2,category3,category4
0,477033794,2013-03-27,Music,Utilities,,
1,494296216,2013-03-27,Sports,Education,,
2,465608546,2013-03-27,Utilities,Entertainment,,
3,498191169,2013-03-27,Medical,Health & Fitness,,
4,333208122,2013-03-27,Utilities,Education,,
5,447548412,2013-03-27,Education,Educational,Games,Kids
6,568386333,2013-03-27,Social Networking,,,
7,564550227,2013-03-27,Music,Education,,
8,555915492,2013-03-27,Games,Family,Entertainment,Kids
9,480092632,2013-03-27,Games,Kids,Family,


# Price changes -- NEWPRICE

In [34]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def NEWPRICE(file_in,out_base,compress=1):
    headers = ['app_id','date','price']
    out_filename = out_base.format('NEWPRICE')
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            price = str(json.get('price','')).replace("$","")
            if running.get(json['app_id'],'')!=price:
                running[json['app_id']] = price
                day = datetime.datetime.\
                fromtimestamp(int(json['timestamp']))\
                        .strftime('%Y-%m-%d').strip()
                obs = (json['app_id'],day,price)
                csv_writer.writerow(obs) 
    return out_filename

In [35]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWPRICE(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,price
0,477033794,2013-03-27,0.00
1,494296216,2013-03-27,3.99
2,465608546,2013-03-27,5.99
3,498191169,2013-03-27,0.00
4,333208122,2013-03-27,0.00
5,447548412,2013-03-27,2.99
6,568386333,2013-03-27,0.00
7,564550227,2013-03-27,0.00
8,555915492,2013-03-27,0.99
9,480092632,2013-03-27,0.99


# All Unique Versions  -- ALLVERSIONS

In [36]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def ALLVERSIONS(file_in,out_base,compress=1):
    headers = ['app_id','date','version_string',\
#                'release_notes'\
              ]
    out_filename = out_base.format('DAILYRATINGS')
    unique_ver = set()
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            for each_version in json.get('version',[]):
                # need extra code because Apple changed format
                if 'release-date' in each_version:
                    date = each_version.get('release-date','')
                    date = datetime.datetime.strptime(date,'%b %d, %Y')
                    date = date.strftime('%Y-%m-%d')
                    version_string = each_version.get('version-string','')
                    release_notes = each_version.get('release-notes','')
                elif 'releaseDate' in each_version:
                    date = each_version.get('releaseDate','')
                    date = datetime.datetime.strptime(date,'%Y-%m-%dT%H:%M:%SZ')
                    date = date.strftime('%Y-%m-%d')
                    version_string = each_version.get('versionString','')
                    release_notes = each_version.get('releaseNotes','')
                obs_u = (json['app_id'],date,version_string)
                if hash(obs_u) not in unique_ver:
                    unique_ver.add(hash(obs_u))
                    obs = (json['app_id'],date,
                           version_string,
#                            release_notes,
                          )
                    csv_writer.writerow(obs)
    return out_filename

In [24]:
df_debug = None
if DEBUG:
    df_debug = read_csv(ALLVERSIONS(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,version_string
0,477033794,2011-11-04,1.0
1,494296216,2012-11-06,1.8.3
2,494296216,2012-09-10,1.8.2
3,494296216,2012-02-09,1.7.1
4,494296216,2012-01-26,1.7
5,465608546,2012-06-06,1.1
6,465608546,2011-09-22,1.0
7,498191169,2012-05-23,1.2
8,498191169,2012-03-06,1.1
9,498191169,2012-02-28,1.0


# Requires Version Changes -- NEWREQ

In [25]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def NEWREQ(file_in,out_base,compress=1):
    headers = ['app_id','date','requires','optimized']
    out_filename = out_base.format('NEWREQ')
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file, dialect='excel')
        csv_writer.writerow(headers)
        running = {}
        and_re = re.compile(r"and ")
        com_with_re = re.compile(r"Compatible with ")
        optimize_re = re.compile(r"This app is optimized for ")
        for json in iter_json_gzip(file_in):
            requires = json.get('requirements',None)
            if requires == None:
                continue
            re_split = requires.split(".")
            requires = com_with_re.sub('',re_split[0]).strip()
            requires = and_re.sub('',requires)
            optimized = None
            if len(re_split)>1:
                optimized = re_split[1].strip()
                optimized = optimize_re.sub("",optimized)
            if running.get(json['app_id'],'')!=json.get('requirements',''):
                running[json['app_id']] = json.get('requirements','')
                day = datetime.datetime.fromtimestamp(int(json['timestamp']))\
                        .strftime('%Y-%m-%d')
                obs = (json['app_id'],day,requires,optimized)
                csv_writer.writerow(obs)
    return out_filename

In [26]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWREQ(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,requires,optimized
0,477033794,2013-03-27,"iPhone 3GS, iPhone 4, iPhone 4S, iPhone 5, iPo...",
1,494296216,2013-03-27,"iPhone, iPod touch, iPad",
2,465608546,2013-03-27,iPad,
3,498191169,2013-03-27,"iPhone 3GS, iPhone 4, iPhone 4S, iPhone 5, iPo...",
4,333208122,2013-03-27,"iPhone, iPod touch, iPad",
5,447548412,2013-03-27,"iPhone, iPod touch, iPad",
6,568386333,2013-03-27,"iPhone, iPod touch, iPad",iPhone 5
7,564550227,2013-03-27,"iPhone, iPod touch, iPad",
8,555915492,2013-03-27,"iPhone, iPod touch, iPad",
9,480092632,2013-03-27,"iPhone, iPod touch, iPad",


# Run them all!

In [37]:
programs = """
DAILYRATINGS
NEWDEVELOPER
NEWCATEGORY
NEWPRICE
ALLVERSIONS
NEWREQ
""".split()

def run_function(f,RAW_FILE=RAW_FILE,OUT_BASE=OUT_BASE,COMPRESS_LEVEL=COMPRESS_LEVEL):
    return f(RAW_FILE ,OUT_BASE,COMPRESS_LEVEL)

for file in map(run_function,[globals()[x] for x in programs]):
    print(file)
print("DONE")

/home/cgaray/data/out/DAILYRATINGS__apple_ios__main__2015_02_26_23_12.csv.gz
/home/cgaray/data/out/NEWARTISTID__apple_ios__main__2015_02_26_23_12.csv.gz
/home/cgaray/data/out/NEWCATEGORY__apple_ios__main__2015_02_26_23_12.csv.gz
/home/cgaray/data/out/NEWPRICE__apple_ios__main__2015_02_26_23_12.csv.gz
/home/cgaray/data/out/DAILYRATINGS__apple_ios__main__2015_02_26_23_12.csv.gz
/home/cgaray/data/out/NEWREQ__apple_ios__main__2015_02_26_23_12.csv.gz
DONE
