# Template for all parsing
```python
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def DUMPNAME(file_in,out_base,compress=1):
    headers = [] # list of column names
    out_filename = out_base.format('DUMPNAME')
    
    # Things to remmember as having "seen"
    seen = set()
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            # get some attribute, cast to strting if need be
            # what you're "remmebering" can also be a tuple (not list, they're not hashable)
            SOMEFIELD = json.get('SOMEFIELD','')
            if isinstance(category,str):
                category = category.strip()
                # double check that it's a valid field, non empty and not seen, etc.
                if running.get(json['app_id'],'')!=SOMEFIELD and len(SOMEFIELD)>0:
                    # SOME CODE HERE
                    csv_writer.writerow(obs)
    return out_filename
```

Things to keep in mind:
1. `file_in,out_base,compress` are *always* the only parameters to the function
2. Function name exactly equals what is passed to `format` on the second line.
3. Must use GZIP module with mode 'wt'
4. Must use python 3.4+ CSV writer
5. All functions only make a single pass through the data.
6. Write code that is rigid and assume fields exist and are not null. If they *are missing or null*, then you'll get an exception and have to deal with it then. This is to prevent potential scrape errors from passing unnoticed.
7. Use `debug()` It comes in super handy.
8. Functions always return the complete filename of the parsed file.

# Initialize Environment

### Compatibility notes

The following code is only compatible with a python 3.4+ kernel and the notebook must be opened with ipython 3.0+.

The following libraries must be installed to run this code:

```pip install ujson```

In [1]:
from pdb import set_trace as debug
from pandas.io.parsers import read_csv
from IPython.parallel import Client,require
from collections import Counter

In [2]:
import ujson
import datetime
import re
import gzip
import csv

__`RAW_FILE_DIR`__  The directory that contains google_play__main.json.gz

__`OUT_DIR`__ with the directory of where you want the processed files to go

__`DEBUG`__ whether to print out created CSV files or not

__`COMPRESS_LEVEL`__ gzip compression level. Severely impacts runtime.

__`LINE_LIMIT`__ how many lines to iterate through in the raw file. For debugging.

In [17]:
DUMP_DATE = '2015_02_26_23_12'
RAW_FILE_DIR = '/home/cgaray/data'
OUT_DIR = '/home/cgaray/data/out'
DEBUG = 1
COMPRESS_LEVEL = 1
LINE_LIMIT = 1000 # -1 means run for the entire file
NO_VERIFY_APP_ID_AND_TIMESTAMP = 1
PARALLEL = 0

In [4]:
# Don't change these!
RAW_FILE = '{}/google_play__main_sql.json.gz'.format(RAW_FILE_DIR)
OUT_BASE = OUT_DIR+'/{}__google_play__main_sql__'+DUMP_DATE+'.csv.gz'

In [18]:
def iter_json_gzip(filename,LINE_LIMIT=LINE_LIMIT,NO_VERIFY_APP_ID_AND_TIMESTAMP=NO_VERIFY_APP_ID_AND_TIMESTAMP):
    with gzip.open(filename,'rt') as file_iter:
        for c,line in enumerate(file_iter):
            if c > LINE_LIMIT and LINE_LIMIT>0:
                break
            if isinstance(line,str):
                if len(line)>0:        
                    out = ujson.loads(line)
                    if isinstance(out,dict):
                        if 'app_id' in out and 'timestamp' in out \
                        or NO_VERIFY_APP_ID_AND_TIMESTAMP:
                            yield out

Print a couple observations to inspect raw data.

In [19]:
!zcat "$RAW_FILE" | head -n 100 | tail -n2

{"overallrating":4.5,"dev_website":"http:\/\/greenlifeapps.appspot.com","appname":"Italian Translator\/Dictionary","devname":"GreenLife Apps","permission5":"TITLE: Network communication \n DESCRIPTION: view network state\n DESCRIPTION_FULL: Allows the app to view the state of all networks.\n","appdescription":"Its fun to translate!\nTranslate from\/to Italian and English\nIncludes dictionary\nWord of the day\nSpeak an English sentence and hear Italian translation\n** The app does not stop working after any number of uses. It simply displays a message. This is our full time job and we need to pay bills too.\nNOTE: the app now uses SMS permission so you can translate incoming SMS. The app will not read your SMS for any other purpose, or send SMS to you or anyone.\n** Requires active internet connection\n- Voice recognition for English and Italian\n- Share translation on Facebook\n- Copy to other apps\n- send as SMS or eMail\n- Hear pronunciation\n- Word of the day\n- new feature: transla

# NEWNAME

In [15]:
def NEWNAME(file_in,out_base,compress=1):
    headers = ['app_id','date','name']
    out_filename = out_base.format('NEWNAME')
    running = {}
    re_app_id=re.compile(r'id=(.*?)\&')
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for json in iter_json_gzip(file_in):
            appname = json.get('appname','')
            if isinstance(appname,str) and len(appname)>0:
                appname = appname.strip()
                app_id = re_app_id.findall(json.get('app_url',''))[0]
                if running.get(json['app_id'],'')!=appname and len(appname)>0 and len(app_id)>0:
                    date = datetime.datetime.strptime(json['reportdate'], '%M-%d-%Y').strftime('%Y-%m-%d')
                    running[app_id] = appname
                    obs = (app_id,date,appname)
                    csv_writer.writerow(obs)
    return out_filename

In [20]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWNAME(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

Unnamed: 0,app_id,date,name
0,com.dekryptedit.LinuxCommands,2012-01-07,Linux Commands
1,com.phonegap.nwtsearch,2012-01-07,NWT Search
2,william.need.fuck,2012-01-16,Homegrown Dandelions
3,com.shivay.SBGitaHi4,2012-01-07,Shrimad Bhagavad Gita
4,com.v1_4.B99AFC3E903FC49482901EC3.com,2012-01-07,Como Orar
5,com.Ababneh.face.language,2012-01-07,ØªØ­ÙÙÙ Ø§ÙØ´Ø®ØµÙØ© : ÙÙØ§ÙØ­ Ø§ÙÙØ¬Ù
6,RedBinary.OhmsLaw,2012-01-07,Ohm's Law
7,com.deviac.wikipedia.es,2012-01-07,Wikipedia Espanol
8,com.andromo.dev4168.app38050,2012-01-07,Landscaping Ideas!
9,com.ebooks.ebookreader,2012-01-07,Ebook Reader


# NEWCATEGORY

In [None]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWCATEGORY(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

# NEWVERSION

In [None]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWVERSION(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

# NEWINSTALL

In [None]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWINSTALL(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

# NEWPRICE

In [None]:
df_debug = None
if DEBUG:
    df_debug = read_csv(NEWPRICE(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
df_debug

# Run them all!

In [1]:
programs = """
NEWNAME
NEWINSTALL
NEWVERSION
NEWCATEGORY
NEWPRICE
""".split()

def run_function(f,RAW_FILE=RAW_FILE,OUT_BASE=OUT_BASE,COMPRESS_LEVEL=COMPRESS_LEVEL):
    return f(RAW_FILE ,OUT_BASE,COMPRESS_LEVEL)

for file in map(run_function,[globals()[x] for x in programs]):
    print(file)

In [47]:
print("DONE")

DONE
