# Initialize Environment

### Compatibility notes

The following code is only compatible with a Python 3.4 kernel, and the notebook must be opened with IPython 3.0+.

In [2]:
from pdb import set_trace as debug
from pandas.io.parsers import read_csv
from IPython.parallel import Client,require
from collections import Counter

In [3]:
import ujson
import datetime
import re
import gzip
import csv

In [4]:
# User-tunable settings for the whole notebook.
#--> DUMP_DATE label baked into every output filename (see OUT_BASE)
#--> RAW_FILE_DIR the directory that contains google_play__main.json.gz
#--> OUT_DIR the directory where the processed files should go
#--> DEBUG when truthy, each exporter is run inline and its CSV is read back for display
#--> COMPRESS_LEVEL gzip compression level passed to the exporters. warning: severely impacts runtime.
#--> LINE_LIMIT how many lines to iterate through in the raw file (values <= 0 mean no limit). For debugging.
#--> PARALLEL when truthy, connect to an IPython.parallel cluster and dispatch work through it
#--> PIGZ when truthy, selects the alternative definition of iter_json_gzip
DUMP_DATE = '2015_02_26_23_12'
RAW_FILE_DIR = '/data'
OUT_DIR = '/raid/out'
DEBUG = 0
COMPRESS_LEVEL = 1
LINE_LIMIT = -1
PARALLEL = 1
PIGZ = 0

In [5]:
# Don't change these!
# RAW_FILE: full path of the gzip-compressed raw dump inside RAW_FILE_DIR.
RAW_FILE = '{}/google_play__main.json.gz'.format(RAW_FILE_DIR)
# OUT_BASE: output path template. The single '{}' placeholder is filled with the
# table name by each exporter (out_base.format(...)); DUMP_DATE is already baked in.
OUT_BASE = OUT_DIR+'/{}__google_play__main__'+DUMP_DATE+'.csv.gz'

In [6]:
if PIGZ:
    @require('subprocess','ujson')
    def iter_json_gzip(filename,LINE_LIMIT=LINE_LIMIT):
        """Yield usable JSON records from a gzip file, decompressing with `pigz`.

        Same contract as the pure-Python variant below, but the file is
        decompressed by an external `pigz -dc` process (the `pigz` binary must
        be on PATH). The previous implementation called `gzip.iter_json_gzip`,
        which does not exist in the standard library.

        Parameters
        ----------
        filename : path of a gzip file holding one JSON document per line.
        LINE_LIMIT : maximum number of input lines to read; values <= 0 mean
            "no limit".

        Yields
        ------
        dict : every decoded line that is a JSON object containing both
            'app_id' and 'timestamp'.
        """
        # The notebook's import cells do not import subprocess, so import it here.
        import subprocess
        # Leaving the `with` block (including on early exit) closes the pipe
        # and waits for the child process.
        with subprocess.Popen(['pigz','-dc',filename],
                              stdout=subprocess.PIPE,
                              universal_newlines=True) as proc:
            for c,line in enumerate(proc.stdout):
                if LINE_LIMIT>0 and c >= LINE_LIMIT:
                    break
                line = line.strip()
                if not line:
                    continue
                out = ujson.loads(line)
                if isinstance(out,dict) and 'app_id' in out and 'timestamp' in out:
                    yield out

else:
    @require(gzip,ujson)
    def iter_json_gzip(filename,LINE_LIMIT=LINE_LIMIT):
        """Yield usable JSON records from a gzip file, one JSON document per line.

        Parameters
        ----------
        filename : path of a gzip file holding one JSON document per line.
        LINE_LIMIT : maximum number of input lines to read; values <= 0 mean
            "no limit".

        Yields
        ------
        dict : every decoded line that is a JSON object containing both
            'app_id' and 'timestamp'. Other lines are skipped silently.
        """
        with gzip.open(filename,'rt') as file_iter:
            for c,line in enumerate(file_iter):
                # `>=` so that exactly LINE_LIMIT lines are read (the old
                # `>` comparison let LINE_LIMIT + 1 lines through).
                if LINE_LIMIT>0 and c >= LINE_LIMIT:
                    break
                # Lines read from a file keep their newline, so a plain
                # length check never filters blank lines; strip first.
                line = line.strip()
                if not line:
                    continue
                out = ujson.loads(line)
                if isinstance(out,dict) and 'app_id' in out and 'timestamp' in out:
                    yield out

In [7]:
# Print a couple of observations to inspect the raw data.
# !zcat "$RAW_FILE"|head -n 10 | tail -n2

# Initialize Parallel Processing

In [None]:
if PARALLEL:
    # Connect to the IPython.parallel cluster and report how many engines answer.
    ipython_parallel = Client()
    print("{} active computing engines".format(len(ipython_parallel.ids)))

    # All work is dispatched through one load-balanced view.
    lbv = ipython_parallel.load_balanced_view()

    def map(f,itertable):
        """Cluster replacement for the builtin map: non-blocking, unordered results."""
        return lbv.map(f,itertable,block=False,ordered=False)

    @require('socket')
    def host(dummy):
        """Return the hostname of the engine that runs this call."""
        return socket.gethostname()

    # One hostname lookup is submitted per engine id; tally the answers per host.
    nodes = list(lbv.map(host,ipython_parallel.ids))

    print(dict(Counter(nodes)))

# DAILYRATINGS

In [8]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def DAILYRATINGS(file_in,out_base,compress=1):
    """Export each distinct (app, day, ratings) observation to a gzip CSV.

    Parameters
    ----------
    file_in : path of the raw gzip JSON-lines dump read via iter_json_gzip.
    out_base : filename template with one '{}' placeholder, filled with
        'DAILYRATINGS'.
    compress : gzip compression level for the output file. It is now
        honoured; previously it was ignored and level 9 was always used.

    Returns
    -------
    str : the path of the CSV file that was written.

    Notes
    -----
    The day is derived with datetime.fromtimestamp, i.e. in the local
    timezone of the machine running the export. The rating columns are the
    elements of the record's 'ratings' list in stored order; this assumes
    five entries ordered 5 -> 1 (TODO confirm against the scraper).
    """
    headers = ['app_id','date','rating5','rating4','rating3','rating2','rating1']
    out_filename = out_base.format('DAILYRATINGS')
    # Only the hash of each observation is kept, not the observation itself.
    seen_hashes = set()
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for record in iter_json_gzip(file_in):
            rating = record.get('ratings',None)
            if not rating:
                continue
            day = datetime.datetime.fromtimestamp(int(record['timestamp']))\
                                .strftime('%Y-%m-%d')
            key = hash((record['app_id'],day,tuple(rating)))
            if key not in seen_hashes:
                seen_hashes.add(key)
                csv_writer.writerow([record['app_id'],day]+rating)
    return out_filename

In [9]:
# Debug aid: only when DEBUG is truthy, run DAILYRATINGS and load its CSV back for display.
df_debug = (
    read_csv(DAILYRATINGS(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
    if DEBUG
    else None
)
df_debug

# WEEKLYRATINGS 

In [10]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def WEEKLYRATINGS(file_in,out_base,compress=1):
    """Export the first ratings observation per app and ISO week to a gzip CSV.

    Parameters
    ----------
    file_in : path of the raw gzip JSON-lines dump read via iter_json_gzip.
    out_base : filename template with one '{}' placeholder, filled with
        'WEEKLYRATINGS'.
    compress : gzip compression level for the output file. It is now
        honoured; previously it was ignored and level 9 was always used.

    Returns
    -------
    str : the path of the CSV file that was written.

    Notes
    -----
    'year' and 'week' are the ISO year and ISO week number of the record's
    timestamp, taken in the local timezone. Only the first record with
    ratings seen for each (app_id, year, week) is written.
    """
    headers = ['app_id','year','week','rating5','rating4','rating3','rating2','rating1']
    out_filename = out_base.format('WEEKLYRATINGS')
    # Only the hash of each (app, year, week) key is kept.
    seen_hashes = set()
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for record in iter_json_gzip(file_in):
            rating = record.get('ratings',None)
            if not rating:
                continue
            stamp = datetime.datetime.fromtimestamp(int(record['timestamp']))
            year,week = stamp.isocalendar()[:2]
            key = hash((record['app_id'],year,week))
            if key not in seen_hashes:
                seen_hashes.add(key)
                csv_writer.writerow([record['app_id'],year,week]+rating)
    return out_filename

In [11]:
# Debug aid: only when DEBUG is truthy, run WEEKLYRATINGS and load its CSV back for display.
df_debug = (
    read_csv(WEEKLYRATINGS(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
    if DEBUG
    else None
)
df_debug

# MONTHLYRATINGS

In [12]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def MONTHLYRATINGS(file_in,out_base,compress=1):
    """Export the first ratings observation per app and calendar month to a gzip CSV.

    Parameters
    ----------
    file_in : path of the raw gzip JSON-lines dump read via iter_json_gzip.
    out_base : filename template with one '{}' placeholder, filled with
        'MONTHLYRATINGS'.
    compress : gzip compression level for the output file. It is now
        honoured; previously it was ignored and level 9 was always used.

    Returns
    -------
    str : the path of the CSV file that was written.

    Notes
    -----
    Year and month come from the record's timestamp in the local timezone.
    Only the first record with ratings seen for each (app_id, year, month)
    is written.
    """
    headers = ['app_id','year','month','rating5','rating4','rating3','rating2','rating1']
    out_filename = out_base.format('MONTHLYRATINGS')
    # Only the hash of each (app, year, month) key is kept.
    seen_hashes = set()
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for record in iter_json_gzip(file_in):
            rating = record.get('ratings',None)
            if not rating:
                continue
            stamp = datetime.datetime.fromtimestamp(int(record['timestamp']))
            year,month = stamp.year,stamp.month
            key = hash((record['app_id'],year,month))
            if key not in seen_hashes:
                seen_hashes.add(key)
                csv_writer.writerow([record['app_id'],year,month]+rating)
    return out_filename

In [13]:
# Debug aid: only when DEBUG is truthy, run MONTHLYRATINGS and load its CSV back for display.
df_debug = (
    read_csv(MONTHLYRATINGS(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
    if DEBUG
    else None
)
df_debug

# NEWWEBSITE

In [14]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def NEWWEBSITE(file_in,out_base,compress=1):
    """Export one row each time an app's normalised 'website' host changes.

    Parameters
    ----------
    file_in : path of the raw gzip JSON-lines dump read via iter_json_gzip.
    out_base : filename template with one '{}' placeholder, filled with
        'NEWWEBSITE'.
    compress : gzip compression level for the output file.

    Returns
    -------
    str : the path of the CSV file that was written.

    Notes
    -----
    Normalisation: trim and lower-case, remove 'http://' / 'https://',
    remove a leading 'www.', keep the part before the first '/', then apply
    the `bad` clean-up below. Case and whitespace are now normalised BEFORE
    the scheme is removed; previously 'HTTP://Example.com' was reduced to
    'http:' because the scheme pattern is case-sensitive. The date is the
    record's timestamp in the local timezone.
    """
    headers = ['app_id','date','url']
    out_filename = out_base.format('NEWWEBSITE')
    # Last website written per app_id.
    running = {}
    http = re.compile(r"(http://)|(https://)")
    www = re.compile(r"^www\.")
    # Captures whatever follows the first ".<letters>" label up to the end.
    # NOTE(review): for hosts with several dots the captured text is the rest
    # of the host, so 'sub.example.com' becomes 'sub.example' - confirm that
    # this is intended. Behaviour is unchanged here.
    bad = re.compile(r"\.[a-z]+(.*?)$")
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for record in iter_json_gzip(file_in):
            website = record.get('website','')
            if not isinstance(website,str):
                continue
            website = http.sub('',website.strip().lower())
            website = www.sub('',website)
            website = website.split("/")[0]
            bad_str = bad.findall(website)
            if bad_str:
                website = website.replace(bad_str[0],'')
            if len(website)>0 and running.get(record['app_id'],'')!=website:
                running[record['app_id']] = website
                day = datetime.datetime.fromtimestamp(int(record['timestamp']))\
                        .strftime('%Y-%m-%d')
                csv_writer.writerow((record['app_id'],day,website))
    return out_filename

In [15]:
# Debug aid: only when DEBUG is truthy, run NEWWEBSITE and load its CSV back for display.
df_debug = (
    read_csv(NEWWEBSITE(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
    if DEBUG
    else None
)
df_debug

# NEWPRIVACYDOMAIN

In [16]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def NEWPRIVACYDOMAIN(file_in,out_base,compress=1):
    """Export one row each time an app's normalised 'privacy_policy' host changes.

    Parameters
    ----------
    file_in : path of the raw gzip JSON-lines dump read via iter_json_gzip.
    out_base : filename template with one '{}' placeholder, filled with
        'NEWPRIVACYDOMAIN'.
    compress : gzip compression level for the output file.

    Returns
    -------
    str : the path of the CSV file that was written.

    Notes
    -----
    Normalisation: trim and lower-case, remove 'http://' / 'https://',
    remove a leading 'www.', keep the part before the first '/', then apply
    the `bad` clean-up below. Case and whitespace are now normalised BEFORE
    the scheme is removed; previously 'HTTP://Example.com' was reduced to
    'http:' because the scheme pattern is case-sensitive. The date is the
    record's timestamp in the local timezone.
    """
    headers = ['app_id','date','url']
    out_filename = out_base.format('NEWPRIVACYDOMAIN')
    # Last privacy-policy host written per app_id.
    running = {}
    http = re.compile(r"(http://)|(https://)")
    www = re.compile(r"^www\.")
    # Captures whatever follows the first ".<letters>" label up to the end.
    # NOTE(review): for hosts with several dots the captured text is the rest
    # of the host, so 'sub.example.com' becomes 'sub.example' - confirm that
    # this is intended. Behaviour is unchanged here.
    bad = re.compile(r"\.[a-z]+(.*?)$")
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for record in iter_json_gzip(file_in):
            website = record.get('privacy_policy','')
            if not isinstance(website,str):
                continue
            website = http.sub('',website.strip().lower())
            website = www.sub('',website)
            website = website.split("/")[0]
            bad_str = bad.findall(website)
            if bad_str:
                website = website.replace(bad_str[0],'')
            if len(website)>0 and running.get(record['app_id'],'')!=website:
                running[record['app_id']] = website
                day = datetime.datetime.fromtimestamp(int(record['timestamp']))\
                        .strftime('%Y-%m-%d')
                csv_writer.writerow((record['app_id'],day,website))
    return out_filename

In [17]:
# Debug aid: only when DEBUG is truthy, run NEWPRIVACYDOMAIN and load its CSV back for display.
df_debug = (
    read_csv(NEWPRIVACYDOMAIN(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
    if DEBUG
    else None
)
df_debug

# NEWSIMILAR5

In [18]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def NEWSIMILAR5(file_in,out_base,compress=1,number_similar = 5):
    """Export one row each time an app's first `number_similar` similar apps change.

    Parameters
    ----------
    file_in : path of the raw gzip JSON-lines dump read via iter_json_gzip.
    out_base : filename template with one '{}' placeholder, filled with
        'NEWSIMILAR<number_similar>'.
    compress : gzip compression level for the output file. It is now
        honoured; previously it was ignored and level 9 was always used.
    number_similar : how many leading entries of 'similar_apps' to keep.

    Returns
    -------
    str : the path of the CSV file that was written.

    Notes
    -----
    The kept entries are converted to str and sorted, so a pure reordering
    of the same apps is not reported as a change. Rows carry fewer than
    `number_similar` related columns when the record lists fewer apps.
    """
    headers = ['app_id','date']+ ['related{}'.format(x) \
                                  for x in range(1,number_similar+1)]
    out_filename = out_base.format('NEWSIMILAR{}'.format(number_similar))
    # Last comma-joined similar-app list written per app_id.
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for record in iter_json_gzip(file_in):
            related = record.get('similar_apps',[])
            if not related:
                continue
            # A comprehension instead of map(): the notebook rebinds the
            # global name `map` to a cluster dispatcher when PARALLEL is set.
            related = sorted([str(x) for x in related[:number_similar]])
            joined = ",".join(related)
            if running.get(record['app_id'],'')!=joined:
                running[record['app_id']] = joined
                day = datetime.datetime.fromtimestamp(int(record['timestamp']))\
                        .strftime('%Y-%m-%d')
                csv_writer.writerow([record['app_id'],day]+ related)
    return out_filename

In [19]:
# Debug aid: only when DEBUG is truthy, run NEWSIMILAR5 and load its CSV back for display.
df_debug = (
    read_csv(NEWSIMILAR5(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
    if DEBUG
    else None
)
df_debug

# NEWSIMILAR10

In [20]:
@require(NEWSIMILAR5,'ujson','datetime','re','gzip','csv')
def NEWSIMILAR10(RAW_FILE,OUT_BASE,COMPRESS_LEVEL):
    """Run NEWSIMILAR5 with number_similar fixed to 10 and return its output path."""
    top_n = 10
    out_filename = NEWSIMILAR5(RAW_FILE,OUT_BASE,COMPRESS_LEVEL,top_n)
    return out_filename

In [21]:
# Debug aid: only when DEBUG is truthy, run NEWSIMILAR10 and load its CSV back for display.
df_debug = (
    read_csv(NEWSIMILAR10(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
    if DEBUG
    else None
)
df_debug

# NEWSIMILAR15

In [22]:
@require(NEWSIMILAR5,'ujson','datetime','re','gzip','csv')
def NEWSIMILAR15(RAW_FILE,OUT_BASE,COMPRESS_LEVEL):
    """Run NEWSIMILAR5 with number_similar fixed to 15 and return its output path."""
    top_n = 15
    out_filename = NEWSIMILAR5(RAW_FILE,OUT_BASE,COMPRESS_LEVEL,top_n)
    return out_filename

In [23]:
# Debug aid: only when DEBUG is truthy, run NEWSIMILAR15 and load its CSV back for display.
df_debug = (
    read_csv(NEWSIMILAR15(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
    if DEBUG
    else None
)
df_debug

# NEWDEVELOPER

In [24]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def NEWDEVELOPER(file_in,out_base,compress=1):
    """Write one CSV row each time an app's trimmed 'developer_id' changes.

    Reads records through iter_json_gzip(file_in), writes a gzip CSV at
    out_base.format('NEWDEVELOPER') using compression level `compress`, and
    returns that path. Non-string and empty values are skipped; the date is
    the record's timestamp formatted as YYYY-MM-DD.
    """
    out_filename = out_base.format('NEWDEVELOPER')
    # Last developer id written per app_id.
    last_seen = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(['app_id','date','dev_id'])
        for record in iter_json_gzip(file_in):
            developer = record.get('developer_id','')
            if not isinstance(developer,str):
                continue
            developer = developer.strip()
            app_id = record['app_id']
            if len(developer)==0 or last_seen.get(app_id,'')==developer:
                continue
            last_seen[app_id] = developer
            stamp = datetime.datetime.fromtimestamp(int(record['timestamp']))
            csv_writer.writerow((app_id,stamp.strftime('%Y-%m-%d'),developer))
    return out_filename

In [25]:
# Debug aid: only when DEBUG is truthy, run NEWDEVELOPER and load its CSV back for display.
df_debug = (
    read_csv(NEWDEVELOPER(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
    if DEBUG
    else None
)
df_debug

# NEWCATEGORY

In [26]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def NEWCATEGORY(file_in,out_base,compress=1):
    """Write one CSV row each time an app's trimmed, upper-cased 'category' changes.

    Reads records through iter_json_gzip(file_in), writes a gzip CSV at
    out_base.format('NEWCATEGORY') using compression level `compress`, and
    returns that path. Non-string and empty values are skipped; the date is
    the record's timestamp formatted as YYYY-MM-DD.
    """
    out_filename = out_base.format('NEWCATEGORY')
    # Last category written per app_id.
    last_seen = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(['app_id','date','category'])
        for record in iter_json_gzip(file_in):
            category = record.get('category','')
            if not isinstance(category,str):
                continue
            category = category.strip().upper()
            app_id = record['app_id']
            if len(category)==0 or last_seen.get(app_id,'')==category:
                continue
            last_seen[app_id] = category
            stamp = datetime.datetime.fromtimestamp(int(record['timestamp']))
            csv_writer.writerow((app_id,stamp.strftime('%Y-%m-%d'),category))
    return out_filename

In [27]:
# Debug aid: only when DEBUG is truthy, run NEWCATEGORY and load its CSV back for display.
df_debug = (
    read_csv(NEWCATEGORY(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
    if DEBUG
    else None
)
df_debug

# NEWPRICE

In [28]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def NEWPRICE(file_in,out_base,compress=1):
    """Export one row each time an app's 'price' value changes.

    Parameters
    ----------
    file_in : path of the raw gzip JSON-lines dump read via iter_json_gzip.
    out_base : filename template with one '{}' placeholder, filled with
        'NEWPRICE'.
    compress : gzip compression level for the output file.

    Returns
    -------
    str : the path of the CSV file that was written.

    Notes
    -----
    The price is compared and written as text. '0.00' is rewritten to '0';
    empty values and a lone '.' are skipped. A null price is now skipped too:
    previously str(None) wrote the literal text 'None' as a price.
    """
    headers = ['app_id','date','price']
    out_filename = out_base.format('NEWPRICE')
    # Last price text written per app_id.
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for record in iter_json_gzip(file_in):
            price = record.get('price','')
            if price is None:
                # JSON null: there is no price to record.
                continue
            price = str(price)
            if price =='0.00':
                price = '0'
            if running.get(record['app_id'],'')!=price and price != '.' and len(price)>0:
                running[record['app_id']] = price
                day = datetime.datetime.fromtimestamp(int(record['timestamp']))\
                        .strftime('%Y-%m-%d')
                csv_writer.writerow((record['app_id'],day,price))
    return out_filename

In [29]:
# Debug aid: only when DEBUG is truthy, run NEWPRICE and load its CSV back for display.
df_debug = (
    read_csv(NEWPRICE(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
    if DEBUG
    else None
)
df_debug

# NEWVERSION

In [30]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def NEWVERSION(file_in,out_base,compress=1):
    """Write one CSV row each time an app's normalised 'version' changes.

    Reads records through iter_json_gzip(file_in), writes a gzip CSV at
    out_base.format('NEWVERSION') using compression level `compress`, and
    returns that path. The version is trimmed, 'Varies with device' becomes
    'varies', all whitespace is removed and the result is lower-cased.
    Non-string and empty values are skipped.
    """
    out_filename = out_base.format('NEWVERSION')
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(['app_id','date','version'])
        # Last normalised version written per app_id.
        last_seen = {}
        nowhite = re.compile(r"\s+")
        for record in iter_json_gzip(file_in):
            version = record.get('version','')
            if not isinstance(version,str):
                continue
            version = version.strip()
            if len(version)==0:
                continue
            version = version.replace('Varies with device','varies')
            version = nowhite.sub('',version).lower()
            app_id = record['app_id']
            if len(version)==0 or last_seen.get(app_id,'')==version:
                continue
            last_seen[app_id] = version
            stamp = datetime.datetime.fromtimestamp(int(record['timestamp']))
            csv_writer.writerow((app_id,stamp.strftime('%Y-%m-%d'),version))
    return out_filename

In [31]:
# Debug aid: only when DEBUG is truthy, run NEWVERSION and load its CSV back for display.
df_debug = (
    read_csv(NEWVERSION(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
    if DEBUG
    else None
)
df_debug

# NEWUPDATED

In [32]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def NEWUPDATED(file_in,out_base,compress=1):
    """Export one row each time an app's 'updated' date changes.

    Parameters
    ----------
    file_in : path of the raw gzip JSON-lines dump read via iter_json_gzip.
    out_base : filename template with one '{}' placeholder, filled with
        'NEWUPDATED'.
    compress : gzip compression level for the output file.

    Returns
    -------
    str : the path of the CSV file that was written.

    Notes
    -----
    'updated' is parsed with the format '%B %d, %Y' and written as
    YYYY-MM-DD; '%B' is locale-dependent. Values that do not match the format
    are now skipped instead of aborting the whole export with ValueError,
    in line with how other invalid values are skipped. The row carries no
    scrape date, only the app id and the parsed date.
    """
    headers = ['app_id','updated',]
    out_filename = out_base.format('NEWUPDATED')
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        # Last raw 'updated' text written per app_id.
        running = {}
        for record in iter_json_gzip(file_in):
            updated = record.get('updated','')
            if not isinstance(updated,str):
                continue
            updated = updated.strip()
            if len(updated)==0 or running.get(record['app_id'],'')==updated:
                continue
            try:
                date = datetime.datetime.strptime(updated,'%B %d, %Y')
            except ValueError:
                # Unparseable date text: skip it and do not remember it.
                continue
            running[record['app_id']] = updated
            csv_writer.writerow((record['app_id'],date.strftime('%Y-%m-%d')))
    return out_filename

In [33]:
# Debug aid: only when DEBUG is truthy, run NEWUPDATED and load its CSV back for display.
df_debug = (
    read_csv(NEWUPDATED(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
    if DEBUG
    else None
)
df_debug

# NEWREQ

In [34]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def NEWREQ(file_in,out_base,compress=1):
    """Write one CSV row each time an app's normalised 'requires' text changes.

    Reads records through iter_json_gzip(file_in), writes a gzip CSV at
    out_base.format('NEWREQ') using compression level `compress`, and
    returns that path. The value is lower-cased and trimmed, 'varies with
    device' becomes 'varies', ' and up' becomes '+', and all whitespace is
    removed. Non-string and empty values are skipped.
    """
    out_filename = out_base.format('NEWREQ')
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(['app_id','date','requires'])
        # Last normalised requirement written per app_id.
        last_seen = {}
        nowhite = re.compile(r"\s+")
        and_up = re.compile(r" and up")
        for record in iter_json_gzip(file_in):
            requires = record.get('requires','')
            if not isinstance(requires,str) or len(requires)==0:
                continue
            requires = requires.lower().strip()
            requires = requires.replace('varies with device','varies')
            requires = nowhite.sub('',and_up.sub('+',requires))
            app_id = record['app_id']
            if len(requires)==0 or last_seen.get(app_id,'')==requires:
                continue
            last_seen[app_id] = requires
            stamp = datetime.datetime.fromtimestamp(int(record['timestamp']))
            csv_writer.writerow((app_id,stamp.strftime('%Y-%m-%d'),requires))
    return out_filename

In [35]:
# Debug aid: only when DEBUG is truthy, run NEWREQ and load its CSV back for display.
df_debug = (
    read_csv(NEWREQ(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
    if DEBUG
    else None
)
df_debug

# NEWINSTALL

In [36]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def NEWINSTALL(file_in,out_base,compress=1):
    """Write one CSV row each time an app's whitespace-free 'installs' text changes.

    Reads records through iter_json_gzip(file_in), writes a gzip CSV at
    out_base.format('NEWINSTALL') using compression level `compress`, and
    returns that path. Non-string and empty values are skipped; the date is
    the record's timestamp formatted as YYYY-MM-DD.
    """
    out_filename = out_base.format('NEWINSTALL')
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(['app_id','date','installs'])
        nowhite = re.compile(r"\s+")
        # Last installs text written per app_id.
        last_seen = {}
        for record in iter_json_gzip(file_in):
            installs = record.get('installs','')
            if not isinstance(installs,str):
                continue
            installs = nowhite.sub('',installs)
            app_id = record['app_id']
            if len(installs)==0 or last_seen.get(app_id,'')==installs:
                continue
            last_seen[app_id] = installs
            stamp = datetime.datetime.fromtimestamp(int(record['timestamp']))
            csv_writer.writerow((app_id,stamp.strftime('%Y-%m-%d'),installs))
    return out_filename

In [37]:
# Debug aid: only when DEBUG is truthy, run NEWINSTALL and load its CSV back for display.
df_debug = (
    read_csv(NEWINSTALL(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
    if DEBUG
    else None
)
df_debug

# NEWNAME

In [None]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def NEWNAME(file_in,out_base,compress=1):
    """Write one CSV row each time an app's trimmed 'app_name' changes.

    Reads records through iter_json_gzip(file_in), writes a gzip CSV at
    out_base.format('NEWNAME') using compression level `compress`, and
    returns that path. Non-string and empty names are skipped; the date is
    str() of the datetime.date built from the record's timestamp.
    """
    out_filename = out_base.format('NEWNAME')
    # Last name written per app_id.
    last_seen = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(['app_id', 'date', 'name'])
        for record in iter_json_gzip(file_in):
            name = record.get('app_name','')
            if not isinstance(name,str):
                continue
            name = name.strip()
            app_id = record['app_id']
            # Only non-empty names that differ from the last written one pass.
            if len(name)==0 or last_seen.get(app_id,'')==name:
                continue
            last_seen[app_id] = name
            scraped_on = datetime.date.fromtimestamp(int(record.get('timestamp', '')))
            csv_writer.writerow([app_id, str(scraped_on), name])
    return out_filename

In [None]:
# Debug aid: only when DEBUG is truthy, run NEWNAME and load its CSV back for display.
df_debug = (
    read_csv(NEWNAME(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
    if DEBUG
    else None
)
df_debug

# NEWINAPP

In [None]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def NEWINAPP(file_in,out_base,compress=1):
    """Export one row each time an app's in-app-purchase flag changes.

    Parameters
    ----------
    file_in : path of the raw gzip JSON-lines dump read via iter_json_gzip.
    out_base : filename template with one '{}' placeholder, filled with
        'NEWINAPP'.
    compress : gzip compression level for the output file.

    Returns
    -------
    str : the path of the CSV file that was written.

    Notes
    -----
    'has_inapp' is written as the text 'True' or 'False'. A record without
    'hasInAppPurchases' now counts as False: the old default was the string
    '0', which is truthy, so missing flags were reported as 'True'. String
    values '', '0' and 'false' (any case) are read as False for the same
    reason; this assumes the scraper may emit the flag as text - TODO confirm.
    """
    headers = ['app_id', 'date', 'has_inapp'] # list of column names
    out_filename = out_base.format('NEWINAPP')
    # Last flag text ('True'/'False') written per app_id.
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for record in iter_json_gzip(file_in):
            flag = record.get('hasInAppPurchases',False)
            if isinstance(flag,str):
                # bool() of any non-empty string is True, so decode text explicitly.
                flag = flag.strip().lower() not in ('','0','false')
            has_inapp = str(bool(flag))
            if running.get(record['app_id'],'')!=has_inapp:
                running[record['app_id']] = has_inapp
                scraped_on = datetime.date.fromtimestamp(int(record['timestamp']))
                csv_writer.writerow([record['app_id'], str(scraped_on), has_inapp])
    return out_filename

In [None]:
# Debug aid: only when DEBUG is truthy, run NEWINAPP and load its CSV back for display.
df_debug = (
    read_csv(NEWINAPP(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
    if DEBUG
    else None
)
df_debug

# DAILYSCRAPE

In [None]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def DAILYSCRAPE(file_in,out_base,compress=1):
    """Export one row per app each time the calendar day of its scrape changes.

    Parameters
    ----------
    file_in : path of the raw gzip JSON-lines dump read via iter_json_gzip.
    out_base : filename template with one '{}' placeholder, filled with
        'DAILYSCRAPE'.
    compress : gzip compression level for the output file.

    Returns
    -------
    str : the path of the CSV file that was written.

    Notes
    -----
    Two fixes: the timestamp is cast with int() like every other exporter
    (date.fromtimestamp rejects text), and change detection uses the full
    calendar date instead of the day-of-month, so scrapes on e.g. Jan 5 and
    Feb 5 are no longer merged. The 'day' column itself is unchanged and
    still holds only the day of the month.
    NOTE(review): the column cannot identify month or year - confirm whether
    a full date should be written instead.
    """
    headers = ['app_id', 'day'] # list of column names
    out_filename = out_base.format('DAILYSCRAPE')
    # Last scrape date (datetime.date) written per app_id.
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for record in iter_json_gzip(file_in):
            scraped_on = datetime.date.fromtimestamp(int(record['timestamp']))
            if running.get(record['app_id'])!=scraped_on:
                running[record['app_id']] = scraped_on
                csv_writer.writerow([record['app_id'], str(scraped_on.day)])
    return out_filename

In [None]:
# Debug aid: only when DEBUG is truthy, run DAILYSCRAPE and load its CSV back for display.
df_debug = (
    read_csv(DAILYSCRAPE(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
    if DEBUG
    else None
)
df_debug

# MONTHLYSCRAPE

In [None]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def MONTHLYSCRAPE(file_in,out_base,compress=1):
    """Export one row per app each time the calendar month of its scrape changes.

    Parameters
    ----------
    file_in : path of the raw gzip JSON-lines dump read via iter_json_gzip.
    out_base : filename template with one '{}' placeholder, filled with
        'MONTHLYSCRAPE'.
    compress : gzip compression level for the output file.

    Returns
    -------
    str : the path of the CSV file that was written.

    Notes
    -----
    Two fixes: the timestamp is cast with int() like every other exporter
    (date.fromtimestamp rejects text), and change detection uses
    (year, month) instead of the month number alone, so the same month in
    different years is no longer merged. The 'month' column itself is
    unchanged and still holds only the month number.
    NOTE(review): the column cannot identify the year - confirm whether a
    year column should be added.
    """
    headers = ['app_id', 'month'] # list of column names
    out_filename = out_base.format('MONTHLYSCRAPE')
    # Last (year, month) written per app_id.
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for record in iter_json_gzip(file_in):
            scraped_on = datetime.date.fromtimestamp(int(record['timestamp']))
            period = (scraped_on.year, scraped_on.month)
            if running.get(record['app_id'])!=period:
                running[record['app_id']] = period
                csv_writer.writerow([record['app_id'], str(scraped_on.month)])
    return out_filename

In [None]:
# Debug aid: only when DEBUG is truthy, run MONTHLYSCRAPE and load its CSV back for display.
df_debug = (
    read_csv(MONTHLYSCRAPE(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
    if DEBUG
    else None
)
df_debug

# WEEKLYSCRAPE

In [None]:
@require(iter_json_gzip,'ujson','datetime','re','gzip','csv')
def WEEKLYSCRAPE(file_in,out_base,compress=1):
    """Export one row per app each time the ISO week of its scrape changes.

    Parameters
    ----------
    file_in : path of the raw gzip JSON-lines dump read via iter_json_gzip.
    out_base : filename template with one '{}' placeholder, filled with
        'WEEKLYSCRAPE'.
    compress : gzip compression level for the output file.

    Returns
    -------
    str : the path of the CSV file that was written.

    Notes
    -----
    Fixes:
    - The old code tested isinstance(date, str) on a tuple, which is always
      False, so only the header row was ever written.
    - The year is now the ISO year that belongs to the ISO week (as in
      WEEKLYRATINGS); the calendar year disagrees with it around New Year.
    - The timestamp is cast with int() like every other exporter.
    """
    headers = ['app_id', 'year', 'week'] # list of column names
    out_filename = out_base.format('WEEKLYSCRAPE')
    # Last (ISO year, ISO week) written per app_id.
    running = {}
    with gzip.open(out_filename,'wt',compress) as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        for record in iter_json_gzip(file_in):
            scraped_on = datetime.date.fromtimestamp(int(record['timestamp']))
            year,week = scraped_on.isocalendar()[:2]
            if running.get(record['app_id'])!=(year,week):
                running[record['app_id']] = (year,week)
                csv_writer.writerow([record['app_id'], str(year), str(week)])
    return out_filename

In [None]:
# Debug aid: only when DEBUG is truthy, run WEEKLYSCRAPE and load its CSV back for display.
df_debug = (
    read_csv(WEEKLYSCRAPE(RAW_FILE,OUT_BASE,COMPRESS_LEVEL),compression='gzip',header=0)
    if DEBUG
    else None
)
df_debug

# Run them all!

In [None]:
# Names of the exporters to run, one per line.
programs = """
NEWNAME
NEWINAPP
DAILYSCRAPE
MONTHLYSCRAPE
WEEKLYSCRAPE
""".split()

def run_function(f,RAW_FILE=RAW_FILE,OUT_BASE=OUT_BASE,COMPRESS_LEVEL=COMPRESS_LEVEL):
    """Call exporter `f` with the notebook's paths and compression level.

    The settings are bound as default arguments when this function is
    defined, so the call does not look them up as globals at run time.
    Returns whatever `f` returns.
    """
    out_filename = f(RAW_FILE,OUT_BASE,COMPRESS_LEVEL)
    return out_filename

# Resolve each name to its exporter, run them through `map`, and print every
# returned output path.
exporters = [globals()[x] for x in programs]
for file in map(run_function,exporters):
    print(file)

In [None]:
# Marker printed once every cell above has been executed.
print("DONE")