In [None]:
# Shell escape: cancel the SLURM jobs of user `aschade` (`-u` selects by user).
# NOTE(review): as the first cell, a "Restart & Run All" will also cancel any
# job of this user that is still running — confirm that this is intended.
!scancel -u aschade

In [None]:
#%%
from os import makedirs
from os.path import exists
from shutil import rmtree # removes folder with everything in it
from pprint import pprint
import warnings
warnings.filterwarnings("ignore")    # to suppress the dask metadata warnings (meta arg seems to be broken, not happy with anything i give)

from tqdm import tqdm
from time import time
import logging as log
import sys
import csv

import matplotlib.pyplot as plt
import matplotlib.dates as matdates
import numpy as np
import pandas as pd
pd.options.display.max_rows = 150
pd.options.display.max_columns = 50
import seaborn as sns
sns.set_style("whitegrid")

import dask
import dask.dataframe as dd
from dask.distributed import Client
from dask_jobqueue import SLURMCluster

########################################################################################################

def floor(test, limit):
    """Clamp ``test`` from below.

    Returns ``limit`` when ``test`` compares smaller than ``limit``;
    otherwise ``test`` itself is returned (also when both compare equal).
    Works for any pair of values supporting ``<``.
    """
    if test < limit:
        return limit
    return test

def ceiling(test, limit):
    """Clamp ``test`` from above.

    Returns ``limit`` when ``test`` compares greater than ``limit``;
    otherwise ``test`` itself is returned (also when both compare equal).
    Works for any pair of values supporting ``>``.
    """
    if test > limit:
        return limit
    return test

In [None]:
# Address the dask scheduler is launched on, and the address that is
# substituted into the dashboard link before printing it. Keeping both in one
# place avoids the link rewrite silently breaking when only one literal is
# edited.
SCHEDULER_HOST = '10.30.50.163'
DASHBOARD_HOST = '10.60.110.163'

# One SLURM job description: 40 cores / 500 GB on a single haswell node.
cluster = SLURMCluster(
    cores=40,
    memory='500GB',
    local_directory='~/scratch',
    # Extra #SBATCH directives added to every submitted job script.
    job_extra=[
        '--time=10:00:00',
        '--partition=haswell',
        '--nodes=1',
        '--job-name=dask',
        '--output=dask.out',
        '--error=dask.error',
        '--mail-user=aaron.schade@upf.edu',
        '--mail-type=NONE',
    ],
    # NOTE(review): the original author was unsure whether this counts workers
    # per job or per node — check the dask-jobqueue documentation.
    n_workers=1,
    # Author's notes on the available network interfaces:
    #   workers connect (no diagnostics): em1, em2, ib0
    #   no workers: lo, em1.851, idrac; em3 & em4 have no IPv4
    interface='ib0',
    scheduler_options={
        # An interface AND a host address cannot both be specified here,
        # so only the host is given.
        'host': SCHEDULER_HOST,
    },
)
cluster.scale(jobs=1)


scheduler = Client(cluster)
print(scheduler)

# Rewrite the dashboard link to the other address of the same machine.
# (Use scheduler.dashboard_link unmodified if the scheduler address itself is
# reachable from the browser.)
dashboardLink = scheduler.dashboard_link.replace(SCHEDULER_HOST, DASHBOARD_HOST)
print(dashboardLink)
for dashboardPage in ('workers', 'graph'):
    print(dashboardLink.replace('status', dashboardPage))

In [None]:
# Shell escape: list the SLURM queue entries of user `aschade`, e.g. to check
# that the job requested by the cluster cell has been submitted.
!squeue -u aschade

In [None]:
# --- paywall dates ---------------------------------------------------------
# Keep only the two columns needed to build the {site: 'YYYY-MM-DD'} mapping.
paywallsDF = pd.read_excel('paywalls.xlsx')[['paywall_date', 'url']]

# Only the first 18 rows of the sheet are used.
# NOTE(review): presumably the remaining rows are not usable paywall entries — confirm.
paywalls = {
    url: str(paywallDate.date())
    for url, paywallDate in zip(
        paywallsDF['url'].iloc[:18],
        paywallsDF['paywall_date'].iloc[:18],
    )
}

# The NYT gets two separate entries, distinguished by a '-<n>' suffix on the key.
paywalls.update({
    'nytimes.com-1': '2012-04-01',
    'nytimes.com-2': '2017-12-01',
})

print('paywalls:')
pprint(paywalls, indent=4)

# --- list of news sites ----------------------------------------------------
# One site per line; surrounding whitespace (incl. the newline) is removed.
with open('newsSitesList.txt', 'r') as f:
    newsSitesList = [line.strip() for line in f]

# Make sure every paywall key is also part of the news site list.
newsSitesList += [site for site in paywalls if site not in newsSitesList]

print('\nnews sites:')
pprint(newsSitesList[:10], indent=4)

# --- columns ---------------------------------------------------------------
# Per-household attributes; they are copied onto every row of a visitor's
# table rather than aggregated.
individualCharacteristics = [
    'hoh_most_education',
    'census_region',
    'household_size',
    'hoh_oldest_age',
    'household_income',
    'children',
    'racial_background',
    'connection_speed',
    'country_of_origin',
    'zip_code',
]

# Columns read from the parquet files: the browsing columns followed by the
# individual characteristics.
# (Currently not read: 'ref_domain_name', 'event_time'.)
colsOfInterest = [
    'domain_name',
    'event_date',
    'pages_viewed',
    'duration',
] + individualCharacteristics

In [None]:
# Newspapers to process in this run; (un)comment entries to change the selection.
# Defined once before the loop because it does not depend on the iteration.
top10 = [
#     'latimes.com',
#     'nypost.com',
#     'bostonglobe.com',
#     'chicagotribune.com',
#     'startribune.com',
#     'newsday.com',
    'washingtonpost.com',
    'inquirer.com',
    'tampabay.com',
    'sfchronicle.com',
]

# For every selected newspaper: build, per visitor of that site, a table with one
# row per (week, visited domain) around the paywall date, and write all visitors'
# tables to one CSV.
for newspaperSite, paywallDate in paywalls.items():
    if newspaperSite not in top10:
        continue

    paywallTimestamp = pd.Timestamp(paywallDate)
    year = paywallTimestamp.year

    ######################################################################################################

    print('\n' + f' {newspaperSite} '.center(80, '#'))

    # Approximate number of months of the paywall year that lie before / after
    # the paywall date; skip sites with less than one month on either side.
    prePWmonths = paywallTimestamp.dayofyear / 30.5
    postPWmonths = 12 - prePWmonths
    if prePWmonths < 1 or postPWmonths < 1:
        continue

    # Ideal window: 90 days either side of the paywall, clamped to the paywall's
    # calendar year because only that year's parquet folder is read below.
    idealRangeStart = paywallTimestamp - pd.Timedelta('90 days')
    idealRangeEnd = paywallTimestamp + pd.Timedelta('90 days')
    yearStart = pd.Timestamp(f'{year}-01-01')
    yearEnd = pd.Timestamp(f'{year}-12-31')

    rangeStart = floor(idealRangeStart, yearStart)
    rangeEnd = ceiling(idealRangeEnd, yearEnd)
    # Weekly grid; uses the same 'W' frequency as the resample further down so
    # that the merge keys line up.
    datesOfInterest = pd.date_range(rangeStart, rangeEnd, freq='W')

    print(f'paywall date: {paywallDate}')
    print(list(datesOfInterest.month.unique()))

    ######################################################################################################

    ddf = dd.read_parquet(
        f'../comscore/parquet/{year}',
        index='machine_id',
        columns=colsOfInterest,
        # Author's note: the engine has to match the one used to write the
        # parquet files, otherwise fast index lookups do not work; the snappy
        # compression additionally needs pyarrow or python-snappy installed.
        engine='fastparquet',
        )
    ddf = ddf[ddf['event_date'].between(rangeStart, rangeEnd)]

    # Paywall keys such as 'nytimes.com-1' carry a '-<n>' suffix; remove exactly
    # that suffix to obtain the domain. (str.strip('-12') removes *characters*
    # from both ends and would also eat a leading or trailing '1', '2' or '-'
    # that belongs to the domain name.)
    keyBase, keySep, keySuffix = newspaperSite.rpartition('-')
    domainName = keyBase if keySep and keySuffix.isdigit() else newspaperSite

    # machine_ids with at least one visit to the newspaper inside the window
    visitors = list(
        ddf.loc[ddf['domain_name'] == domainName]
        .index
        .unique()
        .compute()
        )

    if not visitors:
        # Nothing to concatenate or write for this site.
        print('no visitors found in the date range, skipping')
        continue

    resultsList = []

    for visitor in tqdm(visitors, desc='visitors done: '):

        # all events of this visitor, indexed by date for the weekly resampling
        temp = ddf.loc[visitor].set_index('event_date')
        temp['numberVisits'] = 1           # fill all rows with 1 -> when aggregated, becomes counter

        # every website this visitor went to, materialised once as a plain list
        allSites = temp['domain_name'].unique().compute().tolist()

        ######### create big ddf with all combinations of dates and sites for this user #########
        allCombinations = pd.merge(
            pd.DataFrame({'event_date': datesOfInterest}),
            pd.DataFrame({'domain_name': allSites}),
            how='cross',
        )
        thisVisitorDF = dd.from_pandas(allCombinations, chunksize=100_000)

        ######### fill in date stuff, default variables, demographics #########
        thisVisitorDF = thisVisitorDF.assign(
            machine_id =           visitor,
            day_of_month =         thisVisitorDF.event_date.dt.day,
            week_of_month =        (thisVisitorDF.event_date.dt.day - 1)//7 + 1,
            month =                thisVisitorDF.event_date.dt.month,
            year =                 thisVisitorDF.event_date.dt.year,

            news_site_dummy =      thisVisitorDF.domain_name.isin(newsSitesList)
        )

        # The individual characteristics are taken from the visitor's first row
        # and copied onto every row of the visitor's table.
        demographics = temp[individualCharacteristics].head(1).squeeze().to_dict()
        for characteristic in individualCharacteristics:
            thisVisitorDF[characteristic] = demographics[characteristic]

        ######### aggregate by day and website, do not keep individual characteristics #########
        # 'event_date' is excluded as well in case it is still a column (it is
        # normally the index at this point).
        notIndividualCharacteristics = [
            col for col in temp.columns
            if col not in individualCharacteristics and col != 'event_date'
        ]

        # Weekly sums per website. Author's note: passing meta= to apply() was
        # tried and not accepted by dask, hence the globally silenced warnings.
        agged = (
            temp[notIndividualCharacteristics]
            .groupby('domain_name')
            .apply(lambda group: group.resample('W').sum())
            .reset_index(drop=False)
            )

        ######### merge and fill with zeroes #########
        # Weeks/sites without any event have no match in `agged` and get zeroes.
        thisVisitorDF = thisVisitorDF.merge(agged, how='left', on=['domain_name', 'event_date'])
        thisVisitorDF[['pages_viewed', 'duration', 'numberVisits']] = thisVisitorDF[['pages_viewed', 'duration', 'numberVisits']].fillna(0)

        resultsList.append(thisVisitorDF)

    # Create the output folder first so that writing the CSV cannot fail on a
    # missing directory.
    outputFolder = f'outputs/longTable/{newspaperSite}'
    makedirs(outputFolder, exist_ok=True)

    results = dd.concat(resultsList)
    results.to_csv(f'{outputFolder}/long_table.csv', single_file=True)
    


In [18]:
# Combine the per-individual CSV files of one newspaper into a single long table.
# NOTE(review): the `individuals` folder is not written by the loop visible in
# this notebook — presumably produced by another version of it; confirm.
newspaperSite = 'nytimes.com-1'
subfolder = '/'.join(['outputs/longTable', newspaperSite, 'individuals'])

# Undecodable bytes are replaced instead of raising a decoding error.
ddf = dd.read_csv(f'{subfolder}/*.csv', encoding_errors='replace')

# Kept as the last expression so the cell output shows the path(s) written.
ddf.to_csv(f'outputs/longTable/{newspaperSite}/long_table.csv', single_file=True)

['/gpfs42/robbyfs/scratch/lab_rdurante/aschade/paywalls/outputs/longTable/nytimes.com-1/long_table.csv']

In [19]:
########################################## testing ####################################
# Sanity check: load the finished long table for the 'nytimes.com-2' paywall
# and look at its first rows.
# NOTE(review): the saved output shows columns such as `date`, `number_visits`
# and `time_spent_on_site`, whereas the loop in this notebook produces
# `event_date`, `numberVisits` and `duration` — the file was presumably written
# by an earlier version of the code; confirm.

ddf = dd.read_csv('outputs/longTable/nytimes.com-2/long_table.csv')
ddf.head()

Unnamed: 0,machine_id,date,day_of_month,week_of_month,month,year,domain_name,news_site_dummy,number_visits,number_pages_viewed,time_spent_on_site,hoh_most_education,census_region,household_size,hoh_oldest_age,household_income,children,racial_background,connection_speed,country_of_origin,zip_code
0,99534294,2017-09-10,10,2,9,2017,envybox.io,0,1,1,1,99,3,1,11,16,0,1,1,0,77084
1,99534294,2017-10-15,15,3,10,2017,dom.co.il,0,1,1,1,99,3,1,11,16,0,1,1,0,77084
2,99534294,2017-10-15,15,3,10,2017,graniru.org,0,0,0,0,99,3,1,11,16,0,1,1,0,77084
3,99534294,2017-10-15,15,3,10,2017,yandex.net,0,1,1,1,99,3,1,11,16,0,1,1,0,77084
4,99534294,2017-10-15,15,3,10,2017,likebtn.com,0,1,1,1,99,3,1,11,16,0,1,1,0,77084


In [13]:
# Summary of the columns and dtypes of `ddf`.
# NOTE(review): the execution counts show this cell was run before the cells
# shown above it, so `ddf` may have pointed at a different table at the time —
# re-run top to bottom to confirm.
ddf.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 21 entries, machine_id to zip_code
dtypes: object(2), int64(19)

In [14]:
# Total number of rows in `ddf`.
len(ddf)

49370796

In [15]:
# Number of distinct machine_ids in `ddf`.
ddf.machine_id.nunique().compute()

8379

In [22]:
# Frequency of each `number_pages_viewed` value; in the saved output the large
# majority of rows are 0.
ddf.number_pages_viewed.value_counts().compute()

0        44302037
1         1825574
2          855840
3          415894
4          286410
           ...   
3049            1
3045            1
3043            1
3042            1
35466           1
Name: number_pages_viewed, Length: 3596, dtype: int64