In [1]:
import os
import re
import time
from datetime import datetime, timedelta
from io import StringIO

import boto3
import pandas as pd
import requests
from boto.s3.key import Key
from IPython.display import display



In [2]:
# Comma-separated metric names. get_metrics sends this string as the "metrics"
# query parameter, and get_df splits it to find the columns it fills with 0 and casts to int.
_metrics = "page_avg_scroll,page_avg_time,page_scroll_starts,page_total_time,page_uniques,page_views,page_views_loyal,page_views_quality"

def get_response(url, querystring, only_response=False, timeout=60):
    """Send a GET request and return the raw response or its decoded JSON body.

    Args:
        url: Endpoint to call.
        querystring: Mapping sent as the URL query parameters.
        only_response: When True, return the ``requests`` response object
            untouched (no status check, no JSON decoding).
        timeout: Seconds to wait for the server, forwarded to ``requests``.
            Without a timeout ``requests`` waits forever, so a stalled
            connection would hang the whole notebook.

    Returns:
        The response object when ``only_response`` is True. Otherwise the
        decoded JSON body for a successful response, or
        ``{'status': <http status code>}`` for an unsuccessful one.

    Raises:
        requests.exceptions.Timeout: The server did not answer within ``timeout``.
    """
    headers = {
        'cache-control': "no-cache"
    }
    response = requests.get(
        url, headers=headers, params=querystring, timeout=timeout
    )
    if only_response:
        return response

    if response.ok:
        return response.json()
    return {'status': response.status_code}

def get_metrics(apikey, host, day):
    """Submit a single-day page query and return what get_response gives back.

    Args:
        apikey: API key sent as the ``apikey`` query parameter.
        host: Site sent as the ``host`` query parameter.
        day: Date string used for both ``start`` and ``end``, so the query
            covers exactly one day.

    Returns:
        The value returned by ``get_response`` for the submit endpoint.
    """
    # NOTE(review): the request goes over plain http with the API key in the
    # query string; consider https once the endpoint is confirmed to support it.
    params = dict(
        host=host,
        apikey=apikey,
        start=day,
        end=day,
        limit=100000,
        tz="America/Lima",
        dimensions='path,section,tz_day,tz_hour,tz_minute',
        metrics=_metrics,
        referrer_type='social,search,direct,internal,links',
    )
    return get_response("http://api.chartbeat.com/query/v2/submit/page/", params)

def get_status(query_id, apikey, host):
    """Call the status endpoint for a previously submitted query.

    Args:
        query_id: Identifier of the query, sent as ``query_id``.
        apikey: API key sent as ``apikey``.
        host: Site sent as ``host``.

    Returns:
        The value returned by ``get_response`` for the status endpoint.
    """
    endpoint = "http://api.chartbeat.com/query/v2/status/"
    params = dict(query_id=query_id, apikey=apikey, host=host)
    return get_response(endpoint, params)

def get_report(query_id, apikey, host):
    """Call the fetch endpoint for a query and return the raw HTTP response.

    Args:
        query_id: Identifier of the query, sent as ``query_id``.
        apikey: API key sent as ``apikey``.
        host: Site sent as ``host``.

    Returns:
        The unprocessed response object, because ``get_response`` is called
        with ``only_response=True``.
    """
    endpoint = 'http://api.chartbeat.com/query/v2/fetch/'
    params = {}
    params["query_id"] = query_id
    params["apikey"] = apikey
    params["host"] = host
    return get_response(endpoint, params, only_response=True)

In [3]:
# Site whose traffic is exported.
host = "gestion.pe"
# The API key is read from the CHARTBEAT_APIKEY environment variable when it is set.
# NOTE(review): the literal fallback is a credential committed to the notebook;
# rotate it and drop the fallback once the environment variable is in place.
apikey = os.environ.get("CHARTBEAT_APIKEY", "094d70f05bf0f49f7ba8126e7ee62c76")
# day = "2019-02-28"

In [4]:
# Article paths end in "-<two or more digits>"; compiled once instead of on every call.
_ARTICLE_PATH_RE = re.compile(r'-\d{2,}$')


def _is_article_path(path):
    """Return True when ``path`` matches the article pattern used by get_df."""
    if not isinstance(path, str):
        # A missing path is read from the CSV as NaN (a float); treat it as "not an article"
        # instead of letting the regex raise TypeError.
        return False
    # Same rule as before: numeric suffix AND a last path segment longer than 12 characters.
    return bool(_ARTICLE_PATH_RE.search(path)) and len(path.split('/')[-1]) > 12


def get_df(query_id, apikey, host, day):
    """Fetch a report and return it as a DataFrame restricted to article rows.

    Args:
        query_id: Identifier of the query whose report is fetched.
        apikey: API key forwarded to ``get_report``.
        host: Site forwarded to ``get_report``.
        day: Value stored in the ``dia`` column of every returned row.

    Returns:
        pandas.DataFrame parsed from the report text. When a ``path`` column
        is present, only rows whose path looks like an article are kept and
        the columns ``article_id`` (text after the last ``-`` of the path) and
        ``dia`` are added. Every metric column listed in ``_metrics`` that is
        present has its missing values replaced by 0 and is cast to int.
    """
    # Fetch the raw report response.
    report = get_report(query_id, apikey, host)

    df = pd.read_csv(StringIO(report.text), sep=',', low_memory=False)

    cols = list(df.columns)
    if 'path' in cols:
        # astype(bool) keeps the mask boolean even when the frame is empty.
        is_article = df['path'].map(_is_article_path).astype(bool)
        # Work on an explicit copy so the assignments below do not write into a slice
        # of the parsed frame (SettingWithCopyWarning / silently lost writes).
        df = df.loc[is_article].copy()
        df['article_id'] = df['path'].map(lambda path: path.split('-')[-1])
        df['dia'] = day

    for col in _metrics.split(','):
        if col in cols:
            # Assign the result back: a chained `df[col].fillna(..., inplace=True)` does not
            # modify `df` under pandas Copy-on-Write, which makes the int cast fail on NaN.
            df[col] = df[col].fillna(0).astype('int')

    print(list(df.columns))
    return df

In [5]:
# Inclusive date range to export; one CSV file is written per day.
start = datetime.strptime("2019-03-01", "%Y-%m-%d").date()
end = datetime.strptime("2019-03-31", "%Y-%m-%d").date()
set_parameter_csv = {
    'sep': ',',
    'low_memory': False
}

# The S3 upload section used to sit after an unconditional `continue`, so it could never run.
# It is now guarded by this switch; False keeps the previous effective behaviour (local CSV only).
upload_to_s3 = False
bucket = 'charbeat-trafic'


def _fetch_day(query_id, apikey, host, day):
    """Download one day's report, retrying once after a longer wait.

    Returns the DataFrame produced by ``get_df``, or None when both attempts
    fail or return fewer than two rows. Failures are printed, not raised.
    """
    attempts = (
        (5, 'CONNECCION RAPIDA', '/'),
        (60, 'CONNECCION PROLONGADA', '='),
    )
    for wait_seconds, success_label, failure_mark in attempts:
        # Wait before fetching; presumably this lets the submitted query finish.
        time.sleep(wait_seconds)
        try:
            day_df = get_df(query_id, apikey, host, day)
            if day_df.shape[0] < 2:
                raise Exception("Small data, possible error 503")
        except Exception as e:
            print(failure_mark*20, str(e))
            continue
        print(success_label)
        return day_df
    return None


for offset in range((end - start).days + 1):
    current_date = start + timedelta(days=offset)
    month = str(current_date.month).zfill(2)
    year = current_date.year
    dayy = str(current_date)

    query_response = get_metrics(apikey, host, dayy)
    query_id = query_response.get('query_id')
    if not query_id:
        # The submit call failed (get_response returns {'status': ...} in that case);
        # there is nothing to fetch for this day.
        print('='*20, 'no query_id for {}: {}'.format(dayy, query_response))
        continue

    df = _fetch_day(query_id, apikey, host, dayy)
    if df is None:
        # Skip the day instead of saving an undefined frame or the previous day's
        # data under this day's file name.
        continue

    # Save locally.
    df.to_csv(dayy + '.csv', index=False)
    if not upload_to_s3:
        continue

    # Upload to AWS S3.
    path_aws = 'gestion.pe/{}/{}/{}.csv'.format(year, month, dayy)
    csv_buffer = StringIO()

    df.to_csv(csv_buffer, index=False)
    s3_resource = boto3.resource('s3')
    s3_resource.Object(bucket, path_aws).put(Body=csv_buffer.getvalue())
    print(dayy)

    # Make the uploaded object publicly readable.
    # NOTE(review): confirm that world-readable traffic data is intended.
    object_acl = s3_resource.ObjectAcl(bucket, path_aws)
    response = object_acl.put(ACL='public-read')
    print('response: ',response)

    # Read the file back through its public URL to check the upload.
    url_up = 'https://s3.amazonaws.com/{}/{}'.format(bucket, path_aws)
    df = pd.read_csv(url_up, **set_parameter_csv)
    display(df.head())
    display(df.isnull().sum())
    print("="*30)

['tz_time', 'path', 'section', 'page_avg_scroll', 'page_avg_time', 'page_scroll_starts', 'page_total_time', 'page_uniques', 'page_views', 'page_views_loyal', 'page_views_quality', 'article_id', 'dia']
CONNECCION RAPIDA
['tz_time', 'path', 'section', 'page_avg_scroll', 'page_avg_time', 'page_scroll_starts', 'page_total_time', 'page_uniques', 'page_views', 'page_views_loyal', 'page_views_quality', 'article_id', 'dia']
CONNECCION RAPIDA
['tz_time', 'path', 'section', 'page_avg_scroll', 'page_avg_time', 'page_scroll_starts', 'page_total_time', 'page_uniques', 'page_views', 'page_views_loyal', 'page_views_quality', 'article_id', 'dia']
CONNECCION RAPIDA
['tz_time', 'path', 'section', 'page_avg_scroll', 'page_avg_time', 'page_scroll_starts', 'page_total_time', 'page_uniques', 'page_views', 'page_views_loyal', 'page_views_quality', 'article_id', 'dia']
CONNECCION RAPIDA
['tz_time', 'path', 'section', 'page_avg_scroll', 'page_avg_time', 'page_scroll_starts', 'page_total_time', 'page_uniques', 