In [14]:
# Imports

import json, yaml
import numpy as np
import pandas as pd
import datetime
import math
from sqlalchemy.engine.url import URL
from sqlalchemy import create_engine
import socket

# Open connections

# NOTE(review): absolute, user-specific path; the notebook only runs on a
# machine where this file exists.
config_file = '/Users/annaartigas/ada/TarifaJusta/TJ_config.yml'
with open(config_file, 'r') as conf:
    # safe_load only constructs plain Python types (dict, list, str, ...).
    # yaml.load without an explicit Loader can instantiate arbitrary objects
    # and is rejected outright by recent PyYAML releases.
    config = yaml.safe_load(conf)


# The 'db_bi' mapping is expanded into keyword arguments of the SQLAlchemy URL.
db_bi = config['db_bi']
conn_bi = create_engine(URL('mysql', **db_bi))

In [15]:
from apiclient.discovery import build
from oauth2client.service_account import ServiceAccountCredentials


def get_service(api_name, api_version, scopes, key_file_location):
    """Build an authorised client for a Google API.

    Args:
        api_name: Name of the API to connect to.
        api_version: Version of that API.
        scopes: List of auth scopes requested for the application.
        key_file_location: Path to a service-account JSON key file.

    Returns:
        The service object produced by ``build`` for the requested API.
    """
    # Read the service-account key and bind it to the requested scopes.
    service_credentials = ServiceAccountCredentials.from_json_keyfile_name(
        key_file_location,
        scopes=scopes,
    )

    # Hand the credentials to the discovery client and return it directly.
    return build(api_name, api_version, credentials=service_credentials)


def get_first_profile_id(service):
    """Return the id of the first view (profile) reachable through ``service``.

    Walks account -> web property -> view, always taking the first entry of
    each listing.

    Args:
        service: Analytics service object exposing ``management()``.

    Returns:
        The id of the first view, or ``None`` as soon as any of the three
        listings has no items.
    """
    # All Google Analytics accounts visible to these credentials.
    account_items = service.management().accounts().list().execute().get('items')
    if not account_items:
        return None
    first_account_id = account_items[0].get('id')

    # All web properties of the first account.
    property_items = service.management().webproperties().list(
        accountId=first_account_id).execute().get('items')
    if not property_items:
        return None
    first_property_id = property_items[0].get('id')

    # All views (profiles) of the first property.
    profile_items = service.management().profiles().list(
        accountId=first_account_id,
        webPropertyId=first_property_id).execute().get('items')
    if not profile_items:
        return None

    return profile_items[0].get('id')

def get_results(results):
    """Unpack a Google Analytics API response into ``(rows, total_rows)``.

    Args:
        results: Response mapping returned by the GA API (the executed
            query), or a falsy value when there is no response.

    Returns:
        A tuple ``(GA_data, total_rows)`` where ``GA_data`` is the value of
        the ``'rows'`` key (``None`` when absent) and ``total_rows`` is the
        value of ``'totalResults'``. For a falsy ``results`` the tuple is
        ``(None, 0)`` so callers can still build an empty frame and do
        arithmetic on the count.
    """
    if not results:
        print('No results found')
        # Return a well-defined empty answer instead of reaching the final
        # return with unassigned locals.
        return (None, 0)

    GA_data = results.get('rows')
    total_rows = results.get('totalResults')

    try:
        sample_ratio = int(results.get('sampleSize')) / int(results.get('sampleSpace'))
    except (TypeError, ValueError, ZeroDivisionError):
        # Best effort only: int(None) raises TypeError when the sampling keys
        # are missing, non-numeric text raises ValueError, and a zero sample
        # space raises ZeroDivisionError. None of these affect the data.
        pass
    else:
        print('Sample percentage:', sample_ratio, ' from ', results.get('sampleSpace'))

    return (GA_data, total_rows)

In [17]:

##############################################################################################

# Query to be sent to GA API

def query_OPE_info(service, profile_id, start_date, end_date, start_index,
                   max_results4query = 10000):
    """Run one page of the goal-completion query against the GA API.

    Args:
        service: Analytics service object exposing ``data().ga().get(...)``.
        profile_id: View (profile) id as a string; prefixed with ``'ga:'``.
        start_date: First day of the query, passed through as given.
        end_date: Last day of the query, passed through as given.
        start_index: Index of the first row to return for this page.
        max_results4query: Maximum number of rows requested per page.

    Returns:
        Whatever ``execute()`` returns for the request.
    """
    ga_get = service.data().ga().get

    request_params = {
        'ids': 'ga:' + profile_id,
        'max_results': max_results4query,
        'start_index': start_index,
        'start_date': start_date,
        'end_date': end_date,
        'samplingLevel': 'HIGHER_PRECISION',
        # goal2Completions: CMB/CMN, goal19Completions: tarification
        'metrics': 'ga:goal2Completions, ga:goal19Completions',
        'dimensions': 'ga:dimension13, ga:dateHourMinute',
        # ';' combines filters with AND, ',' combines them with OR
        'filters': 'ga:goal2Completions>0,ga:goal19Completions>0',
        'segment': 'users::condition::ga:country==Spain',
    }

    return ga_get(**request_params).execute()

def retrieve_GA_ope_data(start_date, last_date):
    """Download the goal-completion rows from Google Analytics, one day per query.

    Each day in ``[start_date, last_date)`` is queried separately and paged
    through until the number of rows fetched for that day reaches the total
    reported by the API.

    Args:
        start_date: First day to fetch (inclusive). Must support
            ``strftime``, ``<`` and adding a ``datetime.timedelta``.
        last_date: Day at which to stop (exclusive).

    Returns:
        A DataFrame with columns ``cookie``, ``datetime``, ``CMB_CMN`` and
        ``tarification``. The row index is not reset between pages. When no
        rows were fetched, an empty DataFrame with those columns is returned.
    """
    scope = 'https://www.googleapis.com/auth/analytics.readonly'
    key_file_location = '/Users/annaartigas/ada/analytics/extract_GA_info/creds.json'
    socket.setdefaulttimeout(1200)  # set timeout to 20 minutes (process-wide)
    max_days4query = 1
    column_names = ['cookie', 'datetime', 'CMB_CMN', 'tarification']

    # Authenticate and construct service.
    service = get_service(
            api_name='analytics',
            api_version='v3',
            scopes=[scope],
            key_file_location=key_file_location)

    profile_id = get_first_profile_id(service)

    # Pages are collected in a list and concatenated once at the end; growing
    # a DataFrame inside the loop copies all previous rows on every page.
    pages = []

    # Iterate from the caller's start_date (not from a module-level global).
    current_date = start_date
    while current_date < last_date:
        # Each query covers exactly one day: start and end are the same date.
        day_query = str(current_date.strftime('%Y-%m-%d'))
        start_index = 1
        rows_fetched = 0

        while True:
            rows, day_total = get_results(
                query_OPE_info(service, profile_id, day_query, day_query, start_index))
            page = pd.DataFrame(rows)

            # An empty page means there is nothing (more) to read for this
            # day; without this exit the paging loop could never terminate.
            if page.empty:
                break

            pages.append(page)
            rows_fetched += len(page)

            # Stop once the day's reported total has been reached.
            if rows_fetched >= (day_total or 0):
                break

            start_index += len(page)

        current_date = current_date + datetime.timedelta(days=max_days4query)

    if not pages:
        # Nothing fetched: still hand back the expected schema.
        return pd.DataFrame(columns=column_names)

    GA_results = pd.concat(pages)
    GA_results.columns = column_names
    return GA_results


# Fetch everything from 1 March 2020 up to (but not including) today.
first_date = datetime.date(year=2020, month=3, day=1)
last_date = datetime.date.today()
GA_data = retrieve_GA_ope_data(start_date=first_date, last_date=last_date)

Traceback (most recent call last):
  File "/Users/annaartigas/anaconda3/envs/billings/lib/python3.6/site-packages/googleapiclient/discovery_cache/__init__.py", line 36, in autodetect
    from google.appengine.api import memcache
ModuleNotFoundError: No module named 'google.appengine'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/annaartigas/anaconda3/envs/billings/lib/python3.6/site-packages/googleapiclient/discovery_cache/file_cache.py", line 33, in <module>
    from oauth2client.contrib.locked_file import LockedFile
ModuleNotFoundError: No module named 'oauth2client.contrib.locked_file'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/annaartigas/anaconda3/envs/billings/lib/python3.6/site-packages/googleapiclient/discovery_cache/file_cache.py", line 37, in <module>
    from oauth2client.locked_file import LockedFile
ModuleNotFoundError: No m

In [18]:
# Derive 'day' (YYYY-MM-DD) from the first eight characters of 'datetime',
# using vectorized string slicing instead of a per-row Python lambda.
datetime_text = GA_data['datetime'].astype(str)
GA_data['day'] = datetime_text.str[:4] + '-' + datetime_text.str[4:6] + '-' + datetime_text.str[6:8]

# Convert both goal counters to numbers in one call per column rather than
# element by element.
GA_data['CMB_CMN'] = pd.to_numeric(GA_data['CMB_CMN'])
GA_data['tarification'] = pd.to_numeric(GA_data['tarification'])

In [19]:
# Both goal counters are summed in every aggregation below.
goal_sums = {'CMB_CMN': 'sum', 'tarification': 'sum'}

# Total completions per (cookie, day) pair.
pre_OPE_GA = GA_data.groupby(['cookie', 'day']).agg(goal_sums).reset_index()

# Collapse each total to a 0/1 flag, so the per-day sum below counts the
# (cookie, day) pairs with at least one completion.
pre_OPE_GA = pre_OPE_GA.assign(**{
    goal: np.where(pre_OPE_GA[goal] > 0, 1, 0) for goal in goal_sums
})

cmb_tarf = pre_OPE_GA.groupby('day').agg(goal_sums).reset_index()

In [20]:
# Function to insert new data to database

def insertNewDataToDB(df, table, conn):
    """Append to ``table`` the rows of ``df`` that the table does not hold yet.

    Args:
        df: DataFrame with the candidate rows.
        table: Name of the target table. It is interpolated directly into the
            SELECT statement, so it must be a trusted identifier.
        conn: Connection or engine accepted by ``pandas.read_sql`` and
            ``DataFrame.to_sql``.

    Returns:
        None. Rows are written to the database as a side effect.
    """
    # Find info already in DB
    already_in = pd.read_sql(
        f"""
        SELECT * FROM {table}
        """, conn)

    # Rows containing missing values are never inserted.
    candidates = df.dropna()

    # Compare whole rows on the columns both sides share. DataFrame.isin with
    # a DataFrame argument aligns on index and column labels, i.e. it compares
    # row i with row i, which is not a membership test.
    shared_cols = [col for col in candidates.columns if col in already_in.columns]
    if shared_cols and len(already_in) > 0 and len(candidates) > 0:
        # Values are compared by their string form so that e.g. a date object
        # read from the DB matches the same day held as text in df.
        # NOTE(review): an integer stored as float in the DB ('5.0') would
        # not match '5' -- confirm the table's column types.
        existing_rows = set(
            already_in[shared_cols].astype(str).itertuples(index=False, name=None))
        is_new = [
            row not in existing_rows
            for row in candidates[shared_cols].astype(str).itertuples(index=False, name=None)
        ]
        candidates = candidates.loc[is_new]

    # Find info not already in DB and insert
    df_notInDB = candidates
    if len(df_notInDB) > 0:
        df_notInDB.to_sql(name=table, con=conn, if_exists="append", index=False)

    return

# Push the per-day totals into the 'OPE_GA' table through the BI connection.
insertNewDataToDB(df=cmb_tarf, table='OPE_GA', conn=conn_bi)