In [1]:
from oauth2client.service_account import ServiceAccountCredentials
from googleapiclient.discovery import build
import pprint
import pandas as pd
import configparser

# Pretty-printer instance kept at notebook scope.
pp = pprint.PrettyPrinter()

# Restrict key/value delimiters to '=' only (one-element tuple, hence the trailing comma).
config_parser = configparser.ConfigParser(delimiters=('=',))

In [2]:
# To do:
# Add dimension filter option to request builder functions

# General functions
def get_service_client(scopes, api_name, api_version, service_key_file=None, credentials=None):
    """
    Build a service client for ``api_name``/``api_version`` via ``build``.

    If ``credentials`` is supplied (truthy) it is used as-is; otherwise credentials are
    loaded from ``service_key_file`` with the given ``scopes`` through
    ``ServiceAccountCredentials.from_json_keyfile_name``.
    """
    if not credentials:
        credentials = ServiceAccountCredentials.from_json_keyfile_name(service_key_file, scopes=scopes)

    return build(api_name, api_version, credentials=credentials)

# Management API functions
def print_accounts(account_summary):
    """
    Print the account > property > view tree found under ``account_summary['items']``.

    Accounts are printed unindented, properties with one tab, and views with two tabs
    followed by the view id.
    """
    for acct in account_summary.get('items'):
        acct_name = acct.get('name')
        print(f"\nAccount: {acct_name}")

        for web_property in acct.get('webProperties'):
            property_name = web_property.get('name')
            print(f"\tProperty: {property_name}")

            for profile in web_property.get('profiles'):
                profile_name, profile_id = profile.get('name'), profile.get('id')
                print(f"\t\tView: {profile_name} - {profile_id}")

def get_account_summary(management_client, print_accounts=None):
    """
    Fetch the account summaries through a service client built on the management API.

    Parameters:
        management_client: client exposing ``management().accountSummaries().list()``.
        print_accounts: when truthy, the account structure is printed. ``True`` uses the
            module-level ``print_accounts`` helper; a callable is invoked with the
            summary instead.

    Returns:
        The executed ``accountSummaries().list()`` response.
    """
    account_summary = management_client.management().accountSummaries().list().execute()

    if print_accounts:
        # The flag has the same name as the module-level print_accounts() helper and
        # shadows it inside this function, so calling it directly would try to call
        # the flag value (e.g. True). Look the helper up in module globals instead.
        printer = print_accounts if callable(print_accounts) else globals()['print_accounts']
        printer(account_summary)

    return account_summary

def get_view_ids(account_summary, account_name=None, prop_name=None):
    """
    Flatten an account summary into a list of ``{"view_name", "view_id"}`` dicts.

    Parameters:
        account_summary: dict whose ``'items'`` hold accounts, each with
            ``'webProperties'``, each with ``'profiles'`` (views).
        account_name: if given, only accounts with exactly this name are included.
        prop_name: if given, only properties with exactly this name are included.

    Returns:
        One dict per view; ``view_name`` is "account > property > view" and
        ``view_id`` is the view's ``'id'``. Accounts without properties and
        properties without views contribute nothing.
    """
    view_ids = []
    # ``or []`` covers both a missing key and an explicit None value.
    for account in account_summary.get('items') or []:
        if account_name is not None and account.get('name') != account_name:
            continue
        for prop in account.get('webProperties') or []:
            if prop_name is not None and prop.get('name') != prop_name:
                continue
            for view in prop.get('profiles') or []:
                view_ids.append({"view_name": f"{account.get('name')} > {prop.get('name')} > {view.get('name')}",
                                 "view_id": view.get('id')})

    return view_ids

def get_segments(management_client, print_segments=None):
    """
    Fetch the segment list through a management API client.

    When ``print_segments`` is truthy the segment names and ids are also printed via
    ``print_segment_names``. The executed ``segments().list()`` response is returned
    either way.
    """
    segment_listing = management_client.management().segments().list().execute()

    if not print_segments:
        return segment_listing

    print_segment_names(segment_listing)
    return segment_listing

def print_segment_names(segments):
    """Print one ``<name> = <segmentId>`` line for every entry in ``segments['items']``."""
    for entry in segments.get('items'):
        label, identifier = entry.get('name'), entry.get('segmentId')
        print(f"{label} = {identifier}")

def get_segment_ids(segment_name, segments):
    """
    Return the ``segmentId`` of every entry in ``segments['items']`` whose ``name``
    equals ``segment_name``, in listing order (empty list when nothing matches).
    """
    return [
        entry.get('segmentId')
        for entry in segments.get('items')
        if entry.get('name') == segment_name
    ]
        
def print_segment_def(segments, segment_id=None, segment_name=None):
    """
    Print ``name, segmentId = definition`` for each matching segment.

    Matching is done on ``segment_id`` when it is given; otherwise on ``segment_name``.
    With neither argument nothing is printed.
    """
    # Pick the single field to match on; the id takes precedence over the name.
    if segment_id:
        match_key, wanted = 'segmentId', segment_id
    elif segment_name:
        match_key, wanted = 'name', segment_name
    else:
        return

    for segment in segments.get('items'):
        if segment.get(match_key) != wanted:
            continue
        print(f"{segment.get('name')}, {segment.get('segmentId')} = {segment.get('definition')}")

# Reporting API functions
def build_request_parameters(view_id, start_date, end_date, sampling_level=None, segment=None, cohorts=None):
    """
    Build the parameters shared by every report request.

    Parameters:
        view_id: value stored under ``'viewId'``.
        start_date, end_date: stored as the single entry of ``'dateRanges'``.
        sampling_level: if given, stored under ``'samplingLevel'``.
        segment: if given, a single segment id stored as ``'segments': [{'segmentId': ...}]``.
            Multiple segments are not supported here.
        cohorts: accepted for future use; currently ignored.

    Returns:
        A dict that can be merged into each report request.
    """
    request_parameters = {'dateRanges': [{'startDate': start_date,
                                          'endDate': end_date}],
                          'viewId': view_id}

    if sampling_level:
        request_parameters['samplingLevel'] = sampling_level

    # Independent of sampling_level: both options may be set on the same request.
    if segment:
        request_parameters['segments'] = [{'segmentId': segment}]

    return request_parameters

def build_request_body(data_requests):
    """
    Turn a list of ``{'metrics': [...], 'dimensions': [...]}`` dicts into a request body.

    Each metric name becomes ``{'expression': 'ga:<name>'}`` and each dimension name
    becomes ``{'name': 'ga:<name>'}``. A request that omits ``'metrics'`` or
    ``'dimensions'`` gets an empty list for that key instead of raising.

    Returns:
        ``{'reportRequests': [...]}`` with one formatted request per input dict.
    """
    report_requests = []
    for request in data_requests:
        # ``or []`` treats a missing key (or None) as "no entries".
        metric_names = request.get('metrics') or []
        dimension_names = request.get('dimensions') or []

        report_requests.append({
            'metrics': [{'expression': 'ga:' + metric} for metric in metric_names],
            'dimensions': [{'name': 'ga:' + dimension} for dimension in dimension_names],
        })

    return {'reportRequests': report_requests}

def get_analytics_data(reporting_client, request_parameters, request_body):
    """
    Merge ``request_parameters`` into every report request and execute the batch.

    ``request_body`` is updated in place. When ``request_parameters`` defines
    ``'segments'``, a ``ga:segment`` dimension is added to each request; it is added
    at most once, so sending the same body again does not duplicate it.

    Parameters:
        reporting_client: client exposing ``reports().batchGet(body=...)``.
        request_parameters: dict merged into each entry of ``request_body['reportRequests']``.
        request_body: dict with a ``'reportRequests'`` list.

    Returns:
        The executed ``batchGet`` response.
    """
    uses_segments = bool(request_parameters.get('segments'))

    for request in request_body['reportRequests']:
        request.update(request_parameters)

        if not uses_segments:
            continue

        dimensions = request.get('dimensions')
        if dimensions is None:
            # Request was built without a dimensions list; create one to hold ga:segment.
            dimensions = request['dimensions'] = []

        if {'name': 'ga:segment'} not in dimensions:
            dimensions.append({'name': 'ga:segment'})

    response_data = reporting_client.reports().batchGet(
        body=request_body
        ).execute()

    return response_data

def response_to_dataframe(response_data):
    """
    Convert a batch response into a list of DataFrames, one per report.

    Column labels are the report's dimension headers followed by its metric header
    names. Each row holds the row's dimension values followed by the values of its
    first ``'metrics'`` entry. Cell values are stored exactly as received.

    A report without ``'data'``/``'rows'`` yields an empty DataFrame that still
    carries the column labels; rows lacking ``'dimensions'`` or ``'metrics'``
    contribute no values for that part.

    Returns:
        List of ``pandas.DataFrame`` in the order of ``response_data['reports']``.
    """
    data_frames = []

    for report in response_data.get('reports', []):
        # Column labels: dimension headers first, then metric names.
        headers = report.get('columnHeader', {})
        dimension_headers = headers.get('dimensions', [])
        metric_details = headers.get('metricHeader', {}).get('metricHeaderEntries', [])
        metric_headers = [metric.get('name') for metric in metric_details]
        column_headers = list(dimension_headers) + metric_headers

        # ``or`` fallbacks cover missing keys as well as explicit None values,
        # e.g. a report that matched no rows.
        rows_list = []
        for row in (report.get('data') or {}).get('rows') or []:
            dimension_values = row.get('dimensions') or []
            metric_sets = row.get('metrics') or [{}]
            # Only the first metrics entry is read.
            metric_values = metric_sets[0].get('values') or []
            rows_list.append(tuple(dimension_values) + tuple(metric_values))

        data_frames.append(pd.DataFrame(rows_list, columns=column_headers))

    return data_frames

def remove_ga_prefix(string):
    """
    Return the part of ``string`` that follows its ``prefix:`` (e.g. ``'ga:sessions'``
    -> ``'sessions'``).

    A string without a colon is returned unchanged, so the function can safely be
    applied to names that were already cleaned.
    """
    parts = string.split(':')
    if len(parts) < 2:
        # No prefix to remove.
        return string
    return parts[1]

def clean_strings(strings, operations):
    """
    Apply every callable in ``operations``, in order, to each item of ``strings``.

    Parameters:
        strings: iterable of strings to clean.
        operations: sequence of one-argument callables; each receives the result of
            the previous one.

    Returns:
        List of cleaned strings, in input order.
    """
    results = []
    for string in strings:
        # Use the operations passed in, not a notebook-level global.
        for operation in operations:
            string = operation(string)
        results.append(string)
    return results

In [4]:
# Load settings from the INI file; the two values below come from its
# 'Service Account' section.
config_parser.read('PythonFiles/config.ini')

# 'scopes' is stored as one comma-separated option and split into a list here.
# NOTE(review): entries are not stripped, so any space after a comma would become
# part of the scope string - confirm the config file has none.
scopes = config_parser.get('Service Account', 'scopes').split(',')
service_credentials_file = config_parser.get('Service Account', 'service_credentials_file')

In [5]:
# Build a client for the 'analytics' v3 API from the service-account key file, then
# use it to fetch the account summary and the segment list (printing disabled).
management_client = get_service_client(scopes, 'analytics', 'v3', service_credentials_file)
account_summary = get_account_summary(management_client, print_accounts=False)
view_ids = get_view_ids(account_summary)
segments = get_segments(management_client, print_segments=False)
# get_segment_ids returns a list (every segment named 'Direct Traffic'), not a single id.
direct_segment_id = get_segment_ids('Direct Traffic', segments)

In [6]:
# Column-name clean-up steps, applied in this order by clean_strings.
clean_operations = [str.strip, remove_ga_prefix, str.title]

# NOTE(review): index 3 is a hard-coded position in view_ids; which view it selects
# depends on the order of the account summary - confirm this is the intended view.
view_id = view_ids[3].get('view_id')
start_date = config_parser.get('Report Parameters', 'start_date')
end_date = config_parser.get('Report Parameters', 'end_date')
# Takes the first matching id; raises IndexError if no segment is named 'Direct Traffic'.
segment = direct_segment_id[0]

# One dict per report. Names are written without the 'ga:' prefix, which
# build_request_body adds.
data_requests = [
    {'metrics': ['sessions', 'pageviews'],
     'dimensions': ['country', 'browser']},
    {'metrics': ['sessions', 'pageviews', 'bounces'],
     'dimensions': ['fullReferrer', 'country', 'browser']}]

In [7]:
# Build a client for the 'analyticsreporting' v4 API, assemble the request body and
# shared parameters (with the segment), and execute the batch request.
reporting_client = get_service_client(scopes, 'analyticsreporting', 'v4', service_credentials_file)
request_body = build_request_body(data_requests)
request_parameters = build_request_parameters(view_id, start_date, end_date, segment=segment)
response_data = get_analytics_data(reporting_client, request_parameters, request_body)
# response_to_dataframe returns a list with one DataFrame per report.
analytics_df = response_to_dataframe(response_data)

# Replace each frame's column labels with their cleaned versions.
for df in analytics_df:
    df.columns = clean_strings(df.columns, clean_operations)

In [8]:
# NOTE(review): this cell repeats the final two steps of the previous cell - the
# frames are rebuilt from response_data and their column labels cleaned again.
analytics_df = response_to_dataframe(response_data)

for df in analytics_df:
    df.columns = clean_strings(df.columns, clean_operations)

In [9]:
# Display the DataFrame for the second report (the request with three dimensions).
analytics_df[1]

Unnamed: 0,Fullreferrer,Country,Browser,Segment,Sessions,Pageviews,Bounces
0,(direct),Brazil,Firefox,Direct Traffic,2,2,2
