### Import necessary packages

In [1]:
import os
import tempfile
import pygsheets
import pandas as pd
import asyncio
from functools import partial
from gql import Client, gql
from gql.transport.aiohttp import AIOHTTPTransport
from utils import write_base64str_obj_to_file

### Retrieve data using GraphQL

In [2]:
# # Select your transport with a defined url endpoint
# transport = AIOHTTPTransport(url="https://gis-api.aiesec.org/graphql/?access_token=<ACCESS_TOKEN>")

# async def getData():
#     # Create a GraphQL client using the defined transport
#     async with Client(transport=transport, fetch_schema_from_transport=True) as session:

#         # Provide a GraphQL query
#         query = gql(
#             """
#             query getApplicationList ($limit: Int, $start_date: DateTime, $end_date: DateTime){
#             allOpportunityApplication(per_page: $limit, filters: {created_at: {from: $start_date, to: $end_date}}) {
#                 data {
#                 id
#                 status
#                 created_at
#                 date_matched
#                 date_pay_by_cash
#                 date_approved
#                 date_realized
#                 experience_start_date
#                 experience_end_date
#                 date_approval_broken
#                 nps_response_completed_at
#                 updated_at
#                 person {
#                     id
#                     full_name
#                     home_mc {
#                     name
#                     }
#                     home_lc {
#                     name
#                     }
#                 }
#                 host_lc {
#                     name
#                 }
#                 host_mc: home_mc {
#                     name
#                 }
#                 opportunity {
#                     id
#                     created_at
#                     title
#                     duration
#                     sub_product {
#                     name
#                     }
#                     programme {
#                     short_name_display
#                     }
#                 }
#                 standards {
#                     option
#                 }
#                 }
#             }
#             }
#         """
#         )

#         params = {	"mc_id": [518],
#                     "start_date": "2021-01-01",
#                     "end_date": "",
#                     "limit": 1000
#                 }

#         # Execute the query on the transport
#         results = await session.execute(query, variable_values=params)
#         # print(result)
#         return results

# results = await getData()

### Store response as a dataframe

In [2]:

from utils import get_config

# Load the project configuration and mirror it into the process environment,
# which is where the rest of this cell reads its settings from.
config_vars = get_config()
os.environ.update(config_vars)

# Build the GraphQL transport; the API token travels as a URL query parameter.
access_token = os.environ['ACCESS_TOKEN']
transport = AIOHTTPTransport(
    url=f"https://gis-api.aiesec.org/graphql/?access_token={access_token}"
)




In [7]:
async def getData():
    # Create a GraphQL client using the defined transport
    async with Client(transport=transport, fetch_schema_from_transport=True) as session:

        # Provide a GraphQL query
        query = gql(
            """
            query getApplicationList ($limit: Int, $start_date: DateTime, $end_date: DateTime){
            allOpportunityApplication(per_page: $limit, filters: {created_at: {from: $start_date, to: $end_date}}) {
                data {
                id
                status
                created_at
                date_matched
                date_pay_by_cash
                date_approved
                date_realized
                experience_start_date
                experience_end_date
                date_approval_broken
                nps_response_completed_at
                updated_at
                person {
                    id
                    full_name
                    contact: contact_detail {
                        email
                        phone
                    }
                    home_mc {
                    name
                    }
                    home_lc {
                    name
                    }
                }
                host_lc {
                    name
                }
                host_mc: home_mc {
                    name
                }
                opportunity {
                    id
                    created_at
                    title
                    duration
                    sub_product {
                    name
                    }
                    programme {
                    short_name_display
                    }
                }
                standards {
                    option
                }
                }
            }
            }
        """
        )

        params = {	"mc_id": [518], # Bahrain's MC ID
                    "start_date": "2021-01-01",
                    "end_date": "",
                    "limit": 1000 # Could be any large enough number
                }

        # Execute the query on the transport
        results = await session.execute(query, variable_values=params)
        # print(result)
        return results
results = await getData()


## Process The Data

In [8]:
# Script variant, kept for reference:
# print("Executing query off of EXPA ...")
# apps_data = asyncio.run(getData())

print("Started preprocessing...")

# Pull the list of application records out of the response envelope
apps_data = results['allOpportunityApplication']['data']

# Flatten the nested records; nested keys become "_"-joined column names
apps_df = pd.json_normalize(data=apps_data, sep='_')



Started preprocessing...


In [9]:
# Summarise the flattened table: column names, non-null counts and dtypes
apps_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185 entries, 0 to 184
Data columns (total 29 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   id                                        185 non-null    object 
 1   status                                    185 non-null    object 
 2   created_at                                185 non-null    object 
 3   date_matched                              12 non-null     object 
 4   date_pay_by_cash                          0 non-null      object 
 5   date_approved                             4 non-null      object 
 6   date_realized                             0 non-null      object 
 7   experience_start_date                     0 non-null      object 
 8   experience_end_date                       0 non-null      object 
 9   date_approval_broken                      0 non-null      object 
 10  nps_response_completed_at             

In [9]:
# Derive reader-friendly columns that double as grouping indices:
# * Department
# * LC
# * Partner_MC
# * Partner_LC

# Names of the derived columns, in the order generate_new_fields fills them
new_fields = ['department', 'lc', 'partner_mc', 'partner_lc']


def generate_new_fields(row):
    """Build the derived grouping values for one application row.

    A row whose ``person_home_mc_name`` is 'Bahrain' gets an 'o' prefix on its
    programme name, its LC is the person's home LC, and the host MC/LC are the
    partner. Any other row gets an 'i' prefix, its LC is the host LC, and the
    person's home MC/LC are the partner.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) holding the
            ``person_home_*``, ``host_*`` and
            ``opportunity_programme_short_name_display`` values.

    Returns:
        dict keyed by the names in ``new_fields``.
    """
    is_outgoing = row['person_home_mc_name'] == 'Bahrain'
    if is_outgoing:
        prefix = 'o'
        source_columns = ('person_home_lc_name', 'host_mc_name', 'host_lc_name')
    else:
        prefix = 'i'
        source_columns = ('host_lc_name', 'person_home_mc_name', 'person_home_lc_name')

    department = prefix + row['opportunity_programme_short_name_display']
    derived = [department] + [row[column] for column in source_columns]
    return dict(zip(new_fields, derived))

print("Generating new fields and tables ...")
# Expand each row's dict of derived values into the new_fields columns
apps_df[new_fields] = apps_df.apply(generate_new_fields, axis=1, result_type='expand')

# These source columns were only inputs to the derived fields; remove them
cols_to_drop = [
    'opportunity_programme_short_name_display',
    'host_mc_name',
    'host_lc_name',
    'person_home_mc_name',
    'person_home_lc_name',
]
apps_df.drop(columns=cols_to_drop, inplace=True)



Generating new fields and tables ...


In [10]:
# Display the applications table, now carrying the derived grouping columns
apps_df

Unnamed: 0,id,status,created_at,date_matched,date_pay_by_cash,date_approved,date_realized,experience_start_date,experience_end_date,date_approval_broken,...,opportunity_created_at,opportunity_title,opportunity_duration,opportunity_sub_product,opportunity_sub_product_name,person_contact_detail,department,lc,partner_mc,partner_lc
0,6192026,open,2021-07-06T13:08:53Z,,,,,,,,...,2021-04-12T13:09:04Z,Raise Awareness about Gender Discrimination in...,,,,,oGV,Tylos,Turkey,ADANA
1,6192014,open,2021-07-06T12:38:45Z,,,,,,,,...,2020-11-07T09:15:04Z,Practice Foreigner Language | Discuss World Is...,,,,,oGV,Tylos,Turkey,ISTANBUL ASIA
2,6191987,open,2021-07-06T10:50:55Z,,,,,,,,...,2021-04-22T15:34:12Z,Yellow Energy,,,,,oGV,Tylos,Spain,Pompeu Fabra Barcelona
3,6191986,open,2021-07-06T10:47:32Z,,,,,,,,...,2021-06-25T16:38:52Z,GoTrade Program Manager Intern,,,Other,,oGTa,Tylos,Germany,BONN
4,6191985,open,2021-07-06T10:45:44Z,,,,,,,,...,2021-06-16T23:23:39Z,Customer Service and Sales,,,Business Administration,,oGTa,Tylos,Greece,NKUA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,6105751,withdrawn,2021-02-21T23:26:36Z,,,,,,,,...,2020-06-16T16:50:04Z,MARIAS l Fight for gender equality in Brazil,,,,,oGV,(Closed RCSI),Brazil,SAO CARLOS
181,6098568,rejected,2021-02-08T09:36:08Z,,,,,,,,...,2021-01-19T08:23:28Z,Student Experience & Lifestyle Executive,,,Business Development,,oGTa,Awal,Malaysia,"Taylor's University (TU) , Subang Jaya"
182,6097866,withdrawn,2021-02-06T08:07:58Z,,,,,,,,...,2020-11-24T18:00:02Z,Better Life | No Poverty l Egypt,,,,,oGV,Dilmun,Egypt,AAST In CAIRO
183,6097744,rejected,2021-02-05T19:44:21Z,,,,,,,,...,2020-11-24T17:46:28Z,Life without hunger,,,,,oGV,Dilmun,Egypt,AAST In CAIRO


In [23]:
# Produce Performance Analytics DataFrame
#     * First convert dates from longform to YYYY-MM-DD
#     * Group by Date, LC, Dept, PartnerMC, PartnerLC, and the metrics like
#       # of Applications, Accepted etc.. will be the aggregation

date_cols = ['created_at', 'date_matched', 'date_approved', 'date_realized', 'updated_at']
multi_indices = ['lc', 'department', 'partner_mc', 'partner_lc']
# Spelling kept as-is: get_timeseries_formetric reads this name.
aggregration_fields = ['id', 'person_id']

# Generate table with these columns only
perf_table = apps_df[aggregration_fields + date_cols + multi_indices].copy()

# Keep only the leading YYYY-MM-DD of each timestamp. Taking the first ten
# characters (instead of trimming ten from the end) gives the same result for
# values like "2021-07-06T13:08:53Z" and stays correct if a timestamp ever
# carries fractional seconds or a longer offset. Missing values are left alone.
perf_table.loc[:, date_cols] = apps_df[date_cols].applymap(lambda x: x[:10], na_action='ignore')

def get_timeseries_formetric(table: pd.DataFrame, other_fields: list, selected_date_col: str, metric_name: str) -> pd.DataFrame:
    """Return one metric's rows as a long-format table.

    Keeps the chosen date column, ``other_fields`` and the module-level
    ``aggregration_fields``; sorts by date and then ``other_fields``; tags
    every row with ``metric_name`` in a 'Status' column; renames the date
    column to 'date', 'id' to 'Application' and 'person_id' to 'Person'.

    Args:
        table: Source table containing all referenced columns.
        other_fields: Extra columns to keep and use as secondary sort keys.
        selected_date_col: Date column that represents this metric.
        metric_name: Label written into the 'Status' column.

    Returns:
        The resulting frame without any row that has a missing value.
    """
    kept_columns = [selected_date_col, *other_fields, *aggregration_fields]
    ordered = table[kept_columns].sort_values(by=[selected_date_col, *other_fields])
    ordered['Status'] = metric_name

    friendly_names = {
        selected_date_col: "date",
        "id": "Application",
        "person_id": "Person",
    }
    labelled = ordered.rename(columns=friendly_names)

    # A missing date means this application never reached the metric's stage
    return labelled.dropna(axis=0)

# One long-format table per funnel stage: (date column, status label)
apps_per_day, acc_per_day, apd_per_day = (
    get_timeseries_formetric(perf_table, multi_indices, stage_date_col, stage_label)
    for stage_date_col, stage_label in (
        ("created_at", "Applied"),
        ("date_matched", "Accepted"),
        ("date_approved", "Approved"),
    )
)



In [24]:
# Display the rows tagged "Approved"
apd_per_day

Unnamed: 0,date,lc,department,partner_mc,partner_lc,Application,Person,Status
120,2021-06-17,Dilmun,oGTa,United Arab Emirates,UAE (MC),6132934,4601348,Approved
137,2021-06-21,Dilmun,oGTa,United Arab Emirates,UAE (MC),6131830,4599643,Approved
62,2021-07-01,(Closed RCSI),oGTa,United Arab Emirates,UAE (MC),6181987,1002581,Approved
143,2021-07-05,Awal,oGTa,United Arab Emirates,UAE (MC),6131606,4599282,Approved


In [25]:
# Stack the per-stage tables into one long table and blank out any gaps
perf_analysis_df = pd.concat([apps_per_day, acc_per_day, apd_per_day]).fillna("")



In [26]:
# Display the combined Applied / Accepted / Approved table
perf_analysis_df

Unnamed: 0,date,lc,department,partner_mc,partner_lc,Application,Person,Status
184,2021-02-02,Tylos,oGTa,Germany,BONN,6096299,4531535,Applied
183,2021-02-05,Dilmun,oGV,Egypt,AAST In CAIRO,6097744,4534283,Applied
182,2021-02-06,Dilmun,oGV,Egypt,AAST In CAIRO,6097866,4534283,Applied
181,2021-02-08,Awal,oGTa,Malaysia,"Taylor's University (TU) , Subang Jaya",6098568,1725431,Applied
180,2021-02-21,(Closed RCSI),oGV,Brazil,SAO CARLOS,6105751,1002581,Applied
...,...,...,...,...,...,...,...,...
30,2021-07-05,Dilmun,oGV,Turkey,DENIZLI,6189329,2210151,Accepted
120,2021-06-17,Dilmun,oGTa,United Arab Emirates,UAE (MC),6132934,4601348,Approved
137,2021-06-21,Dilmun,oGTa,United Arab Emirates,UAE (MC),6131830,4599643,Approved
62,2021-07-01,(Closed RCSI),oGTa,United Arab Emirates,UAE (MC),6181987,1002581,Approved


In [None]:
# ### Push it to Google Sheets

# Credentials from service account file for Google Sheets
print("Creating temporary file for service account credentials...")

# Read the secret first so a missing variable surfaces as a plain KeyError
# instead of being masked by a later authorization failure.
access_creds = os.environ['GOOGLE_CREDS']

# Authorize only after the credentials were written successfully. The context
# manager closes the temporary file even if writing or authorizing raises.
with tempfile.NamedTemporaryFile() as temp:
    # presumably decodes the base64 string into the file at temp.name —
    # helper comes from the project's utils module; verify there
    write_base64str_obj_to_file(access_creds, temp.name)
    gc = pygsheets.authorize(service_file=temp.name)

print("Writing to Google Sheets...")
workbook = gc.open_by_key(os.environ["SPREADSHEET_ID"])

# Worksheet titles are read from the environment
perf_worksheet = workbook.worksheet_by_title(os.environ["PerformanceSheet"])
applications_worksheet = workbook.worksheet_by_title(os.environ["ApplicationsSheet"])

# Create handy function to write to sheets: call as
# set_worksheet_todf(worksheet, dataframe); writes from cell A1 with headers
set_worksheet_todf = partial(pygsheets.Worksheet.set_dataframe, start="A1", copy_head=True)

set_worksheet_todf(perf_worksheet, perf_analysis_df)
set_worksheet_todf(applications_worksheet, apps_df)
print("Done!")

Create new columns for Easy Reading and Indices
* LC
* LC_ID*
* Department
* Partner_MC
* Partner_LC

\* LC_ID is probably needed for verification.

In [6]:
# Columns derived for grouping, in the order generate_new_fields fills them
new_cols = ['dept_prefix', 'lc', 'partner_mc', 'partner_lc']


# NOTE(review): an earlier cell defines a function with this same name that
# returns 'department' instead of 'dept_prefix'; whichever cell runs last wins.
def generate_new_fields(row):
    """Build the direction prefix and LC/partner values for one row.

    A row whose ``person_home_mc_name`` is 'Bahrain' yields prefix 'o', the
    person's home LC as ``lc`` and the host MC/LC as partner. Any other row
    yields prefix 'i', the host LC as ``lc`` and the person's home MC/LC as
    partner.

    Args:
        row: Mapping-like record holding the ``person_home_*`` and ``host_*``
            name values.

    Returns:
        dict keyed by the names in ``new_cols``.
    """
    if row['person_home_mc_name'] == 'Bahrain':
        picked = ['o'] + [
            row[key] for key in ('person_home_lc_name', 'host_mc_name', 'host_lc_name')
        ]
    else:
        picked = ['i'] + [
            row[key] for key in ('host_lc_name', 'person_home_mc_name', 'person_home_lc_name')
        ]

    return {column: value for column, value in zip(new_cols, picked)}

# Expand each row's derived values into the new_cols columns, then preview the
# first ten rows of those columns.
# NOTE(review): earlier cells assign `results` from getData() and index it by
# key like a mapping, while `.apply(..., axis=1)` here needs a DataFrame. This
# looks like a cell from an older revision; confirm before running top to bottom.
results[new_cols] = results.apply(lambda row: generate_new_fields(row), axis=1, result_type='expand')
results[new_cols].head(10)

Unnamed: 0,dept_prefix,lc,partner_mc,partner_lc
0,o,Awal,Turkey,ESKISEHIR
1,o,Awal,Turkey,SAMSUN
2,i,Tylos,Turkey,EASTERN MEDITERRANEAN
3,o,Dilmun,Turkey,ESKISEHIR
4,o,Awal,France,MC France
5,o,Dilmun,Sri Lanka,COLOMBO CENTRAL
6,i,Tylos,Sri Lanka,NSBM
7,o,Awal,Turkey,SAMSUN
8,o,Awal,Brazil,SAO CARLOS
9,o,Tylos,Turkey,IZMIR


In [7]:
# Label each programme with its incoming/outgoing prefix, e.g. 'o' + 'GV' -> 'oGV'
results['department'] = results['dept_prefix'] + results['opportunity_programme_short_name_display']
# The bare programme name is now folded into 'department'
results.drop(columns='opportunity_programme_short_name_display', inplace=True)
results['department']

0      oGV
1      oGV
2      iGV
3      oGV
4     oGTa
      ... 
93     oGV
94    oGTa
95     oGV
96     oGV
97    oGTa
Name: department, Length: 98, dtype: object

## Produce Performance Analytics DataFrame

* First convert dates from longform to YYYY-MM-DD
* Group by Date, LC, Dept, PartnerMC, PartnerLC, and the metrics like # of Applications, Accepted etc.. will be the aggregation

In [8]:
date_cols = ['created_at', 'date_matched', 'date_approved', 'date_realized', 'updated_at']
multi_indices = ['lc', 'department', 'partner_mc', 'partner_lc']
# Identifier columns carried along for counting; read by splitup_date_field
counting_by = ['id', 'person_id']

# Generate table with these columns only
perf_table = results[counting_by + date_cols + multi_indices].copy()

# Keep only the leading YYYY-MM-DD of each timestamp. Taking the first ten
# characters (instead of trimming ten from the end) gives the same result for
# 20-character timestamps and stays correct if a value ever carries fractional
# seconds or a longer offset. Missing values are left alone.
perf_table.loc[:, date_cols] = results[date_cols].applymap(lambda x: x[:10], na_action='ignore')


perf_table.head()


Unnamed: 0,id,person_id,created_at,date_matched,date_approved,date_realized,updated_at,lc,department,partner_mc,partner_lc
0,6169117,4652381,2021-05-24,,,,2021-05-25,Awal,oGV,Turkey,ESKISEHIR
1,6169114,4652381,2021-05-24,,,,2021-05-24,Awal,oGV,Turkey,SAMSUN
2,6166570,3021521,2021-05-19,,,,2021-05-25,Tylos,iGV,Turkey,EASTERN MEDITERRANEAN
3,6165784,4563347,2021-05-18,2021-05-24,,,2021-05-24,Dilmun,oGV,Turkey,ESKISEHIR
4,6162477,584671,2021-05-12,,,,2021-05-12,Awal,oGTa,France,MC France


In [9]:

def splitup_date_field(table: pd.DataFrame, remaining_fields: list, sel_date_col: str, metric_name: str):
    """Return one metric's rows with metric-specific identifier columns.

    Keeps the chosen date column, ``remaining_fields`` and the module-level
    ``counting_by`` columns; sorts by date and then ``remaining_fields``;
    renames the date column to 'date', 'id' to '<metric_name>~APP' and
    'person_id' to '<metric_name>~PPL'.

    Args:
        table: Source table containing all referenced columns.
        remaining_fields: Extra columns to keep and use as secondary sort keys.
        sel_date_col: Date column that represents this metric.
        metric_name: Prefix for the renamed identifier columns.

    Returns:
        The resulting frame without any row that has a missing value.
    """
    subset = table[[sel_date_col, *remaining_fields, *counting_by]]
    column_names = {
        sel_date_col: "date",
        "id": metric_name + "~APP",
        "person_id": metric_name + "~PPL",
    }
    ordered = subset.sort_values([sel_date_col, *remaining_fields]).rename(columns=column_names)

    return ordered.dropna(axis=0)


# One table per metric: (date column, prefix for the renamed id columns)
apps, acc = (
    splitup_date_field(perf_table, multi_indices, metric_date_col, metric_prefix)
    for metric_date_col, metric_prefix in (
        ("created_at", "applications"),
        ("date_matched", "accepted"),
    )
)

# Stack both metric tables into the frame that gets uploaded
final = pd.concat([apps, acc])

### Push it to Google Sheets

In [10]:
# Credentials from service account file for Google Sheets
# NOTE(review): read_env_variables is neither imported nor defined in the
# visible part of this notebook — confirm where it comes from.
config_vars = read_env_variables()
print("Creating temporary file for service account credentials...")

# Read the secret first so a missing key surfaces as a plain KeyError instead
# of being masked by a later authorization failure.
access_creds = config_vars['GOOGLE_CREDS']

# Authorize only after the credentials were written successfully. The context
# manager closes the temporary file even if writing or authorizing raises.
with tempfile.NamedTemporaryFile() as temp:
    # presumably decodes the base64 string into the file at temp.name —
    # helper comes from the project's utils module; verify there
    write_base64str_obj_to_file(access_creds, temp.name)
    gc = pygsheets.authorize(service_file=temp.name)

print("Uploading to Google Sheets...")
workbook = gc.open_by_key(config_vars["SPREADSHEET_ID"])
worksheet = workbook.worksheet_by_title(config_vars["SHEET_NAME"])

# Write the combined table starting at cell A1, including the header row
worksheet.set_dataframe(final, start='A1', copy_head=True)
print("Done!")

Creating temporary file for service account credentials...
Uploading to Google Sheets...
Done!
