# In [3]:
#import libraries
from datetime import *
import ssl

from dateutil.relativedelta import *
from elasticsearch import Elasticsearch
import numpy as np
import pandas as pd

ES_SERVER = "https://es-cdr-prod-client.dialogtech.com"

# ES_SERVER is an https URL, and the client raises ImproperlyConfigured when it
# has no root certificates to validate the server against. Hand it the
# interpreter's default CA bundle through ca_certs when one exists on this
# machine; cafile is None otherwise, in which case the client is built exactly
# as before.
_ca_bundle = ssl.get_default_verify_paths().cafile
if _ca_bundle:
    es = Elasticsearch(ES_SERVER, ca_certs=_ca_bundle)
else:
    es = Elasticsearch(ES_SERVER)

# Training data; its CALLID column supplies the call ids that are looked up in
# Elasticsearch.
# NOTE(review): the CALLID values are sliced like strings further down --
# confirm pandas reads this column as text rather than as integers.
davey_tree_csv = pd.read_csv('./data/DaveyTree.csv')
sids = davey_tree_csv['CALLID'].tolist()

#get_durations adds up the 'duration' fields of a list of call segments;
#count_switches uses the in and out details of a phone call to count the number of times
#the call switches from one end to the other.
def get_durations(js):
    """Return the total of the ``duration`` fields in a list of segments.

    Args:
        js: Iterable of mappings, each holding a ``duration`` value that
            ``float()`` accepts (a number or a numeric string). ``None`` is
            treated the same as an empty list.

    Returns:
        The summed duration as a float; ``0.0`` when there are no segments.

    Raises:
        KeyError: If a segment has no ``duration`` key.
        ValueError: If a ``duration`` value cannot be parsed as a number.
    """
    if js is None:
        return 0.0
    # Start from 0.0 so the result is a float even for an empty list.
    return sum((float(entry['duration']) for entry in js), 0.0)

def count_switches(js):
    """Count the switches between the ``in_detail`` and ``out_detail`` segments.

    Two cursors walk the segment lists side by side, comparing the ``offset``
    of the current segment on each side (converted with ``float()``). A flag
    records which side is being waited on; every time the flag flips the
    switch count grows by one.

    Args:
        js: Mapping with ``in_detail`` and ``out_detail`` lists whose items
            each carry an ``offset`` value.

    Returns:
        The number of switches found, as an int.
    """
    outbound = js['out_detail']
    inbound = js['in_detail']
    # Both cursors are capped by the length of the shorter list.
    limit = min(len(inbound), len(outbound))

    in_pos = out_pos = switches = 0
    waiting_on_in = True
    while in_pos < limit and out_pos < limit:
        in_offset = float(inbound[in_pos]['offset'])
        out_offset = float(outbound[out_pos]['offset'])
        if waiting_on_in:
            if out_offset > in_offset:
                # The in-segment starts first: count a switch, move past it.
                waiting_on_in = False
                in_pos += 1
                switches += 1
            else:
                out_pos += 1
        elif in_offset > out_offset:
            # The out-segment starts first: count a switch, move past it.
            waiting_on_in = True
            out_pos += 1
            switches += 1
        else:
            in_pos += 1
    return switches

# Terms query: match every document whose "sid" is one of the CALLIDs.
data = {"query": {"terms": {"sid": sids}}}

# Each CALLID maps to an index named 'alias_cdr_calls_20' followed by the
# first four characters of the id (presumably a YYMM prefix -- confirm).
# Every distinct index name is kept once, in first-seen order.
es_indexes = []
seen_indexes = set()
for sid in sids:
    index = 'alias_cdr_calls_20{}'.format(sid[:4])
    if index in seen_indexes:
        continue
    seen_indexes.add(index)
    es_indexes.append(index)


#The result set is large (the original note cites 5000 entries), so it is
#fetched with scan and scroll: the initial search only supplies the scroll id
#and the total hit count, and every batch of hits is read through es.scroll().
#The scroll parameter tells Elasticsearch how long it should keep the scroll open.
#NOTE(review): search_type='scan' was removed in Elasticsearch 5.0 -- confirm
#the cluster this runs against still accepts it.
res = es.search(
    index=es_indexes,
    scroll='5m',
    search_type='scan',
    size=5,
    body=data,
)
scroll_id = res['_scroll_id']
scroll_size = res['hits']['total']

#Parallel columns, one entry appended per hit. Note that `sids` is rebound
#here: from now on it holds the ids returned by Elasticsearch.
sids = []
day_of_week = []
date_added_local = []
call_duration_seconds = []
in_detail = []
out_detail = []
switches = []

while scroll_size > 0:
    print("Scrolling...")
    res = es.scroll(scroll_id=scroll_id, scroll='5m')
    hits = res['hits']['hits']

    for hit in hits:
        source = hit['_source']
        sids.append(source['sid'])
        date_added_local.append(source['date_added_local'])
        day_of_week.append(source['day_of_week'])
        call_duration_seconds.append(source['call_duration_seconds'])

        #Speech details are reduced to total durations and a switch count.
        speech = source['speech']
        in_total = get_durations(speech['in_detail'])
        out_total = get_durations(speech['out_detail'])
        in_detail.append(in_total)
        out_detail.append(out_total)
        switches.append(count_switches(speech))

    #Carry the scroll id forward; an empty batch ends the loop.
    scroll_id = res['_scroll_id']
    scroll_size = len(hits)
    print("scroll size: " + str(scroll_size) + "\n")
print("End of scroll reached\n")

#create dataframe: one row per Elasticsearch hit, built from the parallel lists
df = pd.DataFrame({
    'CALLID': sids,
    'date': date_added_local,
    'day_of_week': day_of_week,
    'call_duration_seconds': call_duration_seconds,
    'in_detail': in_detail,
    'out_detail': out_detail,
    'switches': switches,
})

print(df)

#create a csv out of the dataframe
#index=False keeps the row index out of the file; otherwise it is read back as
#an extra 'Unnamed: 0' column and ends up in the merged output.
df.to_csv('data/voicemail_data.csv', index=False)

#NOTE(review): the frame is re-read from disk instead of being merged directly,
#presumably so CALLID gets the same type inference as the training CSV --
#confirm before removing the round trip.
voicemail_csv = pd.read_csv('./data/voicemail_data.csv')
print(voicemail_csv)
print(davey_tree_csv)

#merge the csv with the training data and the newly created dataframe by merging on the 'CALLID'
#how='left' keeps every fetched call, matched or not, and adds the training columns to it
merged = pd.merge(voicemail_csv, davey_tree_csv, on='CALLID', how='left')
merged.to_csv("output.csv", index=False)

print(merged)

# Recorded output of this cell:
# ImproperlyConfigured: Root certificates are missing for certificate validation. Either pass them in using the ca_certs parameter or install certifi to use it automatically.