In [1]:
import random
from datetime import timedelta, timezone, datetime as dt

import numpy as np
from faker import Faker


In [2]:
n = 1_000  # number of ticket activities to generate (also the largest ticket id)

In [3]:
# Seed every random source used in this notebook (stdlib random, numpy, Faker)
# so that Restart & Run All draws the same random values each time.
# Values derived from the current time (e.g. the metadata window) still vary.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
Faker.seed(SEED)

# Shared Faker instance used by the date helpers below
fake = Faker()

## Fields

Allowed values for several of the generated ticket and activity fields.

In [4]:
# Ticket statuses, as listed in the assessment details
status_list = [
    "Open",
    "Closed",
    "Resolved",
    "Waiting for Customer",
    "Waiting for Third Party",
    "Pending",
]

In [5]:
# Property codes and labels, as per the Freshdesk website:
# https://support.freshdesk.com/en/support/solutions/articles/226460-export-ticket-activities-from-your-helpdesk

# Note types are numbered from 0
activity_note_types = dict(enumerate([
    "Reply",
    "Forward",
    "Reply_to_forward",
    "Private_note",
    "Public_note",
    "Phone_note",
    "Broadcast_note",
]))

# Sources are numbered from 1
activity_sources = dict(enumerate([
    "Email",
    "Portal",
    "Phone",
    "Forums",
    "Twitter",
    "Facebook",
    "Chat",
    "Mobihelp",
    "Feedback widget",
    "Outbound email",
    "E-commerce",
    "Bot",
], start=1))

# Priorities are numbered from 1
activity_priorities = dict(enumerate(["Low", "Medium", "High", "Urgent"], start=1))

In [6]:
# Product categories mapped to the products available within each category
products = dict(
    phone=['mobile', 'landline phone'],
    tablets=['Apple', 'Samsung'],
    computer=['laptop', 'desktop PC'],
    headphones=['earphones', 'headphones'],
)

# Issue types mapped to the groups an issue of that type can be assigned to
issue_types = dict(
    Pre_Sale_Question=['question'],
    Order_Question=['question'],
    Return=['refund', "store credit", "exchange"],
    Shipping=['missing', 'delivery delay'],
    Vendor=['stock', 'price'],
    Accounts=['payment', 'address', 'login', 'registration'],
    Product_Availability=['low', 'not available'],
)

# One priority code per issue type, in the key order of issue_types
# (get_priority looks the value up by position).
# NOTE(review): this rebinds activity_priorities, replacing the code -> label
# dict defined in the previous cell.
activity_priorities = [1, 1, 2, 4, 4, 3, 1]

performer_ids = np.arange(149015, 149020)  # assume there are 5 agents
requesters = np.arange(1, n + 1)


## Functions

In [7]:
def contact_customer_check(status):
    '''
    Return whether the customer was contacted for an activity with this status.

    "Closed", "Resolved" and "Waiting for Customer" always yield True;
    any other status yields a random True/False.
    '''
    always_contacted = ("Closed", "Resolved", "Waiting for Customer")
    if status not in always_contacted:
        return random.choice([True, False])
    return True

In [8]:
def generate_ticket_ids(n, max_repeat):
    '''
    Return a numpy array of n ticket ids drawn at random from 1..n.

    A ticket id may appear several times, but never more than max_repeat times.

    Parameters
    ----------
    n : int
        Number of ids to generate; ids are sampled from the range 1..n.
    max_repeat : int
        Maximum number of occurrences allowed for a single id (at least 1).

    Returns
    -------
    numpy.ndarray
        Integer array of length n (empty when n <= 0).

    Raises
    ------
    ValueError
        If max_repeat is smaller than 1.
    '''
    if max_repeat < 1:
        raise ValueError("max_repeat must be at least 1")
    if n <= 0:
        return np.arange(0)

    ticket_ids_options = np.arange(1, n + 1)
    ticket_ids = np.random.choice(ticket_ids_options, size=n)  # initial draw, with replacement

    while True:
        ticket_uniques, unique_counts = np.unique(ticket_ids, return_counts=True)
        if unique_counts.max() <= max_repeat:
            return ticket_ids

        # Ids that already fill their quota must not be drawn again.
        # Remove them BY VALUE: after the first pass the pool has shrunk, so
        # "value - 1" is no longer the position of that value in the pool.
        exhausted = ticket_uniques[unique_counts >= max_repeat]
        ticket_ids_options = np.setdiff1d(ticket_ids_options, exhausted)

        # For every id over the limit keep its first max_repeat occurrences
        # and redraw the surplus ones from the remaining pool.
        over_limit = ticket_uniques[unique_counts > max_repeat]
        surplus_idx = np.concatenate(
            [np.flatnonzero(ticket_ids == x)[max_repeat:] for x in over_limit]
        )
        ticket_ids[surplus_idx] = np.random.choice(ticket_ids_options, size=len(surplus_idx))
    

In [9]:
def generate_status(ticket_ids,status_list):
    '''
    Return a numpy object array of statuses, one per entry of ticket_ids.

    Entries sharing a ticket id receive distinct statuses (sampled without
    replacement from status_list); an id that appears once receives a single
    random status.

    Parameters
    ----------
    ticket_ids : numpy.ndarray
        1-D array of ticket ids, possibly with repeats.
    status_list : list of str
        Statuses to choose from.

    Raises
    ------
    ValueError
        If an id occurs more often than there are statuses to choose from.
    '''
    status = np.empty(shape=ticket_ids.shape, dtype=object)
    ticket_uniques, ticket_counts = np.unique(ticket_ids, return_counts=True)

    # Handle one ticket at a time, so inputs with no repeated ids, no single
    # ids, or no ids at all need no special casing.
    for ticket, count in zip(ticket_uniques, ticket_counts):
        idx = np.where(ticket_ids == ticket)[0]
        if count > 1:
            if count > len(status_list):
                raise ValueError(
                    "ticket id occurs more often than there are distinct statuses"
                )
            # repeated id: statuses must not repeat within the ticket
            status[idx] = random.sample(status_list, int(count))
        else:
            # id appearing only once: any status will do
            status[idx] = np.random.choice(status_list)

    return status

In [10]:
def get_priority(issue_type):
    '''
    Look up the priority of a ticket from its issue type.

    The position of issue_type among the keys of issue_types selects the
    entry of activity_priorities to return.
    '''
    issue_names = list(issue_types)
    position = issue_names.index(issue_type)
    return activity_priorities[position]

In [11]:
def transform_unique_array(unique_field,ticket_id):
    '''
    Expand an array of per-unique-ticket values to one value per entry of ticket_id.

    Parameters
    ----------
    unique_field : numpy.ndarray
        unique_field[j] is the value for the j-th distinct ticket id, in the
        sorted order returned by np.unique(ticket_id).
    ticket_id : numpy.ndarray
        1-D array of ticket ids, possibly with repeats.

    Returns
    -------
    numpy.ndarray
        Array shaped like ticket_id, with the dtype of unique_field.
    '''
    # inverse[i] is the position of ticket_id[i] among the sorted unique ids,
    # so a single fancy-index lookup replaces a scan per unique ticket
    _, inverse = np.unique(ticket_id, return_inverse=True)
    return unique_field[np.reshape(inverse, np.shape(ticket_id))]

In [12]:
def fake_time_long_format(start_date_dt, end_date_dt):
    '''
    Return a fake date between input start date and end date.
    Both inputs must be of type datetime
    Output in string, format e.g. '28-11-2022 12:05:37 +0000'

    The ' +0000' suffix is appended literally; no timezone conversion is
    performed on the generated value here.
    '''
    # Reuse the notebook-level Faker instance: building a new Faker() on every
    # call is needless work when this helper runs once per activity.
    fake_date_dt = fake.date_time_between(start_date=start_date_dt, end_date=end_date_dt)
    return fake_date_dt.strftime("%d-%m-%Y %H:%M:%S") + " +0000"

In [13]:
# function to create timestamp for each ticket
def get_time_from_status(status,metadata):
    '''
    Generate one timestamp string per status of a single ticket.

    Parameters
    ----------
    status : numpy.ndarray
        Statuses of one ticket's activities.
    metadata : dict
        Must provide metadata['metadata']['start_at'] and ['end_at'] as
        strings in the format '%d-%m-%Y %H:%M:%S %z'.

    Returns
    -------
    numpy.ndarray
        Object array shaped like status holding timestamp strings.
        "Closed" is drawn between start_at + 6 hours and end_at; "Resolved"
        between start_at + 2 hours and the Closed time (or end_at); all other
        statuses between start_at and the earliest of those timestamps.
    '''
    time_format = "%d-%m-%Y %H:%M:%S %z"

    # generate empty array of same shape as status
    output = np.empty(shape = status.shape,dtype = object)

    # convert metadata start and end dates to datetime format
    start_date_dt = dt.strptime(metadata['metadata']['start_at'], time_format)
    end_date_dt = dt.strptime(metadata['metadata']['end_at'], time_format)

    # Terminal statuses, latest first. Each one narrows the upper bound for
    # everything generated after it, so Closed >= Resolved >= other statuses.
    for terminal_status, min_hours in (("Closed", 6), ("Resolved", 2)):
        idx = np.where(status == terminal_status)[0]
        if len(idx) == 0:
            continue
        earliest = start_date_dt + timedelta(hours = min_hours)
        for i in idx:
            output[i] = fake_time_long_format(earliest, end_date_dt) # string format
        end_date_dt = min(dt.strptime(output[i], time_format) for i in idx)

    # for other status, generate a random timestamp per row (not per distinct
    # status value), so repeated statuses cannot cause a length mismatch
    other_idx = np.where((status != "Closed") & (status != "Resolved"))[0]
    for i in other_idx:
        output[i] = fake_time_long_format(start_date_dt, end_date_dt)

    return output
        

In [14]:
def get_single_ship_date(metadata):
    '''
    Return a fake shipment date as a string formatted like '27 Nov,2022'.

    The date is drawn from a window that ends at metadata['metadata']['start_at']
    and starts 0 to 13 whole days before it.
    '''
    end_date = dt.strptime(metadata['metadata']['start_at'],"%d-%m-%Y %H:%M:%S %z")
    # Anchor the lower bound on end_date rather than on "now": a bound of
    # "N days before now" can fall after start_at (e.g. N = 0), which inverts
    # the window.
    days_back = int(np.random.randint(0, 14))
    start_date = end_date - timedelta(days=days_back)
    ship_date = fake.date_time_between(start_date=start_date, end_date=end_date)
    return ship_date.strftime('%d %b,%Y')

## Data Generation

### Metadata

In [15]:
#============= generate metadata =============
# Read the clock once, in UTC: the strings below carry a +0000 offset, so they
# must be built from UTC time rather than naive local time, and start_at and
# end_at must be exactly one day apart.
extract_time = dt.now(timezone.utc)  # assume extract ticket at set time
metadata = {"metadata": {"start_at": (extract_time - timedelta(days = 1)).strftime("%d-%m-%Y %H:%M:%S %z"),
                         "end_at" : extract_time.strftime("%d-%m-%Y %H:%M:%S %z"),
                         'activities_count' : n}}
metadata

{'metadata': {'start_at': '27-11-2022 22:19:17 +0000',
  'end_at': '28-11-2022 22:19:17 +0000',
  'activities_count': 1000}}

### Activities_data

#### Generate Ticket IDs and Associated Status

In [16]:
# Smoke test: 100 ids requested, no id allowed more than 4 times
ticket_ids_test = generate_ticket_ids(n=100, max_repeat=4)
test_uniques, test_counts = np.unique(ticket_ids_test, return_counts=True)
assert ticket_ids_test.size == 100
assert max(test_counts) <= 4


In [17]:
#============= generate ticket ids =============
# a ticket may repeat at most once per available status
ticket_id = generate_ticket_ids(n, max_repeat=len(status_list))
ticket_uniques, unique_counts = np.unique(ticket_id, return_counts=True)
# row positions of each unique ticket, listed in ticket_uniques order
idx_ticket_ids = [np.flatnonzero(ticket_id == unique_id) for unique_id in ticket_uniques]

#============= generate ticket status =============
status = generate_status(ticket_ids=ticket_id, status_list=status_list)


In [18]:
#============= generate performed_at timestamp =============
# get array of status for each unique ticket
status_p_ticket = [status[idx] for idx in idx_ticket_ids] # list of arrays
# one timestamp per activity, concatenated ticket by ticket in idx_ticket_ids order
timestamps_p_ticket = np.concatenate([get_time_from_status(x,metadata) for x in status_p_ticket])

# timestamps_p_ticket is per ACTIVITY, not per unique ticket, so it must be
# scattered back through the same row positions it was gathered from.
# Expanding it with transform_unique_array would give every row of a ticket
# the same, unrelated timestamp.
performed_at = np.empty(shape=ticket_id.shape, dtype=object)
performed_at[np.concatenate(idx_ticket_ids)] = timestamps_p_ticket

#### Generate generic fields for each unique ticket ID

For activities that share a ticket_id, the following fields must be identical, so they are generated once per unique ticket:
["ticket_id","performer_type","performer_id","shipping_address","shipment_date",
 "category","issue_type","source","priority","group","agent_id","requester","product"]

In [19]:
# Draw one value per unique ticket id for every field that is constant within a ticket
n_unique_tickets = len(ticket_uniques)

performer_type = np.array(['user'] * n_unique_tickets)
assert len(performer_type) == n_unique_tickets
performer_id = np.random.choice(performer_ids, size=n_unique_tickets)
assert len(performer_id) == n_unique_tickets
shipping_address = np.array(['N/A'] * n_unique_tickets)
category = np.random.choice(list(products), size=n_unique_tickets)
issue_type = np.random.choice(list(issue_types), size=n_unique_tickets)
assert len(issue_type) == n_unique_tickets
# source codes 1..12 (upper bound of randint is exclusive)
source = np.random.randint(1, 13, size=n_unique_tickets)
assert len(source) == n_unique_tickets
# priority is determined by the issue type
priority = np.array([get_priority(name) for name in issue_type])
assert len(priority) == n_unique_tickets
# group is picked among the groups listed for the issue type
group = np.concatenate([np.random.choice(issue_types[name], 1) for name in issue_type])
assert len(group) == n_unique_tickets
agent_id = performer_id
requester = np.random.choice(requesters, size=n_unique_tickets)
assert len(requester) == n_unique_tickets
# product is picked among the products listed for the category
product = np.concatenate([np.random.choice(products[name], 1) for name in category])
assert len(product) == n_unique_tickets
# only 'Return' and 'Shipping' tickets get a shipment date; the others stay None
shipment_date_p_ticket = np.empty(ticket_uniques.shape, object)
idx = np.where((issue_type == 'Return') | (issue_type == 'Shipping'))[0]
shipment_date_p_ticket[idx] = [get_single_ship_date(metadata) for _ in idx]
assert len(shipment_date_p_ticket) == n_unique_tickets


In [20]:
# Expand every per-unique-ticket field to one value per activity row.
# The names below are rebound in place, so running this cell a second time
# would feed the already-expanded arrays back in and silently misalign the
# values. Fail loudly instead.
assert all(
    len(field) == len(ticket_uniques)
    for field in (performer_type, performer_id, shipping_address, category, issue_type,
                  source, priority, group, agent_id, requester, product)
), "per-ticket fields were already expanded - re-run the field generation cell first"

performer_type = transform_unique_array(performer_type, ticket_id)
performer_id = transform_unique_array(performer_id, ticket_id)
shipping_address = transform_unique_array(shipping_address, ticket_id)
category = transform_unique_array(category, ticket_id)
issue_type = transform_unique_array(issue_type, ticket_id)
source = transform_unique_array(source, ticket_id)
priority = transform_unique_array(priority, ticket_id)
group = transform_unique_array(group, ticket_id)
agent_id = transform_unique_array(agent_id, ticket_id)
requester = transform_unique_array(requester, ticket_id)
product = transform_unique_array(product, ticket_id)
shipment_date = transform_unique_array(shipment_date_p_ticket, ticket_id)


In [25]:
# Per activity: was the customer contacted? Derived from that activity's status.
contacted_customer = np.array(list(map(contact_customer_check, status)))

In [21]:
#============ check if data generated successfully ============
# every expanded field must hold exactly one value per activity row
assert all(
    len(field) == len(ticket_id)
    for field in (
        performer_id,
        shipping_address,
        category,
        issue_type,
        performer_type,
        source,
        priority,
        group,
        agent_id,
        requester,
        product,
        shipment_date,
    )
)

In [29]:
# [performed_at,ticket_id ,performer_type,performer_id,shipping_address,shipment_date,
# category,contacted_customer,issue_type,source,status,priority,group,agent_id,requester,product]

In [27]:
# Assemble one record per activity row: ticket-level keys at the top level,
# the remaining fields nested under "activity"
activities_data = []
for i in range(len(ticket_id)):
    activity_fields = dict(
        shipping_address=shipping_address[i],
        shipment_date=shipment_date[i],
        category=category[i],
        contacted_customer=contacted_customer[i],
        issue_type=issue_type[i],
        source=source[i],
        status=status[i],
        priority=priority[i],
        group=group[i],
        agent_id=agent_id[i],
        requester=requester[i],
        product=product[i],
    )
    record = dict(
        performed_at=performed_at[i],
        ticket_id=ticket_id[i],
        performer_type=performer_type[i],
        performer_id=performer_id[i],
        activity=activity_fields,
    )
    activities_data += [record]


In [None]:
# for each activity data:
# drop shipment date if it's none
# drop a number of fields
# change a few to notes