In [1]:
import random
from datetime import datetime as dt, timedelta, timezone

import numpy as np
from faker import Faker


In [2]:
n = 1000 # number of tickets

# Seed both random number generators used by the cells below so that
# Restart Kernel -> Run All reproduces the same generated data.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

## Fields

Allowed values for the categorical ticket fields (status, note type, source and priority).

In [3]:
# Ticket statuses, as listed in the assessment details
status_list = [
    "Open",
    "Closed",
    "Resolved",
    "Waiting for Customer",
    "Waiting for Third Party",
    "Pending",
]

In [4]:
# Lookup tables (numeric code -> label) taken from the Freshdesk documentation:
# https://support.freshdesk.com/en/support/solutions/articles/226460-export-ticket-activities-from-your-helpdesk

# Note types are numbered from 0.
activity_note_types = dict(enumerate([
    "Reply",
    "Forward",
    "Reply_to_forward",
    "Private_note",
    "Public_note",
    "Phone_note",
    "Broadcast_note",
]))

# Sources are numbered from 1.
activity_sources = dict(enumerate([
    "Email",
    "Portal",
    "Phone",
    "Forums",
    "Twitter",
    "Facebook",
    "Chat",
    "Mobihelp",
    "Feedback widget",
    "Outbound email",
    "E-commerce",
    "Bot",
], start=1))

# Priorities are numbered from 1.
activity_priorities = dict(enumerate(["Low", "Medium", "High", "Urgent"], start=1))

## Functions

In [5]:
def contact_customer_check(status):
    '''
    Decide whether the customer is flagged as contacted for a given status.

    Returns True for "Closed", "Resolved" and "Waiting for Customer";
    for any other status the result is a random True/False.
    '''
    always_contacted = ("Closed", "Resolved", "Waiting for Customer")
    return True if status in always_contacted else random.choice([True, False])

In [6]:
def generate_ticket_ids(n, max_repeat):
    '''
    Return a numpy array of n ticket ids drawn at random from 1..n.

    Ticket ids may repeat, but no id appears more than max_repeat times.

    Parameters
    ----------
    n : int
        Number of ticket ids to generate; also the largest possible id.
    max_repeat : int
        Maximum number of times a single id may appear (must be >= 1).

    Returns
    -------
    numpy.ndarray
        1-D integer array of length n.

    Raises
    ------
    ValueError
        If n is negative or max_repeat is smaller than 1.
    '''
    if n < 0:
        raise ValueError("n must be non-negative")
    if max_repeat < 1:
        raise ValueError("max_repeat must be at least 1")
    if n == 0:
        return np.array([], dtype=int)

    ticket_ids_options = np.arange(1, n + 1)
    ticket_ids = np.random.choice(ticket_ids_options, size=n)

    while True:
        # unique ids currently in use and how often each one occurs
        ticket_uniques, unique_counts = np.unique(ticket_ids, return_counts=True)

        # done as soon as no id occurs more than max_repeat times
        if unique_counts.max() <= max_repeat:
            return ticket_ids

        # ids that occur too often and need some occurrences re-drawn
        over_limit = ticket_uniques[unique_counts > max_repeat]

        # ids that are already at (or over) the limit must not be drawn again.
        # Filter by VALUE: after the first pass the pool has shrunk, so an id
        # no longer sits at position id - 1 and positional deletion would
        # remove the wrong ids or index out of bounds.
        saturated = ticket_uniques[unique_counts >= max_repeat]
        ticket_ids_options = ticket_ids_options[~np.isin(ticket_ids_options, saturated)]

        # keep the first max_repeat occurrences of each over-limit id and
        # collect the positions of the surplus occurrences
        surplus_positions = np.concatenate(
            [np.flatnonzero(ticket_ids == value)[max_repeat:] for value in over_limit]
        )

        # re-draw the surplus positions from the remaining pool; a re-draw may
        # push another id over the limit, which the next pass then repairs
        for i in surplus_positions:
            ticket_ids[i] = np.random.choice(ticket_ids_options)
    
        
        


## Data Generation

### Metadata

In [7]:
#============= generate metadata =============
# Take ONE timestamp, in UTC, so that
#   * the "+0000" offset written into the strings is truthful, and
#   * start_at and end_at are exactly one day apart.
extract_time = dt.now(timezone.utc) # assume extract ticket at set time
timestamp_format = "%d-%m-%Y %H:%M:%S %z" # %z renders the UTC offset as +0000
metadata = {"metadata": {"start_at": dt.strftime(extract_time - timedelta(days=1), timestamp_format),
                         "end_at": dt.strftime(extract_time, timestamp_format),
                         'activities_count': n}}
metadata

{'metadata': {'start_at': '27-11-2022 11:14:01 +0000',
  'end_at': '28-11-2022 11:14:01 +0000',
  'activities_count': 1000}}

### Activities_data

In [8]:
# Smoke test for generate_ticket_ids: 100 ids, each allowed at most 4 times.
ticket_ids_test = generate_ticket_ids(100, 4)
test_uniques, test_counts = np.unique(ticket_ids_test, return_counts=True)

# the requested number of ids is returned ...
assert ticket_ids_test.size == 100
# ... and no id exceeds the repeat limit
assert (test_counts <= 4).all()


In [9]:
#============= generate performed_at time =============
# Restored from commented-out code: the shipment_date cell further down reads
# performed_at_dt, so leaving this disabled makes Run All fail with a NameError.
# random time must be between start and end time in metadata
fake = Faker()
end_date_dt = dt.strptime(metadata['metadata']['end_at'],"%d-%m-%Y %H:%M:%S %z")
start_date_dt = dt.strptime(metadata['metadata']['start_at'],"%d-%m-%Y %H:%M:%S %z")
performed_at_dt = [fake.date_time_between(start_date=start_date_dt, end_date=end_date_dt) for i in range(0,n)]
# same string layout as the metadata timestamps
performed_at = [" ".join([dt.strftime(perform_dt,"%d-%m-%Y %H:%M:%S"),"+0000"]) for perform_dt in performed_at_dt]

#============= generate ticket ids =============
# a ticket id may repeat at most once per available status
ticket_ids = generate_ticket_ids(n,len(status_list))

#============= generate performer type and ids =============
performer_type = ['user']*n
performer_id_list = list(range(149015,149015 + 5)) # assume 5 agents
performer_id = [random.choice(performer_id_list) for i in range(0,n)]


In [13]:
# for tickets sharing the same id, allocate non-repeated statuses

ticket_uniques, ticket_counts = np.unique(ticket_ids, return_counts=True)

# ticket ids that appear more than once
ticket_dupes = ticket_uniques[ticket_counts > 1]

# for each repeated id, draw as many distinct statuses as it has occurrences
status_dupes = dict(
    (ticket_id, random.sample(status_list, count))
    for ticket_id, count in zip(ticket_dupes, ticket_counts[ticket_counts > 1])
)

# for ticket_ids appearing only once


{1: ['Waiting for Third Party', 'Waiting for Customer'],
 9: ['Waiting for Customer', 'Waiting for Third Party'],
 13: ['Open', 'Closed'],
 22: ['Closed', 'Waiting for Customer', 'Waiting for Third Party'],
 27: ['Closed', 'Open', 'Resolved', 'Pending'],
 31: ['Resolved', 'Pending'],
 32: ['Open', 'Waiting for Third Party', 'Resolved', 'Waiting for Customer'],
 35: ['Pending', 'Open'],
 36: ['Closed', 'Resolved'],
 39: ['Pending', 'Waiting for Third Party'],
 48: ['Waiting for Third Party', 'Pending', 'Closed'],
 49: ['Resolved', 'Closed'],
 50: ['Resolved', 'Open', 'Closed'],
 52: ['Waiting for Third Party', 'Waiting for Customer'],
 53: ['Pending', 'Waiting for Third Party'],
 56: ['Pending', 'Closed', 'Open', 'Resolved'],
 63: ['Waiting for Third Party', 'Waiting for Customer'],
 64: ['Open', 'Waiting for Third Party'],
 77: ['Waiting for Third Party', 'Open'],
 89: ['Resolved', 'Waiting for Customer'],
 91: ['Open', 'Pending'],
 94: ['Pending', 'Waiting for Customer', 'Waiting for 

In [None]:
# Intended rules (not enforced by the line below, which draws each status independently):
#   - statuses of the same ticket id cannot be the same
#   - closed must be the latest one

status = [random.choice(status_list) for _ in range(n)]



  |-- agent_id = performer_id,
  issue_type:,
  note: ,
  |-- id: ,
  |-- type: ,
  priority: ,
  requester

In [None]:
# one contact flag per generated status
contact_customer = list(map(contact_customer_check, status))

shipping_address = ['N/A' for _ in range(n)]
# source codes are drawn from 1..12 (the original note assumed 5 sources; meaning still unclear)
source = [random.randint(1, 12) for _ in range(n)]

In [None]:
# Create a Faker generator instance (no Faker seed is set in the visible cells).
fake = Faker()

In [None]:
# shipment date is assumed to fall on the same day as the performed date
shipment_date = [dt.strftime(performed, "%d %b, %Y") for performed in performed_at_dt]

#============= generate group =============
group_list = ['refund', 'exchange', 'missing', 'delivery delay']
group = [random.choice(group_list) for _ in range(n)]

#============= generate category and product fields =============
# category -> products that belong to it
products = dict(
    phone=['mobile', 'landline phone'],
    tablets=['Apple', 'Samsung'],
    computer=['laptop', 'desktop PC'],
    headphones=['earphones', 'headphones'],
)
# pick a random category per ticket, then a random product within that category
category = [random.choice(list(products)) for _ in range(n)]
product = [random.choice(products[chosen]) for chosen in category]

In [None]:
# Intended rule (not implemented in this cell): do not show a priority when the status is Closed.
# NOTE(review): the values are presumably priority codes from activity_priorities
# (1=Low ... 4=Urgent) — confirm what this particular list represents.
priority_list = [4,4,1,3]
