In [1]:
import pandas as pd
import numpy as np
import random
from faker import Faker
from entity_generation import generate_business_entities
from datetime import datetime, timedelta
from tqdm import tqdm
import hashlib

### Load the core data

In [2]:
# Load the food item sheet. Later cells read its Food, Category and Supply_Chain columns.
ftl_df = pd.read_excel('ftl_items.xlsx', sheet_name='Sheet1')
ftl_df

Unnamed: 0,Food,Category,Supply_Chain
0,cilantro,Herbs (fresh),Farmed
1,parsley,Herbs (fresh),Farmed
2,basil,Herbs (fresh),Farmed
3,arugula,Leafy greens (fresh),Farmed
4,baby leaf,Leafy greens (fresh),Farmed
...,...,...,...
292,fuji apples,Fruit,Farmed
293,granny smith apples,Fruit,Farmed
294,blackberries,Fruit,Farmed
295,blueberries,Fruit,Farmed


In [3]:
# Build the entity table, then give every entity a GLN made of its company prefix,
# a dot, and a random 5-digit suffix.
entities_df = generate_business_entities()
# np.random.randint excludes the upper bound, so suffixes fall in 10000..99998.
endings = np.random.randint(10000,99999,size=len(entities_df))
glnList = entities_df['companyPrefix'].values
entities_df['gln']=list(map(lambda prefix, ending: prefix + '.' + str(ending), glnList, endings))
entities_df

Unnamed: 0,businessType,businessName,primaryPhone,streetAddress,city,state,zip,companyPrefix,sizeWeight,gln
0,farm,Lucky Acres Farm,(685)589-1977,5839 Miller Knoll Apt. 647,Lake Claire,VT,15813,0978263,0.15,0978263.55869
1,processor,Global Provisions,630-098-9742,1641 Smith Flats,Port Jeffrey,CA,55846,1095844,0.15,1095844.16138
2,landBasedReceiver,Coastal Maritime,0317805125,18277 Anna Flats Suite 025,North Kimberly,MD,29947,1277599,0.15,1277599.54616
3,seafoodFarm,Blue Aquaculture,(191)502-4296,064 Gross Hill,Scottmouth,OR,95478,1081933,0.05,1081933.34224
4,packaging,Bio Containers,693-115-0356,27375 Melissa Parks Suite 010,New Charles,CT,48089,0854180,0.05,0854180.77779
...,...,...,...,...,...,...,...,...,...,...
9995,packaging,Innovative Solutions,4337741768,147 Ryan Camp,Lake Cheryltown,TX,19195,0836686,0.05,0836686.60125
9996,groceryNoTransform,Fine Bazaar,(941)490-6698,99239 Hinton Shore Suite 051,North Maria,GA,62773,1049326,0.05,1049326.82591
9997,grocery,Delicious Store,5821198730,65778 Bradley Forest,Finleymouth,DC,58390,1027220,0.05,1027220.68907
9998,seafoodFarm,Marine Aquaculture,671-981-2791,27098 Caitlin Court Suite 358,Lake Brendanton,RI,11404,0847217,0.15,0847217.61716


# Data Generation Functions

### Generate Supply Chain

In [6]:
def generate_supply_chain(ftl_item):
    """Build a randomized, ordered list of supply chain node types for one food item.

    Parameters
    ----------
    ftl_item : pandas.DataFrame
        A single-row frame (for example the result of ``ftl_df.sample()``). Only the
        first value of its ``Supply_Chain`` and ``Category`` columns is read.

    Returns
    -------
    list of str
        Node type names in the order the product moves through them. Besides the
        business types, the list can contain the markers 'fieldPacked' and 'stop'.
        An item that matches none of the routes below yields an empty list rather
        than failing on ``chain[-1]``.
    """
    #Initialize the supply chain variables
    chain = []

    farm = 'farm'
    field_packed = 'fieldPacked'
    packaging_processor = 'packaging'
    food_processor = 'processor'
    kill = 'stop'
    lbr = 'landBasedReceiver'
    direct_sale = 'directToConsumer'
    seaFarm = 'seafoodFarm'
    agg = 'aggregator'
    rest = 'restaurant'
    grocNoFood = 'groceryNoTransform'
    grocFood = 'grocery'
    dist = 'distributor'
    whole = 'wholesaler'

    # Read the two routing attributes once instead of on every comparison.
    supply_chain = ftl_item.Supply_Chain.values[0]
    category = ftl_item.Category.values[0]

    #TODO: Determine if the food will be an empty node item - do this after Zac sends the rest of the supply chain info

    #Farmed Product Supply Chain Route
    if supply_chain == 'Farmed':
        chain.append(farm)
        #Determine if field packed or processed
        pack_int = random.randint(0,100)
        chain.append(field_packed if pack_int >= 50 else packaging_processor)

    #Created Product Supply Chain Route
    elif supply_chain == 'Created':
        chain.append(food_processor)
        if category == 'Cheese' or category == 'Nut Butters':
            rand = random.randint(0,100)
            if rand > 50:
                chain.append(food_processor)
            # Only nut butters can continue past this point (same draw, > 75);
            # cheese always ends with the kill marker.
            if category == 'Nut Butters' and rand > 75:
                chain.append(food_processor)
            else:
                chain.append(kill)

    #Fish Supply Chain Route
    elif category == 'Seafood':

        #Caught fish Supply Chain Route
        if supply_chain == 'Caught':
            chain.append(lbr)
            #TODO: Fill in the rest here

        #Aquaculture Supply Chain Route
        if supply_chain == 'Aquaculture':
            chain.append(seaFarm)

        #Determine if the food will go to an aggregator
        rand = random.randint(0,100)
        if rand > 50:
            chain.append(agg)

        #Determine if the food goes through a kill step or continues to get processed
        chain.append(food_processor)

        rand = random.randint(0,100)
        if rand > 50:
            chain.append(food_processor)
            # The same draw decides between a third processor and the kill marker.
            chain.append(food_processor if rand > 75 else kill)
        else:
            chain.append(kill)

    # No route matched: there is nothing to sell, and chain[-1] would raise IndexError.
    if not chain:
        return chain

    if chain[-1] != kill:
        #Initialize the retail options
        retail_options = [rest, grocFood, grocNoFood]

        def _append_wholesaler_or_retail():
            # One draw: <= 20 goes to a wholesaler, anything else to a random retailer.
            if random.randint(0,100) <= 20:
                chain.append(whole)
            else:
                chain.append(random.choice(retail_options))

        #Sale Route - this will be the same for every food
        sale_int = random.randint(0,100)
        #Determine if it is sold direct-to-consumer
        if sale_int <= 7:
            sale_int = random.randint(0,100)
            if sale_int <= 27:
                chain.append(direct_sale)
            else:
                chain.append(random.choice(retail_options))
        else:
            #If not, do the indirect sales route
            if random.randint(0,100) <= 50:
                chain.append(dist)
            else:
                chain.append(food_processor)
                if random.randint(0,100) <= 50:
                    chain.append(dist)
            _append_wholesaler_or_retail()

            # A wholesaler may pass the product on to a retailer.
            if chain[-1] == whole:
                if random.randint(0,100) < 50:
                    chain.append(random.choice(retail_options))

    return chain

### CTE Generation Functions

In [147]:
def generate_traceability_lot_code(companyPrefix, gtin, tLotData, timestamp):
    """Return a lot code URN of the form ``urn:epc:class:lgtin:<prefix>.<item>.<hash>``.

    ``<item>`` is the second dot-separated part of ``gtin`` and ``<hash>`` is the first
    17 hex characters of the SHA-1 digest of ``tLotData`` followed by ``timestamp``.
    """
    digest = hashlib.sha1(f"{tLotData}{timestamp}".encode()).hexdigest()
    item_reference = gtin.split('.')[1]
    short_digest = digest[:17]
    return f"urn:epc:class:lgtin:{companyPrefix}.{item_reference}.{short_digest}"

In [148]:
def generate_reference_document_type_number(facility,event):
    """Return ``urn:epcglobal:epcis:<event>.<facility.companyPrefix>`` as a string."""
    return "urn:epcglobal:epcis:{}.{}".format(event, facility.companyPrefix)

In [149]:
# Name stems used to label the field (farmed items) or container (aquaculture items)
# a harvest came from.
field_name_list = ['Field',
              'Bed',
              'Acre',
              'Garden',
              'Grasslands',
              'Ranch',
              'Barn'
              ]

container_name_list = ['Pond',
                       'Pool',
                       'Tank',
                       'Cage']

def harvesting_cte(fake, ftl_item, farm, next_entity, field_name_list = field_name_list, container_name_list=container_name_list):
    """Build the harvesting record for one food item at one farm.

    Parameters
    ----------
    fake : object providing ``date_between_dates``, ``random_int``, ``random_element``
        and ``random_letter`` (a Faker instance in this notebook).
    ftl_item : single-row DataFrame; ``Food`` and ``Supply_Chain`` are read.
    farm : entity row; ``businessName``, ``primaryPhone``, ``companyPrefix`` and
        ``gln`` are read.
    next_entity : entity row whose ``businessName`` becomes the recipient.
    field_name_list, container_name_list : name stems to pick from.

    Returns
    -------
    dict
        The harvesting record. ``fieldName`` is filled for 'Farmed' items and
        ``containerName`` for 'Aquaculture' items; any other supply chain gets
        'n/a' for both instead of failing on an unbound variable.
    """
    #Determine date
    start_date = datetime.strptime('2023-06-01', '%Y-%m-%d')
    end_date = datetime.now()
    date_harvested = str(fake.date_between_dates(date_start=start_date, date_end=end_date))

    quantity = fake.random_int(min=1, max=1000)
    unit_of_measure = fake.random_element(elements=('kg', 'g', 'lbs', 'Dozen'))

    #Contamination: exactly one of the 6001 possible draws marks the lot as contaminated
    contamination = 1 if random.randint(0,6000) == 1 else 0

    #Determine what field or container was used
    supply_chain = ftl_item.Supply_Chain.values[0]
    field_name = 'n/a'
    container = 'n/a'
    if supply_chain == 'Farmed':
        field_name = random.choice(field_name_list) + ' ' + fake.random_letter() + str(random.randint(1,10))
    elif supply_chain == 'Aquaculture':
        container = random.choice(container_name_list) + ' ' + str(random.randint(1,10))

    #TODO: add location description of farm where it was harvested
    harvesting_info = {
        'dataSubmitter': farm.businessName,
        'recipient' : next_entity.businessName,
        'commodity': ftl_item.Food.values[0],
        'quantity' : quantity,
        'unitOfMeasure' : unit_of_measure,
        'farmName' : farm.businessName,
        'fieldName' : field_name,
        'containerName' : container,
        'cteDate' : date_harvested,
        'phoneNumber' : farm.primaryPhone,
        'contaminated' : contamination,
        'gtin':farm.companyPrefix+'.'+str(random.randint(100000, 999999)),
        # Harvesting is the first event, so source and destination are both the farm.
        'sgln':farm.gln,
        'pgln':farm.gln,
        'eventID':farm.gln+'.'+str(random.randint(1000000, 9999999)),
        'parentID':''
    }

    return harvesting_info

In [150]:
def cooling_cte(harvesting_info, ftl_item, facility, next_entity):
    """Build the cooling record that follows a harvesting record.

    Quantity, unit, date, gtin and source location are copied from
    ``harvesting_info``; ``facility`` is both the data submitter and the cooler
    location, and ``next_entity.businessName`` is the recipient. The record's
    ``parentID`` is the harvesting ``eventID``.
    """
    # A lot that is still clean gets one draw (1 in 6001) to become contaminated here.
    contaminated = harvesting_info['contaminated']
    if contaminated == 0 and random.randint(0,6000) == 1:
        contaminated = 1

    cooler = facility.businessName

    #TODO: add location description of farm where it was harvested
    return {
        'dataSubmitter': cooler,
        'recipient' : next_entity.businessName,
        'commodity': ftl_item.Food.values[0],
        'quantity' : harvesting_info['quantity'],
        'unitOfMeasure' : harvesting_info['unitOfMeasure'],
        'coolerLocation' : cooler,
        # Cooling is recorded on the harvest date.
        'cteDate' : harvesting_info['cteDate'],
        'harvesterName' : harvesting_info['dataSubmitter'],
        'phoneNumber' : facility.primaryPhone,
        'contaminated' : contaminated,
        'gtin':harvesting_info['gtin'],
        'sgln':harvesting_info['sgln'],
        'pgln':facility.gln,
        'eventID':facility.gln+'.'+str(random.randint(1000000, 9999999)),
        'parentID':harvesting_info['eventID']
    }

In [151]:
def packaging_cte(fake, harvesting_info, cooling_info, ftl_item, facility):
    """Build the initial packaging record for a cooled lot.

    The packaging date is the cooling date plus 0-3 days. A new traceability lot code
    is generated from the facility name, the food name, the packed quantity and the
    packaging date. Harvest and cooling details are copied from the two input
    records, and ``parentID`` is the cooling ``eventID``.
    """
    # Available packaging types
    container_kinds = ['Box', 'Bag', 'Crate', 'Can', 'Bottle', 'Jar', 'Pouch', 'Carton']

    submitter = facility.businessName
    commodity = ftl_item.Food.values[0]
    chosen_package = random.choice(container_kinds)
    packed_quantity = fake.random_int(min=1, max=1000)
    # This unit only appears in the product description; the record's
    # 'unitOfMeasure' is taken from the cooling record.
    case_unit = fake.random_element(elements=('kg', 'g', 'lbs', 'Dozen'))

    cooled_on = datetime.strptime(cooling_info['cteDate'], '%Y-%m-%d')
    packed_on = str((cooled_on + timedelta(days=random.randint(0,3))).date())

    lot_seed = submitter + commodity + str(packed_quantity)
    lot_code = generate_traceability_lot_code(facility.companyPrefix,cooling_info['gtin'],lot_seed,packed_on)
    description = harvesting_info['dataSubmitter'] + ' ' + harvesting_info['commodity'] + ', ' + str(fake.random_int(min=1, max= 50)) + case_unit + ' case'

    # A lot that is still clean gets one draw (1 in 6001) to become contaminated here.
    contaminated = cooling_info['contaminated']
    if contaminated == 0 and random.randint(0,6000) == 1:
        contaminated = 1

    return {
        'dataSubmitter': submitter,
        'commodity':commodity,
        'dateFoodReceived' : packed_on,
        'quantityReceived':harvesting_info['quantity'],
        'harvestingLocation':harvesting_info['dataSubmitter'],
        'harvestedField':harvesting_info['fieldName'], #For produce
        'harvestedContainer':harvesting_info['containerName'], #For Aquaculture
        'harvestedPhoneNumber':harvesting_info['phoneNumber'],
        'dateHarvested':harvesting_info['cteDate'],
        'coolingLocation':cooling_info['dataSubmitter'],
        'dateOfCooling':cooling_info['cteDate'],
        'traceabilityLotCode': lot_code,
        'productDescription':description,
        'quantity' : packed_quantity,
        'unitOfMeasure':cooling_info['unitOfMeasure'],
        'packageType': chosen_package,
        'traceabilityLotCodeSourceLocation':facility.gln,
        'cteDate' : packed_on,
        'referenceDocumentTypeNumber': generate_reference_document_type_number(facility,'IP WO'),
        'contaminated':contaminated,
        'gtin':cooling_info['gtin'],
        'sgln':cooling_info['pgln'],
        'pgln':facility.gln,
        'eventID':facility.gln+'.'+str(random.randint(1000000, 9999999)),
        'parentID':cooling_info['eventID']
    }

In [152]:
def shipping_cte(previous_cte, next_entity, facility):
    """Build the shipping record that follows ``previous_cte`` at ``facility``.

    The ship date is the previous record's date plus 0-3 days. Lot code, quantity,
    unit, product description and gtin are carried over unchanged, the previous
    record's ``pgln`` becomes this record's ``sgln``, and ``parentID`` is the
    previous ``eventID``.
    """
    prior_date = datetime.strptime(previous_cte['cteDate'], '%Y-%m-%d')
    shipped_on = str((prior_date + timedelta(days=random.randint(0,3))).date())
    description = previous_cte['productDescription']
    contaminated = previous_cte['contaminated']
    document_type = random.choice(['ASN','DESADV','BOL','SHP'])

    # A lot that is still clean gets one draw (1 in 6001) to become contaminated here.
    if contaminated == 0 and random.randint(0,6000) == 1:
        contaminated = 1

    # Descriptive fields first, identifiers second; key order matches the other records.
    shipping_info = {
        'dataSubmitter': facility.businessName,
        'traceabilityLotCode': previous_cte['traceabilityLotCode'],
        'quantity': previous_cte['quantity'],
        'unitOfMeasure':previous_cte['unitOfMeasure'],
        'productDescription': description,
        'subsequentLocation': next_entity.businessName,
        'previousSourceLocation': previous_cte['dataSubmitter'],
        'cteDate': shipped_on,
        'traceabilityLotCodeSourceLocation': previous_cte['traceabilityLotCodeSourceLocation'],
    }
    shipping_info['referenceDocumentTypeNumber'] = generate_reference_document_type_number(facility,document_type)
    shipping_info['contaminated'] = contaminated
    shipping_info['gtin'] = previous_cte['gtin']
    shipping_info['sgln'] = previous_cte['pgln']
    shipping_info['pgln'] = facility.gln
    shipping_info['eventID'] = facility.gln+'.'+str(random.randint(1000000, 9999999))
    shipping_info['parentID'] = previous_cte['eventID']

    return shipping_info

In [153]:
def receiving_cte(previous_cte, facility):
    """Build the receiving record for ``facility`` from the record that precedes it.

    The receiving date is the previous record's date plus 0-3 days. Lot code,
    quantity, unit, product description and gtin are carried over unchanged, the
    previous record's ``pgln`` becomes this record's ``sgln``, and ``parentID`` is
    the previous ``eventID``.
    """
    prior_date = datetime.strptime(previous_cte['cteDate'], '%Y-%m-%d')
    received_on = str((prior_date + timedelta(days=random.randint(0,3))).date())

    contaminated = previous_cte['contaminated']
    document_type = random.choice(['BOL','RECADV','RECEIPT','RCV'])

    # A lot that is still clean gets one draw (1 in 6001) to become contaminated here.
    if contaminated == 0 and random.randint(0,6000) == 1:
        contaminated = 1

    receiver = facility.businessName

    # Descriptive fields first, identifiers second; key order matches the other records.
    receiving_info = {
        'dataSubmitter': receiver,
        'traceabilityLotCode': previous_cte['traceabilityLotCode'],
        'quantity': previous_cte['quantity'],
        'unitOfMeasure':previous_cte['unitOfMeasure'],
        'productDescription': previous_cte['productDescription'],
        'previousSourceLocation': previous_cte['dataSubmitter'],
        'receivingLocation': receiver,
        'cteDate': received_on,
        'traceabilityLotCodeSourceLocation': previous_cte['traceabilityLotCodeSourceLocation'],
    }
    receiving_info['referenceDocumentTypeNumber'] = generate_reference_document_type_number(facility,document_type)
    receiving_info['contaminated'] = contaminated
    receiving_info['gtin'] = previous_cte['gtin']
    receiving_info['sgln'] = previous_cte['pgln']
    receiving_info['pgln'] = facility.gln
    receiving_info['eventID'] = facility.gln+'.'+str(random.randint(1000000, 9999999))
    receiving_info['parentID'] = previous_cte['eventID']

    return receiving_info

In [154]:
#Transformation
def transformation_cte(previous_cte, ftl_item, facility):
    """Build a transformation record for ``facility``.

    Parameters
    ----------
    previous_cte : dict or empty list
        The record this transformation consumes. When it is empty or lacks the
        expected keys, the transformation is treated as the first event of the chain:
        it gets a random date between 2023-06-01 and today, and empty input lot,
        input gtin, old description and parent.
    ftl_item : single-row DataFrame; ``Food`` and ``Category`` are read.
    facility : entity row; ``businessName``, ``companyPrefix`` and ``gln`` are read.

    Returns
    -------
    dict
        The transformation record, including a newly generated gtin and lot code.
    """
    #creating universal variables
    quantity = random.randint(1,1000)
    quantityUsed = random.randint(quantity,2000)
    unitOfMeasure = random.choice([ 'oz', 'lbs', 'kg'])
    dataSubmitter = facility.businessName
    food = ftl_item.Food.values[0]
    category = ftl_item.Category.values[0]
    tLotData = dataSubmitter + food + str(quantity)

    #generating transformation date and lot codes - this is dependent on whether there was a previous cte or not
    try:
        transformedDate = str((datetime.strptime(previous_cte['cteDate'], '%Y-%m-%d') + timedelta(days=random.randint(0,3))).date())
        newGtin = facility.companyPrefix+'.'+str(random.randint(100000, 999999))
        traceabilityLotCode = generate_traceability_lot_code(facility.companyPrefix,newGtin,tLotData,transformedDate)
        oldTraceabilityLotCode = previous_cte['traceabilityLotCode']
        oldProductDescription = previous_cte['productDescription']
        previousUnitOfMeasure = previous_cte['unitOfMeasure']
        oldGtin = previous_cte['gtin']
        sgln = previous_cte['pgln']
        eventID = facility.gln+'.'+str(random.randint(1000000, 9999999))
        parentID = previous_cte['eventID']

    # TypeError: previous_cte is a list or None; KeyError: a field is missing;
    # ValueError: the date string does not parse.
    except (KeyError, TypeError, ValueError):
        # Pick the date with the stdlib: this function receives no Faker instance,
        # so the earlier ``fake.date_between`` call here could only raise NameError.
        start_date = datetime.strptime('06/01/2023', '%m/%d/%Y')
        days_available = max((datetime.now() - start_date).days, 0)
        transformedDate = str((start_date + timedelta(days=random.randint(0, days_available))).date())
        oldProductDescription = ''
        oldTraceabilityLotCode = ''
        newGtin = facility.companyPrefix+'.'+str(random.randint(100000, 999999))
        traceabilityLotCode = generate_traceability_lot_code(facility.companyPrefix,newGtin,tLotData,transformedDate)
        previousUnitOfMeasure = random.choice([ 'oz', 'lbs', 'kg'])
        oldGtin = ''
        sgln = facility.gln
        eventID = facility.gln+'.'+str(random.randint(1000000, 9999999))
        parentID = ''

    #transforming foods
    #fruit
    if category in ('Fruit', 'Melons', 'Tropical Tree Fruits'):
        shortDescription = "Fresh Cut " + food

    #nut butter - accept both spellings; generate_supply_chain tests for 'Nut Butters'
    elif category in ('Nut Butter', 'Nut Butters'):
        shortDescription = food + " Butter"

    #salads
    elif category == 'Shell Eggs':
        shortDescription = "Egg Salad"
    elif category == 'Crustaceans':
        shortDescription = "Seafood Salad"
    elif category in ('Leafy greens (fresh)', 'Peppers', 'Tomatoes', 'Cucumbers (fresh)'):
        shortDescription = "Pasta Salad"
    elif category == 'Herbs (fresh)':
        shortDescription = random.choice(['Egg Salad','Potato Salad','Pasta Salad','Seafood Salad'])

    #fish
    elif category == 'Seafood':
        if random.randint(0,100) < 15:
            shortDescription = "Smoked " + food
        else:
            shortDescription = food + " filet"

    #remaining foods
    else:
        shortDescription = ''

    #product description uses the transformed name when there is one, else the food name
    described_as = shortDescription if shortDescription != '' else food
    productDescription = facility.businessName+ ' ' + described_as + ', ' +str(quantity) + unitOfMeasure + ' case'

    #Contamination: inherit the flag when a previous record provides one
    try:
        contaminated = previous_cte['contaminated']
    except (KeyError, TypeError):
        contaminated = 0

    # A lot that is still clean gets one draw (1 in 6001) to become contaminated here.
    if contaminated == 0:
        if random.randint(0,6000) == 1:
            contaminated = 1

    bizTransactionType = random.choice(['TRF','TE','ADJUSTMENT'])

    transformation_info = {
        'dataSubmitter': dataSubmitter,
        'oldTraceabilityLotCode': oldTraceabilityLotCode,
        'oldProductDescription':oldProductDescription,
        'quantityUsed':quantityUsed,
        'previousUnitOfMeasure':previousUnitOfMeasure,
        'traceabilityLotCode': traceabilityLotCode,
        'traceabilityLotCodeSourceLocation': facility.gln,
        'cteDate': transformedDate,
        'productDescription': productDescription,
        'quantity': quantity,
        'unitOfMeasure': unitOfMeasure,
        'referenceDocumentTypeNumber': generate_reference_document_type_number(facility,bizTransactionType),
        'contaminated':contaminated,
        'inputGtin':oldGtin,
        'gtin':newGtin,
        'sgln':sgln,
        'pgln':facility.gln,
        'shortDescription':shortDescription,
        'eventID':eventID,
        'parentID':parentID
    }

    return transformation_info

In [155]:
def first_land_based_receiver_cte(fake, ftl_item, facility):
    """Build the first-land-based-receiver record for a wild-caught item.

    A harvest window is drawn (start between 2023-06-01 and now, end 2-10 days later),
    and the landing date is 1-3 days after the window ends. The harvest location text
    names a numbered fishing area and an ocean region; the ocean is 'Pacific' when
    ``facility.state`` is WA, OR, CA, HI or AK and 'Atlantic' otherwise.
    """
    #Determine the dates of harvest and landing
    window_open = datetime.strptime('2023-06-01', '%Y-%m-%d')
    window_close = datetime.now()
    harvest_began = fake.date_between_dates(date_start=window_open, date_end=window_close)
    harvest_ended = harvest_began + timedelta(days=random.randint(2,10))
    landed_on = harvest_ended + timedelta(days=random.randint(1,3))

    submitter = facility.businessName

    #Determine Harvest Location
    fishing_area = 'Major Fishing Area ' + str(random.randint(1,10))
    ocean = 'Pacific' if facility.state in ['WA','OR','CA','HI','AK'] else 'Atlantic'
    ocean_region = random.choice(['Northern', 'Southern', 'Central']) + ' ' + ocean

    # Three lines: harvest window, fishing area, ocean region.
    harvest_summary = '\n'.join([str(harvest_began) + ' - ' + str(harvest_ended), fishing_area, ocean_region])

    #Determine the quantity and unit of measure
    landed_quantity = random.randint(20,1000)
    landed_unit = random.choice(['kg', 'lb'])

    #Determine the traceability lot code
    commodity = ftl_item.Food.values[0]
    lot_seed = submitter + commodity + str(landed_quantity)
    gtin = facility.companyPrefix+'.'+str(random.randint(100000, 999999))
    landed_text = str(landed_on)
    lot_code = generate_traceability_lot_code(facility.companyPrefix,gtin,lot_seed,landed_text)

    #Contamination: one of the 2001 possible draws marks the lot as contaminated
    contaminated = 1 if random.randint(0,2000) == 1 else 0

    return {
        'dataSubmitter':submitter,
        'traceabilityLotCode':lot_code,
        'productDescription':commodity,
        'quantity':landed_quantity,
        'unitOfMeasure':landed_unit,
        'harvestDateAndLocation':harvest_summary,
        'traceabilityLotCodeSourceLocation':facility.gln,
        'cteDate':landed_text,
        'referenceDocumentTypeNumber': generate_reference_document_type_number(facility,'LANDING'),
        'contaminated':contaminated,
        'gtin':gtin,
        # This is the first event of the chain, so source and destination are the same.
        'sgln':facility.gln,
        'pgln':facility.gln,
        'eventID':facility.gln+'.'+str(random.randint(1000000, 9999999)),
        'parentID':''
    }

### Supply Chain Functions

In [156]:
def farm_function(fake, ftl_item, sc, entities, previous_cte, index):
    """Generate the records produced at the farm located at ``entities.iloc[index]``.

    The entry that follows 'farm' in ``sc`` selects the route:

    * 'fieldPacked' - harvesting, cooling and initial packaging all happen on the
      farm, followed by shipping to the next entity (or to the farm itself when it
      is the last entity).
    * 'packaging'   - only harvesting happens here; the next entity is the recipient.

    ``previous_cte`` is accepted for the shared handler signature and is not used.
    Returns a dict keyed by record name, in creation order.
    """
    farm = entities.iloc[index]
    packaged_type = sc[sc.index('farm') + 1]

    # Explicit bounds check instead of a bare except around the iloc lookup.
    next_entity = entities.iloc[index+1] if index + 1 < len(entities) else farm

    #Initialize the CTEs for the farm
    ctes = {}

    #Determine what the next entity is for the KDEs that happen on the farm
    if packaged_type == 'fieldPacked':
        ctes['harvesting'] = harvesting_cte(fake, ftl_item, farm, farm)
        ctes['cooling'] = cooling_cte(ctes['harvesting'], ftl_item, farm, farm)
        ctes['initialPackaging'] = packaging_cte(fake,ctes['harvesting'],ctes['cooling'],ftl_item,farm)
        ctes['shipping'] = shipping_cte(ctes['initialPackaging'],next_entity,farm)
    elif packaged_type == 'packaging':
        ctes['harvesting'] = harvesting_cte(fake, ftl_item, farm, next_entity)

    return ctes


In [157]:
def initial_fish_function(fake, ftl_item, sc, entities, previous_cte, index):
    """Generate the first records of a seafood chain at ``entities.iloc[index]``.

    * 'Aquaculture' items get harvesting, cooling and initial packaging, all at this
      facility.
    * 'Caught' items get a first-land-based-receiving record.

    A shipping record to the next entity is appended when at least one record was
    created and a next entity exists. Like the other handlers, the shipping step is
    skipped rather than raising IndexError when this facility is last in the chain.
    ``sc`` and ``previous_cte`` are accepted for the shared handler signature and are
    not used. Returns a dict keyed by record name, in creation order.
    """
    category = ftl_item.Supply_Chain.values[0]
    facility = entities.iloc[index]

    ctes={}
    #Aquaculture route
    if category == 'Aquaculture':
        ctes['harvesting'] = harvesting_cte(fake, ftl_item, facility, facility)
        ctes['cooling'] = cooling_cte(ctes['harvesting'], ftl_item, facility, facility)
        ctes['initialPackaging'] = packaging_cte(fake,ctes['harvesting'],ctes['cooling'],ftl_item,facility)

    #Wild Caught Route
    elif category =='Caught':
        ctes['firstLandBasedReceiving'] = first_land_based_receiver_cte(fake, ftl_item, facility)

    #Ship to the next entity, if there is something to ship and somewhere to ship it
    if ctes and index + 1 < len(entities):
        last_cte = list(ctes.keys())[-1]
        ctes['shipping'] = shipping_cte(ctes[last_cte],next_entity=entities.iloc[index+1],facility=facility)

    return ctes

In [158]:
def processing_plant_function(fake, ftl_item, sc, entities, previous_cte, index):
    """Generate the records produced at the processor located at ``entities.iloc[index]``.

    When the processor is the first entity (or there is no previous record to
    receive), only a transformation is created; otherwise the product is received
    and then transformed. A shipping record is added when a next entity exists.
    ``fake`` and ``sc`` are accepted for the shared handler signature and are not
    used. Returns a dict keyed by record name, in creation order.
    """
    facility = entities.iloc[index]
    # Explicit bounds check instead of a bare except around the iloc lookup.
    has_next = index + 1 < len(entities)

    # Initialize the CTEs for the processing plant
    ctes = {}

    #The path if it is a created product and the first step in the supply chain
    if index == 0 or not previous_cte:
        ctes['transformation'] = transformation_cte(previous_cte,ftl_item,facility)

    #The path if it is not the first step in the supply chain
    else:
        ctes['receiving'] = receiving_cte(previous_cte, facility)
        ctes['transformation'] = transformation_cte(ctes['receiving'], ftl_item, facility)

    #Ship only when the chain continues past this processor
    if has_next:
        ctes['shipping'] = shipping_cte(ctes['transformation'], entities.iloc[index+1], facility)

    return ctes

In [159]:
def coolingpacking_function(fake, ftl_item, sc, entities, previous_cte, index):
    """Generate the records produced at an offsite cooling and packing facility.

    ``previous_cte`` is used as the harvesting record for both the cooling and the
    initial packaging record. A shipping record is added when a next entity exists.
    ``sc`` is accepted for the shared handler signature and is not used.
    Returns a dict keyed by record name, in creation order.
    """
    facility = entities.iloc[index]
    # Explicit bounds check instead of a bare except around the iloc lookup.
    has_next = index + 1 < len(entities)

    ctes = {}

    ctes['cooling'] = cooling_cte(previous_cte, ftl_item, facility, facility)
    ctes['initialPackaging'] = packaging_cte(fake,previous_cte,ctes['cooling'],ftl_item, facility)

    if has_next:
        ctes['shipping'] = shipping_cte(ctes['initialPackaging'], next_entity=entities.iloc[index+1], facility=facility)

    return ctes

In [160]:
def distributor_function(fake, ftl_item, sc, entities, previous_cte, index):
    """Generate the records produced at the distributor located at ``entities.iloc[index]``.

    The distributor receives the product and ships it on to the next entity. When
    the distributor is the last entity, the shipping record is skipped (as the
    wholesaler handler does) instead of raising IndexError.
    ``fake``, ``ftl_item`` and ``sc`` are accepted for the shared handler signature
    and are not used. Returns a dict keyed by record name, in creation order.
    """
    facility = entities.iloc[index]

    # Initialize the CTEs for the distributor
    ctes = {'receiving': receiving_cte(previous_cte, facility)}

    if index + 1 < len(entities):
        ctes['shipping'] = shipping_cte(ctes['receiving'], entities.iloc[index+1], facility)

    return ctes

In [161]:
def wholesaler_function(fake, ftl_item, sc, entities, previous_cte, index):
    """Generate the records produced at the wholesaler located at ``entities.iloc[index]``.

    The wholesaler always receives the product, transforms it when a 0-100 draw
    exceeds 50, and ships its most recent record to the next entity when one exists.
    ``fake`` and ``sc`` are accepted for the shared handler signature and are not
    used. Returns a dict keyed by record name, in creation order.
    """
    facility = entities.iloc[index]
    # Explicit bounds check instead of a bare except around the iloc lookup.
    has_next = index + 1 < len(entities)

    # Initialize the CTEs for the wholesaler
    ctes = {}

    ctes['receiving'] = receiving_cte(previous_cte, facility)

    #Determine if there is going to be a transformation or not
    if random.randint(0,100) > 50:
        ctes['transformation'] = transformation_cte(ctes['receiving'], ftl_item, facility)

    if has_next:
        # Ship whichever record was created last (the transformation, if any).
        last_cte = list(ctes.keys())[-1]
        ctes['shipping'] = shipping_cte(ctes[last_cte], entities.iloc[index+1], facility)

    return ctes

In [162]:
def grocery_function(fake, ftl_item, sc, entities, previous_cte, index):
    """Generate the records produced at the grocery located at ``entities.iloc[index]``.

    The store always receives the product and additionally transforms it when a
    0-100 draw is below 10. Returns a dict keyed by record name, in creation order.
    """
    store = entities.iloc[index]

    received = receiving_cte(previous_cte, store)
    steps = {'receiving': received}

    #Determine if there is going to be a transformation or not
    transforms_in_store = random.randint(0,100) < 10
    if not transforms_in_store:
        return steps

    steps['transformation'] = transformation_cte(received, ftl_item, store)
    return steps

In [163]:
def restaurant_function(fake, ftl_item, sc, entities, previous_cte, index):
    """Generate the records produced at the restaurant located at ``entities.iloc[index]``.

    The restaurant always receives the product and additionally transforms it when a
    0-100 draw is above 95. Returns a dict keyed by record name, in creation order.
    """
    restaurant = entities.iloc[index]

    received = receiving_cte(previous_cte, restaurant)
    steps = {'receiving': received}

    #Determine if there is going to be a transformation or not
    transforms_on_site = random.randint(0,100) > 95
    if not transforms_on_site:
        return steps

    steps['transformation'] = transformation_cte(received, ftl_item, restaurant)
    return steps

In [164]:
def grocery_no_transform_function(fake, ftl_item, sc, entities, previous_cte, index):
    """Generate the single receiving record for a grocery that never transforms food.

    Only ``entities``, ``previous_cte`` and ``index`` are used; the other parameters
    exist for the shared handler signature. Returns ``{'receiving': <record>}``.
    """
    store = entities.iloc[index]
    return {'receiving': receiving_cte(previous_cte, store)}

# Generate the Data

In [165]:
def generate_data(ftl_df, entities_df, n=10000):
    """Simulate ``n`` supply chains and return the records each entity produced.

    Parameters
    ----------
    ftl_df : DataFrame of food items; one row is sampled per simulated chain.
    entities_df : DataFrame of business entities with at least the ``businessType``
        and ``sizeWeight`` columns (``sizeWeight`` is the sampling weight).
    n : int
        Number of supply chains to simulate.

    Returns
    -------
    list of dict
        One dict per visited entity, in order, mapping record name to record.

    Notes
    -----
    Supply chain entries that have no handler below (for example the 'fieldPacked'
    and 'stop' markers) and business types with no matching entity are skipped.
    """
    fake = Faker()

    #Create a dictionary of the functions so that they can be called in the supply chain based on the type of entity
    functions_dict = {
        'farm':farm_function,
        'wholesaler':wholesaler_function,
        'grocery':grocery_function,
        'groceryNoTransform':grocery_no_transform_function,
        'distributor':distributor_function,
        'packaging':coolingpacking_function,
        'restaurant':restaurant_function,
        'processor':processing_plant_function,
        'landBasedReceiver':initial_fish_function,
        'seafoodFarm':initial_fish_function
    }

    all_ctes = []
    for _ in tqdm(range(n)):
        #Randomly select a food item and generate the supply chain
        food_item = ftl_df.sample()
        sc = generate_supply_chain(food_item)

        #Determine the entities for the supply chain
        picked_rows = []
        for entity_type in sc:
            # Entries without a handler could never be processed further down.
            if entity_type not in functions_dict:
                continue
            candidates = entities_df[entities_df.businessType == entity_type]
            if candidates.empty:
                continue
            try:
                picked_rows.append(candidates.sample(weights='sizeWeight',replace=True))
            except ValueError:
                # sample() rejects unusable weights (e.g. all zero); skip this node.
                continue

        # Assemble the sampled rows directly so the result does not depend on
        # entities_df having a default 0..n-1 index.
        if picked_rows:
            entities = pd.concat(picked_rows).reset_index(drop=True)
        else:
            entities = entities_df.iloc[0:0].reset_index(drop=True)

        #Run the function for each entity in the supply chain
        #Note: the input for each function will be (fake, food_item, sc, entities, previous_cte, index)
        #A standardized input makes it easy to iterate through and call each function
        #In plain language, it is calling an instance of faker, the current food_item, the supply chain, the entities in the supply chain, the most recent CTE, and the index
        ctes = []
        for index in entities.index:
            # The most recent record of the previous entity, or [] when there is none.
            previous_cte = []
            if ctes and ctes[-1]:
                previous_cte = list(ctes[-1].values())[-1]

            handler = functions_dict[entities.iloc[index].businessType]
            ctes.append(handler(fake, food_item, sc, entities,previous_cte,index))

        all_ctes.extend(ctes)

    return all_ctes

In [166]:
#Cross contaminate function
def cross_contaminate(dfs):
    """Spread contamination from already-contaminated transformation events.

    For every transformation row flagged ``contaminated == 1`` (a "seed"),
    other transformation rows at the same ``pgln`` whose ``cteDate`` falls in
    the seed's exposure window may become contaminated too. The window starts
    at the seed's ``cteDate`` and ends at the first shipping event of the
    seed's lot on or after that date; if no such shipping event exists the
    window is three days long.

    Each row in the window is infected when ``random.randint(0, 10)`` is below
    a threshold drawn once per call from ``[3, 5, 8]`` (``randint`` is
    inclusive, so the chances are 3/11, 5/11 or 8/11). An infected row also
    infects every transformation row whose ``oldTraceabilityLotCode`` equals
    its lot code, and every shipping/receiving row carrying its lot code.

    Parameters
    ----------
    dfs : dict of pandas.DataFrame
        Keyed by CTE name; the ``'transformation'``, ``'shipping'`` and
        ``'receiving'`` entries are read and updated.

    Returns
    -------
    dict of pandas.DataFrame
        The same ``dfs`` object. Frames are modified in place; when at least
        one seed exists, ``dfs['transformation']['cteDate']`` is converted to
        datetimes.
    """
    #Drawn before anything else so the random stream is consumed in the original order
    cross_contamination_probability = random.choice([3,5,8])
    later_ctes = [
        'shipping',
        'receiving'
    ]

    transformation = dfs['transformation']

    #A CTE with no events is a column-less DataFrame: nothing to spread
    if 'contaminated' not in transformation.columns:
        return dfs

    #Snapshot the seeds up front so rows infected below do not become seeds themselves
    seeds = transformation.index[transformation['contaminated'] == 1]
    if len(seeds) == 0:
        return dfs

    #Parse the dates once instead of once per seed
    transformation['cteDate'] = pd.to_datetime(transformation['cteDate'])

    shipping = dfs['shipping']
    has_shipping = 'traceabilityLotCode' in shipping.columns and 'cteDate' in shipping.columns
    if has_shipping:
        #Local copy only: the shipping frame's own cteDate column is left untouched
        shipping_dates = pd.to_datetime(shipping['cteDate'], errors='coerce')

    for seed in seeds:
        #Seeds are index labels, so look them up with .loc rather than by position
        row = transformation.loc[seed]
        facility = row.pgln

        #Determine dates of possible contamination
        start_date = row.cteDate
        end_date = start_date + timedelta(days=3)
        if has_shipping:
            lot_ship_dates = shipping_dates[shipping['traceabilityLotCode'] == row.traceabilityLotCode]
            lot_ship_dates = lot_ship_dates[lot_ship_dates >= start_date]
            if not lot_ship_dates.empty:
                #The lot leaves the facility at its first shipment after the transformation
                end_date = lot_ship_dates.min()

        #Filter the data for rows that were possibly impacted by the contamination
        in_window = (
            (transformation['pgln'] == facility)
            & (transformation['cteDate'] >= start_date)
            & (transformation['cteDate'] <= end_date)
        )
        impacted = transformation[in_window]
        if impacted.empty:
            continue

        #Determine if it will spread to the node or not
        infected = []
        infectedLots = []
        for record in impacted.index:
            if random.randint(0,10) < cross_contamination_probability:
                infectedLot = impacted.loc[record].traceabilityLotCode
                infectedLots.append(infectedLot)
                infected.append(record)
                #Rows transformed out of the infected lot inherit the contamination
                infected.extend(transformation.index[transformation['oldTraceabilityLotCode'] == infectedLot])

        #Spread the infection to the rows
        transformation.loc[infected,'contaminated'] = 1

        #Spread the infection to every row in all other CTEs that were impacted
        for cte in later_ctes:
            frame = dfs[cte]
            if 'traceabilityLotCode' not in frame.columns:
                continue
            frame.loc[frame['traceabilityLotCode'].isin(infectedLots),'contaminated'] = 1
    return dfs

In [192]:
def add_epcis_formatting(cte_data):
    """Prefix the identifier columns of every CTE frame with a URN scheme.

    Parameters
    ----------
    cte_data : dict of pandas.DataFrame
        Keyed by CTE name. Each frame's ``gtin``, ``sgln`` and ``pgln``
        columns (string values) are rewritten in place.

    Returns
    -------
    dict of pandas.DataFrame
        The same ``cte_data`` object, with the prefixes applied.

    Notes
    -----
    A frame that lacks one of the columns (for example the column-less frame
    produced for a CTE with no events) is left as is for that column instead
    of raising ``KeyError``. Calling the function twice on the same frames
    prefixes the values twice.
    """
    urn_prefixes = {
        'gtin': 'urn:epc:idpat:sgtin:',
        'sgln': 'urn:epc:id:sgln:',
        'pgln': 'urn:epc:id:pgln:',
    }

    for frame in cte_data.values():
        for column, prefix in urn_prefixes.items():
            #Skip columns this CTE does not carry rather than failing the whole run
            if column in frame.columns:
                frame[column] = prefix + frame[column]
    return cte_data

In [194]:
#Build one DataFrame per CTE type (optionally exporting each one to CSV)
def create_dfs(data, create_csv = False):
    """Group generated CTE records by type and return them as DataFrames.

    Parameters
    ----------
    data : iterable of dict
        Each element maps a CTE name (one of the keys listed below) to a
        single record for that CTE.
    create_csv : bool, default False
        When truthy, every frame is also written to ``<cte name>.csv`` in the
        current working directory, without the index.

    Returns
    -------
    dict of pandas.DataFrame
        One frame per CTE name, after ``cross_contaminate`` and
        ``add_epcis_formatting`` have been applied.

    Raises
    ------
    KeyError
        If a record uses a CTE name that is not one of the known keys.
    """
    cte_data = {
        'harvesting' : [],
        'cooling' : [],
        'initialPackaging' : [],
        'firstLandBasedReceiving' : [],
        'shipping' : [],
        'receiving' : [],
        'transformation' : []
    }

    #Bucket every record under its CTE name
    for entity in data:
        for cte_name, record in entity.items():
            cte_data[cte_name].append(record)

    #One DataFrame per CTE, built in a single pass over the collected records
    cte_data = {cte_name: pd.DataFrame(records) for cte_name, records in cte_data.items()}

    #Cross contaminate
    cte_data = cross_contaminate(cte_data)

    #Add EPCIS formatting
    cte_data = add_epcis_formatting(cte_data)

    #Create a csv of data
    if create_csv:
        for cte_name, frame in cte_data.items():
            frame.to_csv(f'{cte_name}.csv',index=False)

    return cte_data

In [195]:
#Seed the random sources imported by this notebook so a re-run draws the same values.
#NOTE(review): entities_df was built in an earlier cell; seed before that cell as well
#if the entities themselves must also be repeatable.
DATA_GENERATION_SEED = 42
random.seed(DATA_GENERATION_SEED)
np.random.seed(DATA_GENERATION_SEED)  #pandas .sample() uses NumPy's global state when random_state is not given
Faker.seed(DATA_GENERATION_SEED)

fake = Faker()
data = generate_data(ftl_df, entities_df)

100%|██████████| 10000/10000 [01:14<00:00, 134.13it/s]


In [169]:
#Total number of CTE records generated: each entity dict contributes one record per key
rows = sum(len(entity) for entity in data)

print(rows)

71137


In [196]:
#Build the per-CTE DataFrames from the generated records; create_dfs also applies
#cross-contamination and EPCIS formatting (no CSV export: create_csv defaults to False)
dfs = create_dfs(data)

In [199]:
#Preview the receiving events after cross-contamination and EPCIS formatting
dfs['receiving']

Unnamed: 0,dataSubmitter,traceabilityLotCode,quantity,unitOfMeasure,productDescription,previousSourceLocation,receivingLocation,cteDate,traceabilityLotCodeSourceLocation,referenceDocumentTypeNumber,contaminated,gtin,sgln,pgln,eventID,parentID
0,Reliable Network,urn:epc:class:lgtin:1045067.664452.4a7e196e16b...,2,kg,"Peaceful Acres Farm blueberries, 46kg case",Fresh Encase,Reliable Network,2023-07-24,1045067.18948,urn:epcglobal:epcis:BOL.1234303,0,urn:epc:idpat:sgtin:0845569.664452,urn:epc:id:sgln:1045067.18948,urn:epc:id:pgln:1234303.60155,1234303.60155.4356285,1045067.18948.4099831
1,Wholesome Traders,urn:epc:class:lgtin:1045067.664452.4a7e196e16b...,2,kg,"Peaceful Acres Farm blueberries, 46kg case",Reliable Network,Wholesome Traders,2023-07-24,1045067.18948,urn:epcglobal:epcis:BOL.1133165,0,urn:epc:idpat:sgtin:0845569.664452,urn:epc:id:sgln:1234303.60155,urn:epc:id:pgln:1133165.35379,1133165.35379.5063068,1234303.60155.7868436
2,Charming Restaurant,urn:epc:class:lgtin:1045067.664452.4a7e196e16b...,2,kg,"Peaceful Acres Farm blueberries, 46kg case",Wholesome Traders,Charming Restaurant,2023-07-26,1045067.18948,urn:epcglobal:epcis:BOL.1082462,0,urn:epc:idpat:sgtin:0845569.664452,urn:epc:id:sgln:1133165.35379,urn:epc:id:pgln:1082462.66251,1082462.66251.9330170,1133165.35379.3592677
3,Superior Trade,urn:epc:class:lgtin:1321040.195476.e14595c28b6...,5,Dozen,"Lucky Grove Farm beefsteak tomato, 28kg case",Lucky Grove Farm,Superior Trade,2023-07-28,1321040.88935,urn:epcglobal:epcis:BOL.0724520,0,urn:epc:idpat:sgtin:1321040.195476,urn:epc:id:sgln:1321040.88935,urn:epc:id:pgln:0724520.53414,0724520.53414.2600064,1321040.88935.2947352
4,Premium Provisions,urn:epc:class:lgtin:1321040.195476.e14595c28b6...,5,Dozen,"Lucky Grove Farm beefsteak tomato, 28kg case",Superior Trade,Premium Provisions,2023-08-03,1321040.88935,urn:epcglobal:epcis:RECADV.1165097,0,urn:epc:idpat:sgtin:1321040.195476,urn:epc:id:sgln:0724520.53414,urn:epc:id:pgln:1165097.48869,1165097.48869.8940123,0724520.53414.5514317
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19808,Wholesome Kitchen,urn:epc:class:lgtin:0731428.918032.f676e05cc29...,315,oz,"Quality Processing jack fish filet, 315oz case",Quality Processing,Wholesome Kitchen,2023-06-18,0731428.39016,urn:epcglobal:epcis:RECADV.0735287,0,urn:epc:idpat:sgtin:0731428.918032,urn:epc:id:sgln:0731428.39016,urn:epc:id:pgln:0735287.39914,0735287.39914.1316867,0731428.39016.5106151
19809,Global Industries,urn:epc:class:lgtin:0735287.922580.da8d79c7d8a...,844,oz,"Wholesome Kitchen jack fish filet, 844oz case",Wholesome Kitchen,Global Industries,2023-06-23,0735287.39914,urn:epcglobal:epcis:BOL.1157130,0,urn:epc:idpat:sgtin:0735287.922580,urn:epc:id:sgln:0735287.39914,urn:epc:id:pgln:1157130.88285,1157130.88285.8716809,0735287.39914.7774224
19810,Quality Distribution,urn:epc:class:lgtin:1157130.138706.65d7f5dc628...,888,lbs,"Global Industries jack fish filet, 888lbs case",Global Industries,Quality Distribution,2023-06-28,1157130.88285,urn:epcglobal:epcis:RECEIPT.0820503,0,urn:epc:idpat:sgtin:1157130.138706,urn:epc:id:sgln:1157130.88285,urn:epc:id:pgln:0820503.74718,0820503.74718.6771749,1157130.88285.5672932
19811,Premium Mart,urn:epc:class:lgtin:1157130.138706.65d7f5dc628...,888,lbs,"Global Industries jack fish filet, 888lbs case",Quality Distribution,Premium Mart,2023-07-02,1157130.88285,urn:epcglobal:epcis:RECADV.1280759,0,urn:epc:idpat:sgtin:1157130.138706,urn:epc:id:sgln:0820503.74718,urn:epc:id:pgln:1280759.84344,1280759.84344.8333983,0820503.74718.3645057


In [171]:
#Contaminated (1) vs. clean (0) harvesting events
dfs['harvesting']['contaminated'].value_counts()

contaminated
0    6293
1       1
Name: count, dtype: int64

In [172]:
#Contaminated (1) vs. clean (0) cooling events
dfs['cooling']['contaminated'].value_counts()

contaminated
0    6291
1       3
Name: count, dtype: int64

In [173]:
#Contaminated (1) vs. clean (0) initial packaging events
dfs['initialPackaging']['contaminated'].value_counts()

contaminated
0    6291
1       3
Name: count, dtype: int64

In [174]:
#Contaminated (1) vs. clean (0) receiving events
dfs['receiving']['contaminated'].value_counts()

contaminated
0    19798
1       46
Name: count, dtype: int64

In [175]:
#Contaminated (1) vs. clean (0) shipping events
dfs['shipping']['contaminated'].value_counts()

contaminated
0    19843
1       45
Name: count, dtype: int64

In [176]:
#Contaminated (1) vs. clean (0) transformation events
dfs['transformation']['contaminated'].value_counts()

contaminated
0    11771
1       38
Name: count, dtype: int64

In [177]:
#Contaminated (1) vs. clean (0) first land-based receiving events
dfs['firstLandBasedReceiving']['contaminated'].value_counts()

contaminated
0    714
Name: count, dtype: int64

In [178]:
#Number of harvesting events recorded per pgln value
dfs['harvesting']['pgln'].value_counts()

pgln
1164657.48109    32
1021954.37202    30
1139608.83657    30
1131814.70533    30
1198648.68229    30
                 ..
0990876.70106     1
1222055.39228     1
0950594.41027     1
1316938.67036     1
0840157.55703     1
Name: count, Length: 1453, dtype: int64

In [179]:
#Number of cooling events recorded per pgln value
dfs['cooling']['pgln'].value_counts()

pgln
1047437.14877    23
1371664.42873    22
0777023.56260    21
1172921.21685    20
0722642.35822    19
                 ..
1331975.26328     1
1034203.57881     1
1318476.91616     1
0786216.78236     1
0939624.79031     1
Name: count, Length: 1959, dtype: int64

In [180]:
#Number of transformation events recorded per pgln value
dfs['transformation']['pgln'].value_counts()

pgln
0839887.63535    81
0846993.35888    79
0855544.11585    74
1279073.44003    72
1028931.67705    72
                 ..
1069746.83140     1
1160537.63408     1
1194427.43096     1
1324465.67947     1
1116902.40956     1
Name: count, Length: 1561, dtype: int64

In [181]:
#Look at one full traceability lot code (the row labelled 0) to check its format
dfs['transformation']['traceabilityLotCode'][0]

'urn:epc:class:lgtin:0798305.997711.70161264582929aeb'

In [182]:
import re

#Expected lot-code layout: fixed URN prefix, 7-digit group, 6-digit group, then 1-20 characters from the allowed set
pattern = r'^urn:epc:class:lgtin:([0-9]{7})\.([0-9]{6})\.([!%-?A-Z_a-z\x22]{1,20})$'

# Example usage
test_string = dfs['transformation']['traceabilityLotCode'][0]

match = re.match(pattern, test_string)
if match is None:
    print("No match.")
else:
    #Report each captured component of the lot code
    print("Match found!")
    print("Company Prefix:", match[1])
    print("Item Reference:", match[2])
    print("Additional Information:", match[3])

Match found!
Company Prefix: 0798305
Item Reference: 997711
Additional Information: 70161264582929aeb


In [185]:
#Preview the first land-based receiving events
dfs['firstLandBasedReceiving']

Unnamed: 0,dataSubmitter,traceabilityLotCode,productDescription,quantity,unitOfMeasure,harvestDateAndLocation,traceabilityLotCodeSourceLocation,cteDate,referenceDocumentTypeNumber,contaminated,gtin,sgln,pgln,eventID,parentID
0,Deep Seafoods,urn:epc:class:lgtin:1289027.831784.a844d7a4a38...,haddock,331,kg,2023-06-13 - 2023-06-22\nMajor Fishing Area 9\...,1289027.31058,2023-06-24,urn:epcglobal:epcis:LANDING.1289027,0,1289027.831784,1289027.31058,1289027.31058,1289027.31058.9181987,
1,Wave Catchers,urn:epc:class:lgtin:1146775.399123.4393c51dbd1...,mackerel,611,kg,2023-07-06 - 2023-07-10\nMajor Fishing Area 3\...,1146775.27475,2023-07-12,urn:epcglobal:epcis:LANDING.1146775,0,1146775.399123,1146775.27475,1146775.27475,1146775.27475.3580619,
2,Marine Fisheries,urn:epc:class:lgtin:0837068.267604.92c1c823136...,mahi mahi,678,kg,2023-07-20 - 2023-07-23\nMajor Fishing Area 2\...,0837068.91050,2023-07-24,urn:epcglobal:epcis:LANDING.0837068,0,0837068.267604,0837068.91050,0837068.91050,0837068.91050.6628206,
3,Deep Catches,urn:epc:class:lgtin:1046658.774700.2af3bb3a5d5...,pollock,899,lb,2023-06-06 - 2023-06-10\nMajor Fishing Area 9\...,1046658.45534,2023-06-11,urn:epcglobal:epcis:LANDING.1046658,0,1046658.774700,1046658.45534,1046658.45534,1046658.45534.1667904,
4,Marine Coast,urn:epc:class:lgtin:0666043.610543.37f30adeb7a...,swordfish,205,kg,2023-07-17 - 2023-07-27\nMajor Fishing Area 8\...,0666043.36941,2023-07-30,urn:epcglobal:epcis:LANDING.0666043,0,0666043.610543,0666043.36941,0666043.36941,0666043.36941.9439870,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
709,Wave Bay,urn:epc:class:lgtin:1025981.995858.2d81bfc3cd2...,snapper,204,kg,2023-06-15 - 2023-06-22\nMajor Fishing Area 9\...,1025981.78437,2023-06-23,urn:epcglobal:epcis:LANDING.1025981,0,1025981.995858,1025981.78437,1025981.78437,1025981.78437.7615359,
710,Tidal Coast,urn:epc:class:lgtin:1392629.797267.03b5dcf5a2f...,crab,229,lb,2023-06-02 - 2023-06-12\nMajor Fishing Area 9\...,1392629.78551,2023-06-15,urn:epcglobal:epcis:LANDING.1392629,0,1392629.797267,1392629.78551,1392629.78551,1392629.78551.8944897,
711,Fresh Delights,urn:epc:class:lgtin:1152917.962564.53448b49b1f...,shrimp,531,kg,2023-07-21 - 2023-07-25\nMajor Fishing Area 9\...,1152917.40988,2023-07-27,urn:epcglobal:epcis:LANDING.1152917,0,1152917.962564,1152917.40988,1152917.40988,1152917.40988.1460736,
712,Sea Aquatics,urn:epc:class:lgtin:0683543.241470.be7e4a16039...,mussels,197,kg,2023-06-08 - 2023-06-13\nMajor Fishing Area 2\...,0683543.48141,2023-06-14,urn:epcglobal:epcis:LANDING.0683543,0,0683543.241470,0683543.48141,0683543.48141,0683543.48141.9385274,
