# Reading datasets

In [1]:
import pandas as pd
import os, random

In [2]:
generated_data_filepath = './Generated Sample Datasets'

shareholders_filepath = generated_data_filepath + '/shareholders.csv'
appointments_filepath = generated_data_filepath + '/appointments.csv'
addresses_filepath = generated_data_filepath + '/addresses.csv'
pri_ssic_filepath = generated_data_filepath + '/pri_ssic.csv'
corppass_filepath = generated_data_filepath + '/corppass.csv'

In [66]:
# Generate initial list of relationships
entities_list = []
shareholders_list = []
appointments_list = []
appointments_namelist = []
list_of_appointments = ['Director', 'Deputy Director', 'Finance Manager', 'HR Manager', 'Project Lead', 'CTO', 'Sales Manager']
addresses_list = []
pri_ssic_list = []
corppass_list = []

### Select number of entities to be created ###
num_entities = 20

for i in range(num_entities + 1):
    entities_list.append('en' + str(i))
    shareholders_list.append('sh' + str(i))   
    appointments_list.append('app' + str(i)) 
    appointments_namelist.append(random.choice(list_of_appointments))
    addresses_list.append('add' + str(i))   
    pri_ssic_list.append('ssic' + str(i))   
    corppass_list.append('cp' + str(i))   

In [75]:
# Function to generate sample data
def generate_data(input_dict, num_rs, num_cycles, cycle_size):
    
    full_df = pd.DataFrame(input_dict, columns = input_dict.keys())
    # Generate random relationships across default relationships (as generated above)
    for i in range(num_rs + 1):
        temp_dict = {}
        for key in input_dict.keys():
            temp_dict.update({ key: random.choice(input_dict.get(key)) })
        temp_df = pd.DataFrame([temp_dict])
        full_df = full_df.append(temp_df, sort=False).reset_index().drop(['index'], axis=1)
    
    # Generate cycles
    for i in range(num_cycles):    
        if list(full_df.iloc[:, 2:]):
            print('3 vars')
            list1, list2, list3 = enforce_cycles(list(full_df[list(input_dict.keys())[0]]), 
                                                 list(full_df[list(input_dict.keys())[1]]), 
                                                 cycle_size, 
                                                 list(full_df[list(input_dict.keys())[2]]))
            new_df = pd.DataFrame({list(input_dict.keys())[0]: list1, 
                                    list(input_dict.keys())[1]: list2,
                                    list(input_dict.keys())[2]: list3}) 
        else:
            print('2 vars')
            list1, list2, list3 = enforce_cycles(list(full_df.iloc[:, 0]), 
                                                 list(full_df.iloc[:, 1]), 
                                                 cycle_size)
            new_df = pd.DataFrame({list(input_dict.keys())[0]: list1, 
                                    list(input_dict.keys())[1]: list2})
        full_df = pd.concat([full_df, new_df]).reset_index().drop('index', axis=1)
    return full_df

# Function to generate cycles in data
def enforce_cycles(entities_list, rs_list, size=3, *namelist): # rs_list : Shareholder UEN, Addresses etc
    list1, list2, list3 = [], [], []
    middle_node = ''
    start_node = random.choice(entities_list)
    
    namelist = namelist[0] if namelist else namelist # Get namelist from default tuple
    
    # Iterate through the number of expected nodes in cycle and add to list. E.g. en5 --> sh6, sh6 --> en5
    for i in range(size-1):
        while not middle_node or middle_node in list2:
            middle_node = random.choice(rs_list)
        if namelist:
            name = namelist[rs_list.index(middle_node)]
            list3.append(name)
        list1.append(start_node)
        list2.append(middle_node)
        start_node = middle_node

    # Adds the last row of relationship to create the cycle
    list1.append(start_node)
    temp = list1[0]
    list2.append(temp)
    if list3:
        list3.append(random.choice(namelist)) # Randomly assigns new Appointment to the other direction's relationship

    return list1, list2, list3


In [76]:
enforce_cycles(entities_list, appointments_list, 5, appointments_namelist)

(['en14', 'app3', 'app20', 'app11', 'app13'],
 ['app3', 'app20', 'app11', 'app13', 'en14'],
 ['Sales Manager', 'HR Manager', 'Project Lead', 'CTO', 'Deputy Director'])

In [77]:
# Generate Appointments data
appointments_df = generate_data({'Entity UEN': entities_list, 
                                 'Appointment UEN': appointments_list, 
                                 'Appointment Name':appointments_namelist},
                               10, 3, 5) # num_rs, num_cycles, cycle_size
appointments_df

3 vars
3 vars
3 vars


Unnamed: 0,Entity UEN,Appointment UEN,Appointment Name
0,en0,app0,Director
1,en1,app1,Finance Manager
2,en2,app2,HR Manager
3,en3,app3,Sales Manager
4,en4,app4,Sales Manager
5,en5,app5,Deputy Director
6,en6,app6,Finance Manager
7,en7,app7,HR Manager
8,en8,app8,HR Manager
9,en9,app9,HR Manager


In [78]:
# Generate Shareholders data
shareholders_df = generate_data({'Entity UEN': entities_list, 
                                 'Shareholder UEN': shareholders_list},
                               10, 4, 5)
shareholders_df

2 vars
2 vars
2 vars
2 vars


Unnamed: 0,Entity UEN,Shareholder UEN
0,en0,sh0
1,en1,sh1
2,en2,sh2
3,en3,sh3
4,en4,sh4
5,en5,sh5
6,en6,sh6
7,en7,sh7
8,en8,sh8
9,en9,sh9


In [79]:
# Generate addresses data
addresses_df = generate_data({'Entity UEN': entities_list, 
                              'Address': addresses_list},
                            10, 3, 5)
addresses_df

2 vars
2 vars
2 vars


Unnamed: 0,Entity UEN,Address
0,en0,add0
1,en1,add1
2,en2,add2
3,en3,add3
4,en4,add4
5,en5,add5
6,en6,add6
7,en7,add7
8,en8,add8
9,en9,add9


In [80]:
# Generate primary SSIC data
pri_ssic_df = generate_data({'Entity UEN': entities_list, 
                             'Primary SSIC': pri_ssic_list},
                           10, 3, 5)
pri_ssic_df

2 vars
2 vars
2 vars


Unnamed: 0,Entity UEN,Primary SSIC
0,en0,ssic0
1,en1,ssic1
2,en2,ssic2
3,en3,ssic3
4,en4,ssic4
5,en5,ssic5
6,en6,ssic6
7,en7,ssic7
8,en8,ssic8
9,en9,ssic9


In [85]:
# Generate corppass data
corppass_df = generate_data({'Entity UEN': entities_list, 
                             'CorpPass': corppass_list},
                           10, 3, 5)
corppass_df

2 vars
2 vars
2 vars


Unnamed: 0,Entity UEN,CorpPass
0,en0,cp0
1,en1,cp1
2,en2,cp2
3,en3,cp3
4,en4,cp4
5,en5,cp5
6,en6,cp6
7,en7,cp7
8,en8,cp8
9,en9,cp9


In [86]:
# # Creates datasets folder
# if generated_data_filepath.split('/')[1] in os.listdir():
#     print(True)
# else:
#     os.mkdir(generated_data_filepath)
    
# # Output files to csv

# shareholders_df.to_csv(shareholders_filepath, index = False)
# appointments_df.to_csv(appointments_filepath, index = False)
# addresses_df.to_csv(addresses_filepath, index = False)
# pri_ssic_df.to_csv(pri_ssic_filepath, index = False)
# corppass_df.to_csv(corppass_filepath, index = False)

True
