# Code to generate the agent model

In [1]:
import pandas as pd
import numpy as np
from distfit import distfit
import math
import pickle
import os

In [None]:
# Directory the notebook runs from; the relative 'Data/...' paths and the
# pickle files used in the cells below are resolved against it.
PATH_DIR = './'
os.chdir(PATH_DIR)

In [2]:
# Clustered user tables: one Excel workbook per cluster, stored in `df`
# under the cluster number as a string key ('0' .. '3').
PATH = 'Data/Original dataset 11-2022/'
FILE_NAME = 'clustered_users_'

df = {}

for i in range(4):
    cluster_id = str(i)
    df[cluster_id] = pd.read_excel(f'{PATH}{FILE_NAME}{cluster_id}.xlsx')

# Table with the distribution selected for each attribute. It is indexed by
# (CLUSTER, ATTRIBUTE) so a single row can be fetched with
# ds_distr.loc[cluster, attribute].
PATH = 'Data/'
FILE_NAME = 'selected_distributions.xlsx'

ds_distr = pd.read_excel(f'{PATH}{FILE_NAME}').set_index(['CLUSTER', 'ATTRIBUTE'])

In [3]:
def find_distribution(dataset, distr=None, bins='auto', method=0):
    """Fit a distfit model to ``dataset`` and return the fitted object.

    Parameters
    ----------
    dataset : array-like
        Observed values of a single attribute.
    distr : optional
        Forwarded to ``distfit(distr=...)`` when ``method`` is 0.
    bins : optional
        Forwarded to ``distfit(bins=...)`` when ``method`` is 0.
    method : optional
        0 (the default) builds the fitter from ``distr`` and ``bins`` only.
        Any other value is forwarded as ``distfit(method=...)`` and ``distr``
        and ``bins`` are ignored (the notebook uses ``'discrete'`` for the
        binary attributes).

    Returns
    -------
    The distfit object after ``fit_transform`` has been run on ``dataset``.
    """
    # Choose the constructor arguments for the requested kind of fit
    if method == 0:
        options = {'distr': distr, 'bins': bins}
    else:   # binary
        options = {'method': method}

    fitter = distfit(**options)

    # Determine the best-fitting probability distribution for the data
    fitter.fit_transform(dataset, verbose=0)

    return fitter

In [4]:
# Attributes modelled with a fitted distribution.
# NOTE: 'created_at' must stay last -- the generation cell treats
# int_attributes[:-1] as counts and int_attributes[-1] as the creation year.
int_attributes = [
    'friends_count',
    'followers_count',
    'listed_count',
    'favourites_count',
    'statuses_count',
    'created_at',
]

# Attributes modelled as binomial draws described by {'n', 'p'}.
binary_attributes = [
    'verified',
    'location',
    'protected',
    'geo_enabled',
    'default_profile',
    'default_profile_image',
]

## Generating the model

In [7]:
# model_dict[cluster][attribute] holds either a fitted distfit object
# (integer attributes) or a {'n', 'p'} dict (binary attributes).
model_dict = {}

for cluster, dataset in df.items():
    cluster_models = {}
    model_dict[cluster] = cluster_models

    # integer attributes: fit the distribution selected for this
    # (cluster, attribute) pair in ds_distr
    for attribute in int_attributes:
        observed = dataset[attribute]
        selected = ds_distr.loc[int(cluster), attribute]

        cluster_models[attribute] = find_distribution(observed, distr=selected['distr'])

    # binary attributes
    for attribute in binary_attributes:
        observed = dataset[attribute]
        selected = ds_distr.loc[int(cluster), attribute]
        n = selected['n']

        if math.isnan(n):
            # No preset value in the table: estimate n and p from the data
            fitted = find_distribution(observed, method='discrete')
            n = fitted.model['n']
            p = fitted.model['p']
        else:
            # Preset n in the table: with p = 1 every binomial draw equals n
            p = 1

        cluster_models[attribute] = {'n': n, 'p': p}

In [8]:
# Persist the fitted per-cluster models (written to the working directory)
PIK = "model_final.dat"

with open(PIK, mode="wb") as model_file:
    pickle.dump(model_dict, model_file)

## Generating new data based on the model

In [9]:
def generate_samples(model, n_samples=1000):
    """Return ``n_samples`` new values drawn from a fitted model.

    ``model`` must expose ``generate(n=...)``; whatever that call returns is
    passed back to the caller unchanged.
    """
    return model.generate(n=n_samples)

def generate_binary_samples(n, p, n_samples=1000):
    """Draw ``n_samples`` values from a binomial distribution.

    ``n`` is the number of trials and ``p`` the success probability of each
    trial; the result is a NumPy array of length ``n_samples``.
    """
    draws = np.random.binomial(n, p, size=n_samples)
    return draws

In [4]:
# Load the previously generated model.
# The model was originally read from a machine-specific absolute location.
# That location is still preferred when it exists, so existing setups behave
# as before; otherwise fall back to the copy the "Save the model" cell writes
# to the working directory, so the notebook also runs on other machines.
PATH = "D:/TFG/TFG Code/Datasets/Datasets/New generated users/"
PIK = "model_final.dat"

model_path = PATH + PIK if os.path.exists(PATH + PIK) else PIK

# NOTE: pickle can execute arbitrary code while loading -- only load model
# files that were produced by this notebook or come from a trusted source.
with open(model_path, "rb") as f:
    model_dict = pickle.load(f)

In [10]:
def _sample_in_range(model, n_samples, lower, upper=None):
    """Draw exactly ``n_samples`` values from ``model`` within the given bounds.

    Rejection sampling: values below ``lower`` (or above ``upper`` when it is
    given) are discarded, and only the missing amount is re-drawn until
    enough valid values have been collected.

    Parameters
    ----------
    model : fitted model accepted by ``generate_samples``
    n_samples : int
        Number of valid values to return.
    lower : number
        Inclusive lower bound.
    upper : number, optional
        Inclusive upper bound; ``None`` means unbounded above.

    Returns
    -------
    Array with ``n_samples`` values, all inside the requested range.
    """
    def _keep_valid(values):
        mask = values >= lower
        if upper is not None:
            mask &= (values <= upper)
        return values[mask]

    accepted = _keep_valid(generate_samples(model, n_samples=n_samples))

    while len(accepted) < n_samples:
        # Only ask for what is still missing so the result never overshoots
        missing = n_samples - len(accepted)
        extra = generate_samples(model, n_samples=missing)
        accepted = np.concatenate((accepted, _keep_valid(extra)))

    return accepted


generated_data = {}
total_samples = 100000
# Share of the generated users assigned to each cluster (percent / 100)
percentages = {'0': 0.04653327128897162 / 100, '1': 87.79989167664716 / 100, '2': 1.3532790699448465 / 100, '3': 10.800295982119017 / 100}

for cluster in model_dict:
    generated_data[cluster] = {}

    # Round to the nearest integer instead of truncating: with int() the four
    # clusters added up to 99,998 users rather than total_samples.
    n_samples = round(total_samples * percentages[cluster])

    # Synthetic ids have the form '<cluster>x<running index>'
    ids = [cluster + 'x' + str(i) for i in range(n_samples)]
    generated_data[cluster]['id'] = np.array(ids)

    # integer attributes (everything except the trailing 'created_at'):
    # non-negative draws, rounded to the nearest whole number
    for x in int_attributes[:-1]:
        new_data = _sample_in_range(model_dict[cluster][x], n_samples, lower=0)
        generated_data[cluster][x] = np.rint(new_data)

    # created_at: keep only draws within 2006..2023 (presumably calendar
    # years -- TODO confirm) and truncate them to integers
    x = int_attributes[-1]
    new_data = _sample_in_range(model_dict[cluster][x], n_samples, lower=2006, upper=2023)
    generated_data[cluster][x] = new_data.astype(np.int64)

    # binary attributes: binomial draws from the stored {'n', 'p'} parameters
    for x in binary_attributes:
        model = model_dict[cluster][x]
        generated_data[cluster][x] = generate_binary_samples(model['n'], model['p'], n_samples=n_samples)

In [12]:
# Write the synthetic users to disk, one Excel workbook per cluster
PATH = 'Data/Generated users/'
FILE_NAME = 'generated_users_'

for cluster, columns in generated_data.items():
    workbook_path = f'{PATH}{FILE_NAME}{cluster}.xlsx'
    pd.DataFrame(columns).to_excel(workbook_path, engine='xlsxwriter')

In [13]:
# Also keep the generated data (all clusters) as a single pickle file
PATH = 'Data/Generated users/'
FILE_NAME = 'user_model'

with open(f'{PATH}{FILE_NAME}.dat', mode="wb") as out_file:
    pickle.dump(generated_data, out_file)