In [None]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.colors import LogNorm
from sklearn.ensemble import RandomForestRegressor
from sklearn.manifold import MDS, TSNE
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KernelDensity
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

# Seed once so that "Restart Kernel -> Run All" reproduces the same bootstrap
# sample. Seeding NumPy's global generator also makes later stochastic calls
# that are left at ``random_state=None`` repeatable within a full run.
SEED = 42
np.random.seed(SEED)

# NOTE(review): "tutajDaneRzeczywiste" looks like a placeholder for the path of
# the real, semicolon-separated data file -- confirm before running.
df = pd.read_csv("tutajDaneRzeczywiste", sep=';')

# Bootstrap-resample 300 rows with replacement. Sampling with replacement
# repeats index labels, so the index is reset to keep every row uniquely
# addressable in later label-based operations.
df = df.sample(n=300, replace=True, random_state=SEED).reset_index(drop=True)

def map_values(df, columns_mappings):
    """Return a copy of ``df`` with values replaced column by column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied first and never modified.
    columns_mappings : dict
        ``{column name: {old value: new value}}``. Each inner mapping is handed
        to ``Series.replace`` for that column, so values without an entry are
        left as they are.

    Returns
    -------
    pandas.DataFrame
        The copied frame with the replacements applied.
    """
    replaced = df.copy()
    for name in columns_mappings:
        replaced[name] = replaced[name].replace(columns_mappings[name])
    return replaced


def scale_to_zero_one(df, scaler):
    """Run ``scaler.transform`` on ``df`` and wrap the result in a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform; its column labels are reused for the result.
    scaler : object
        Any object exposing ``transform``; it must already be fitted.

    Returns
    -------
    pandas.DataFrame
        The transformed values under ``df``'s column labels. The result is built
        without an explicit index, so ``df``'s index is not passed along.
    """
    scaled_values = scaler.transform(df)
    return pd.DataFrame(data=scaled_values, columns=df.columns)


def back_to_original_scale(df, scaler):
    """Run ``scaler.inverse_transform`` on ``df`` and wrap the result in a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding scaled values; its column labels are reused for the result.
    scaler : object
        Any object exposing ``inverse_transform``; it must already be fitted.

    Returns
    -------
    pandas.DataFrame
        The inverse-transformed values under ``df``'s column labels. The result
        is built without an explicit index, so ``df``'s index is not passed along.
    """
    restored_values = scaler.inverse_transform(df)
    return pd.DataFrame(data=restored_values, columns=df.columns)


def to_int(df, columns):
    """Return a copy of ``df`` with the given columns cast to ``int32``.

    Float columns are rounded to the nearest integer before the cast. A plain
    ``astype('int32')`` truncates toward zero, so a value such as 29.999999
    (typical after an inverse scaling step) would silently become 29.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied first and never modified.
    columns : iterable of str
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The copied frame with the listed columns stored as ``int32``.

    Raises
    ------
    KeyError
        If a listed column is missing from ``df``.
    ValueError
        Propagated from ``astype`` when a column cannot be represented as an
        integer (for example because it contains NaN).
    """
    converted = df.copy()
    for col in columns:
        values = converted[col]
        # Only float data needs rounding; integer and string columns go
        # straight to the cast exactly as before.
        if pd.api.types.is_float_dtype(values):
            values = values.round()
        converted[col] = values.astype('int32')
    return converted

In [None]:
# --- Split the full name into first name and surname --------------------------
# ``str.split()`` without a separator tolerates repeated or leading/trailing
# whitespace, and ``.str[i]`` yields NaN instead of raising IndexError when a
# part is missing (e.g. a single-word entry); such gaps are stored as ''.
name_parts = df['Imię i Nazwisko'].str.split()
df['Imię'] = name_parts.str[0].fillna('')
df['Nazwisko'] = name_parts.str[1].fillna('')

# --- Drop the columns that are not modelled ------------------------------------
# One non-mutating call instead of three ``inplace=True`` drops.
df = df.drop(columns=['Imię i Nazwisko', 'Sygnatura czasowa', 'E-mail'])

# --- Encode the text columns as integer codes ----------------------------------
# ``pd.factorize`` returns the codes plus the unique values; the unique values
# are kept so that codes can be translated back to labels later on.
df["Stanowisko *"], stanowisko_mapping = pd.factorize(df["Stanowisko *"])
df["Imię"], imie_mapping = pd.factorize(df["Imię"])
df["Nazwisko"], nazwisko_mapping = pd.factorize(df["Nazwisko"])

# code -> original label lookups
stanowisko_mapping_dict = dict(enumerate(stanowisko_mapping))
imie_mapping_dict = dict(enumerate(imie_mapping))
nazwisko_mapping_dict = dict(enumerate(nazwisko_mapping))

# --- Build the numeric modelling table ------------------------------------------
# df_dataset keeps 'Płeć' as found in the file; df_dataset_num maps it to 1/0.
df_dataset = df.pipe(to_int, columns=['Wiek *'])
df_dataset_num = df_dataset.pipe(
    map_values,
    columns_mappings={'Płeć': {'K': 1, 'M': 0}},
)

# --- Scale, then invert the scaling as a round-trip check -----------------------
scaler = MinMaxScaler().fit(df_dataset_num)
df_dataset_scaled = scale_to_zero_one(df_dataset_num, scaler)
df_dataset_scaled_back = back_to_original_scale(df_dataset_scaled, scaler)

# --- Keep human-readable labels next to the codes -------------------------------
# Mapping with the dict directly (instead of a lambda doing ``dict[x]``) yields
# NaN rather than a KeyError for the -1 code that factorize assigns to missing
# values. These columns are added after df_dataset was derived, so they are not
# part of the modelling tables.
df['Stanowisko_original'] = df['Stanowisko *'].map(stanowisko_mapping_dict)
df['Imię_original'] = df['Imię'].map(imie_mapping_dict)
df['Nazwisko_original'] = df['Nazwisko'].map(nazwisko_mapping_dict)

display(df_dataset_scaled_back.sample(20))

In [None]:
# Pairwise cosine distances between the scaled records; this matrix is the
# precomputed dissimilarity handed to MDS below.
Dist_matrix = pairwise_distances(df_dataset_scaled, metric='cosine')

# Project the records into a 2-D latent space. MDS starts from a random
# configuration, so the seed is pinned to make re-running this cell return the
# same embedding (everything downstream is fitted on these coordinates).
projected_data = MDS(
    n_components=2,
    dissimilarity='precomputed',
    normalized_stress='auto',
    random_state=42,
).fit_transform(Dist_matrix)

df_dataset_latent = pd.DataFrame(projected_data, columns=['z1', 'z2'])

# NOTE(review): Dist_matrix_proj is not used anywhere in the visible notebook.
# If it is meant to be compared with Dist_matrix, confirm that cosine is the
# intended metric in the latent space.
Dist_matrix_proj = pairwise_distances(df_dataset_latent, metric='cosine')

# NOTE(review): columns_list is not referenced in the visible notebook either.
columns_list = [
    'Imię', 'Nazwisko',
    'Wiek *', 'Stanowisko *',
    'Płeć'
]

df_dataset_latent

In [None]:
# KernelDensity is already imported in the notebook's import cell, so the
# duplicate import that used to open this cell is gone.

# Fit a Gaussian kernel density estimate on the 2-D latent coordinates.
# NOTE(review): bandwidth=0.008 is a hand-picked value -- confirm it suits the
# scale of the latent space.
kde = KernelDensity(kernel='gaussian', bandwidth=0.008).fit(df_dataset_latent)

# Persist the fitted estimator next to the notebook.
joblib.dump(kde, 'kde.joblib')

# Draw as many synthetic latent points as there are real records. The seed is
# pinned so that re-running the cell returns the same samples.
df_samples_latent = pd.DataFrame(
    kde.sample(len(df_dataset_scaled), random_state=42),
    columns=df_dataset_latent.columns,
)

In [None]:
def make_decoder(model, features, target, display=None, test_size=0.8, random_state=None):
    """Fit ``model`` to predict ``target`` from ``features`` and optionally plot it.

    The data is split with ``train_test_split``; the model is fitted on the
    training part only and the held-out part is used for the diagnostic plots.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in place.
    features : array-like
        Predictor values, one row per record.
    target : array-like
        Values to predict. If it has a ``name`` attribute (e.g. a pandas
        Series), the name is used as the plot title.
    display : {None, 'cont', 'cat'}, optional
        ``'cont'`` draws target-vs-prediction scatter plots for the training
        and testing parts (axes fixed to [0, 1]); ``'cat'`` draws a confusion
        matrix for the testing part; any other value draws nothing.
    test_size : float or int, optional
        Forwarded to ``train_test_split``. The default 0.8 keeps the previous
        hard-coded behaviour, i.e. only 20 % of the rows are used for fitting.
        NOTE(review): 0.2 is the more usual choice -- confirm 0.8 is intended.
    random_state : int or None, optional
        Forwarded to ``train_test_split``; pass an int for a repeatable split.

    Returns
    -------
    estimator
        The same ``model`` object, now fitted.
    """
    (features_train, features_test,
     target_train, target_test) = train_test_split(
        features, target, test_size=test_size, random_state=random_state)

    model.fit(features_train, target_train)

    # Plain sequences have no ``name``; fall back to an untitled plot.
    title = getattr(target, 'name', None)

    if display == 'cont':
        # (features, true values, marker alpha, legend label) for each panel
        panels = (
            (features_train, target_train, 1, 'training data'),
            (features_test, target_test, 0.5, 'testing data'),
        )
        fig, axes = plt.subplots(1, 2, figsize=(6, 3), sharey=True)
        for ax, (panel_features, panel_target, alpha, label) in zip(axes, panels):
            ax.scatter(panel_target, model.predict(panel_features),
                       alpha=alpha, s=8, label=label)
            ax.axis('square')
            # Diagonal = perfect prediction.
            ax.plot([0, 1], [0, 1], color='black', label='ideal')
            ax.set_xlim([0, 1])
            ax.set_ylim([0, 1])
            ax.set_xlabel('target')
            ax.legend()
        # The y axis is shared, so only the left panel carries its label.
        axes[0].set_ylabel('prediction')
        fig.suptitle(title)
        plt.show()

    elif display == 'cat':
        pred_test = model.predict(features_test)
        # Include classes that occur only among the predictions; otherwise those
        # predictions would be left out of the matrix.
        labels = np.union1d(target_test, pred_test)

        fig, ax = plt.subplots(1, 1)
        cm = confusion_matrix(target_test, pred_test, labels=labels)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
        ax.set_title(title)
        disp.plot(ax=ax)
        plt.show()

    return model


def make_decoders(df_latent, df_target, cont_columns, cat_columns, display=False):
    """Train one decoder per target column, all from the same latent features.

    Parameters
    ----------
    df_latent : pandas.DataFrame
        Latent coordinates used as predictors by every decoder.
    df_target : pandas.DataFrame
        Frame holding the columns to decode.
    cont_columns : iterable of str
        Columns decoded with a ``RandomForestRegressor``.
    cat_columns : iterable of str
        Columns decoded with an ``SVC``; their values are rounded to whole
        numbers first and the rounded values serve as class labels.
    display : bool, optional
        When truthy, ``make_decoder`` is asked to draw its diagnostic plot
        (``'cont'`` for regressors, ``'cat'`` for classifiers).

    Returns
    -------
    dict
        ``{column name: fitted model}``, continuous columns first.
    """
    decoders = {}

    for col in cont_columns:
        decoders[col] = make_decoder(
            model=RandomForestRegressor(),
            features=df_latent,
            target=df_target[col],
            display='cont' if display else None,
        )

    for col in cat_columns:
        decoders[col] = make_decoder(
            model=SVC(),
            features=df_latent,
            target=df_target[col].round(),
            display='cat' if display else None,
        )

    return decoders

# Train one decoder per column: regressors for the numerically coded columns,
# a classifier for the binary 'Płeć' column.
# NOTE: the result stays bound to the name ``a`` because ``decode_sample``
# further down reads it from the notebook namespace.
a = make_decoders(
    df_latent=df_dataset_latent,
    df_target=df_dataset_scaled,
    cont_columns=['Wiek *', 'Imię', 'Nazwisko', 'Stanowisko *'],
    cat_columns=['Płeć'],
    display=True
)

# Make sure the output directory exists; dumping into a missing directory fails.
models_dir = Path('TrainedModels')
models_dir.mkdir(parents=True, exist_ok=True)

for key, value in a.items():
    # File names use the first word of the column name ('Wiek *' -> 'Wiek').
    # The split happens outside the f-string: reusing the f-string's own quote
    # character inside a replacement field is a SyntaxError before Python 3.12.
    column_tag = key.split(' ')[0]
    joblib.dump(value, models_dir / f'{column_tag}_{value.__class__.__name__}_trained.joblib')

In [None]:
def df_to_rows(df):
    """Split ``df`` into a list of one-row DataFrames.

    Every row produced by ``df.iterrows()`` is turned into a DataFrame and
    transposed, so each element has a single row (labelled with the original
    index label) and the same column labels as ``df``.

    Returns
    -------
    list of pandas.DataFrame
        One frame per row, in row order; empty when ``df`` has no rows.
    """
    single_row_frames = []
    for _, row in df.iterrows():
        single_row_frames.append(pd.DataFrame(row).transpose())
    return single_row_frames

def decode_sample(sample, decoders=None):
    """Run every decoder on ``sample`` and collect the predictions per column.

    Parameters
    ----------
    sample : array-like
        Latent input in whatever form the decoders' ``predict`` accepts.
    decoders : dict, optional
        ``{column name: fitted model}``. When omitted, the notebook-level
        dictionary ``a`` is used, which keeps existing one-argument calls
        working while letting callers pass the decoders explicitly.

    Returns
    -------
    dict
        ``{column name: decoder.predict(sample)}`` -- each value is whatever the
        decoder's ``predict`` returns for ``sample``.
    """
    if decoders is None:
        decoders = a
    return {col: decoder.predict(sample) for col, decoder in decoders.items()}

# One single-row frame per sampled latent point.
samples_latent = df_to_rows(df_samples_latent)

# ``decode_sample`` returns each decoder's raw ``predict`` output, which for a
# one-row input is a length-1 array. Storing those arrays directly gives a
# frame whose cells are arrays, and the inverse scaling below would then depend
# on implicit array-to-scalar conversion. Unwrap each prediction to a scalar.
decoded_records = [
    {col: np.ravel(pred)[0] for col, pred in decode_sample(sample).items()}
    for sample in samples_latent
]

# ``columns=`` puts the decoded columns in the order of df_dataset, which is
# the column order the scaler was fitted with.
df_decoded_samples = pd.DataFrame(decoded_records, columns=df_dataset.columns)

# Undo the min-max scaling to get values on the original scale.
df_synthetic_raw = back_to_original_scale(df_decoded_samples, scaler)

def round_to_2sd(df, columns):
    """Return a copy of ``df`` with the given columns rounded to 2 decimal places.

    Each value is passed through the built-in ``round(value, 2)``; columns that
    are not listed are left as they are, and ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame.
    columns : iterable of str
        Names of the columns to round.

    Returns
    -------
    pandas.DataFrame
        The copied frame with the listed columns rounded.
    """
    def _two_places(value):
        return round(value, 2)

    rounded = df.copy()
    for name in columns:
        rounded[name] = rounded[name].apply(_two_places)
    return rounded


def round_to_1sd(df, columns):
    """Return a copy of ``df`` with the given columns rounded to 1 decimal place.

    Each value is passed through the built-in ``round(value, 1)``; columns that
    are not listed are left as they are, and ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame.
    columns : iterable of str
        Names of the columns to round.

    Returns
    -------
    pandas.DataFrame
        The copied frame with the listed columns rounded.
    """
    def _one_place(value):
        return round(value, 1)

    rounded = df.copy()
    for name in columns:
        rounded[name] = rounded[name].apply(_one_place)
    return rounded


# Cast the decoded age column to integers.
df_synthetic_num = df_synthetic_raw.pipe(to_int, columns=['Wiek *'])

# Replace the numeric 'Płeć' codes with text labels.
# NOTE(review): the forward mapping used for the real data is 'K' -> 1 and
# 'M' -> 0, while the labels written here are 'Female'/'Male'; confirm that the
# differing label sets are intended wherever the two tables are compared.
df_synthetic = df_synthetic_num.pipe(
    map_values,
    columns_mappings={'Płeć': {1: 'Female', 0: 'Male'}},
)

# Attach the original text labels to the real data next to their integer codes
# (a code without an entry in its lookup raises KeyError).
df['Stanowisko_original'] = df['Stanowisko *'].map(stanowisko_mapping_dict.__getitem__)
df['Imię_original'] = df['Imię'].map(imie_mapping_dict.__getitem__)
df['Nazwisko_original'] = df['Nazwisko'].map(nazwisko_mapping_dict.__getitem__)

# Working copy whose coded columns are rescaled to integer codes further down.
df_synthetic_raw_scaled = df_synthetic_raw.copy()

columns_to_scale = ['Płeć','Stanowisko *', 'Imię', 'Nazwisko']


def min_max_scaling(column):
    """Stretch ``column`` onto ``[0, column.max()]`` and round to integers.

    The values are min-max normalised to [0, 1], multiplied by the column's
    maximum, rounded and cast to ``int``.

    A constant column (max == min) used to produce 0/0 = NaN, which made the
    integer cast fail; such a column is now simply rounded and cast.

    Parameters
    ----------
    column : pandas.Series
        Numeric values to rescale.

    Returns
    -------
    pandas.Series
        Integer values with the same index as ``column``.
    """
    low, high = column.min(), column.max()
    span = high - low
    if span == 0:
        # Nothing to stretch: every value is identical.
        return column.round().astype(int)
    return ((column - low) / span * high).round().astype(int)

# Turn the decoded (float) codes of the coded columns into integer codes.
df_synthetic_raw_scaled[columns_to_scale] = df_synthetic_raw_scaled[columns_to_scale].apply(min_max_scaling)

# Start from the decoded values and swap the integer codes for their labels.
# NOTE(review): the variable name carries a typo ("syntehtic"); it is kept so
# that any code referring to it keeps working.
df_syntehtic_mapped = df_synthetic_raw.copy()

df_syntehtic_mapped['Stanowisko *'] = df_synthetic_raw_scaled['Stanowisko *'].map(lambda x: stanowisko_mapping_dict[x])
df_syntehtic_mapped['Imię'] = df_synthetic_raw_scaled['Imię'].map(lambda x: imie_mapping_dict[x])
df_syntehtic_mapped['Nazwisko'] = df_synthetic_raw_scaled['Nazwisko'].map(lambda x: nazwisko_mapping_dict[x])

df_syntehtic_mapped['Wiek *'] = df_syntehtic_mapped['Wiek *'].map(lambda x: int(round(x)))
# Fixed copy-paste bug: 'Płeć' was previously filled from the 'Wiek *' column,
# which overwrote every sex code with the person's age.
df_syntehtic_mapped['Płeć'] = df_syntehtic_mapped['Płeć'].map(lambda x: int(round(x)))

df_syntehtic_mapped



In [None]:
# Compare the real table with the synthetic one using the table_evaluator package.
# NOTE(review): `load_data` is imported but not used in this cell, and this
# import sits apart from the notebook's main import cell.
from table_evaluator import load_data, TableEvaluator
# NOTE(review): df_dataset is derived before 'Płeć' is mapped to 1/0, while
# df_synthetic carries 'Female'/'Male' labels and un-rounded codes in the other
# coded columns -- confirm these two frames are the intended pair to compare.
table_evaluator = TableEvaluator(df_dataset, df_synthetic)



# Draw the evaluator's visual comparison of the two tables.
table_evaluator.visual_evaluation()