diff --git a/evaluation.py b/evaluation.py index ef6daf3..713f34a 100644 --- a/evaluation.py +++ b/evaluation.py @@ -24,6 +24,8 @@ def __init__(self, origdst, synthdst): def to_cat(dtr, dts): target_cols = list(dtr.columns[11:-3]) + target_cols.insert(0, dtr.columns[1]) # channel + target_cols.insert(0, dtr.columns[2]) # program_title target_cols.insert(0, dtr.columns[3]) # genre # flag_same_demographic_column_values = True @@ -118,17 +120,28 @@ def jensen_shannon(self): real_cat, synth_cat = self.to_cat(self.origdst, self.synthdst) target_columns = list(self.origdst.columns[11:-3]) + target_columns.append(self.origdst.columns[1]) # channel + target_columns.append(self.origdst.columns[2]) # program_title target_columns.append(self.origdst.columns[3]) # genre js_dict = {} for col in target_columns: - col_counts_orig = real_cat[col].value_counts(normalize=True).sort_index(ascending=True) - col_counts_synth = synth_cat[col].value_counts(normalize=True).sort_index(ascending=True) - js = distance.jensenshannon(asarray(col_counts_orig.tolist()), asarray(col_counts_synth.tolist()), base=2) + try: + col_counts_orig = real_cat[col].value_counts(normalize=True).sort_index(ascending=True) + col_counts_synth = synth_cat[col].value_counts(normalize=True).sort_index(ascending=True) - js_dict[col] = js + js = distance.jensenshannon(asarray(col_counts_orig.tolist()), asarray(col_counts_synth.tolist()), + base=2) + + js_dict[col] = js + + except: + + print('For the column ', col, ' you must generate the same unique values as the real dataset.') + print('The number of unique values than you should generate for column ', col, 'is ', + len(self.origdst[col].unique())) return js_dict @@ -139,17 +152,28 @@ def kl_divergence(self): The threshold limit for this metric is a value below 2""" target_columns = list(self.origdst.columns[11:-3]) - target_columns.append(self.origdst.columns[4]) # content_id + target_columns.append(self.origdst.columns[1]) # channel + target_columns.append(self.origdst.columns[2]) # program_title + target_columns.append(self.origdst.columns[3]) # genre kl_dict = {} for col in target_columns: - col_counts_orig = self.origdst[col].value_counts(normalize=True).sort_index(ascending=True) - col_counts_synth = self.synthdst[col].value_counts(normalize=True).sort_index(ascending=True) - kl = sum(rel_entr(col_counts_orig.tolist(), col_counts_synth.tolist())) + try: + + col_counts_orig = self.origdst[col].value_counts(normalize=True).sort_index(ascending=True) + col_counts_synth = self.synthdst[col].value_counts(normalize=True).sort_index(ascending=True) + + kl = sum(rel_entr(col_counts_orig.tolist(), col_counts_synth.tolist())) - kl_dict[col] = kl + kl_dict[col] = kl + + except: + + print('For the column ', col, ' you must generate the same unique values as the real dataset.') + print('The number of unique values than you should generate for column ', col, 'is ', + len(self.origdst[col].unique())) return kl_dict @@ -176,123 +200,127 @@ def pairwise_correlation_difference(self): return prwcrdst, substract_m - if __name__ == "__main__": - - logging.basicConfig(filename='evaluation.log', - format='%(asctime)s %(message)s', - filemode='w') - - logger = logging.getLogger() - logger.setLevel(logging.INFO) - - ob = eval_metrics(r, ra) - - # euclidean distance - flag_eucl = False - eucl, eumatr = ob.euclidean_dist() - logger.info('Euclidean distance was calculated') - print('The calculated euclidean distance is: ', eucl) - print('The calculated euclidean distance matrix is:', eumatr) - if eucl > 14: - logger.error(f'The calculated Euclidean distance value between the two correlation matrices is too high it should be \ - less than 14. The current value is {eucl}') - logger.info(f'The Euclidean distance matrix is \n {eumatr}') - else: - logger.info('The dataset satisfies the criteria for the euclidean distance.') - logger.info(f'The calculated Euclidean distance value is \n {eucl}') - logger.info(f'The Euclidean distance matrix is \n {eumatr}') - flag_eucl = True - logger.info('---------------------------------------------------------') - - # 2 sample Kolmogorov-Smirnov test - kst = ob.kolmogorov() - - p_value = 0.05 - flag_klg = False - logger.info('Kolmogorov-Smirnov test was performed') - print('The results of the Kolmogorov-Smirnov test is:', kst) - rejected = {} - for col in kst: - if kst[col]['p-value'] < p_value: - rejected[col] = kst[col] - if rejected: - logger.info('The dataset did not pass the Kolmogorov-Smirnov test') - logger.info(f'The columns that did not pass the test are \n {rejected}') - logger.info(f'The overall performance for the test is \n {kst}') - else: - logger.info('The dataset passed the Kolmogorov-Smirnov test') - logger.info(f'The overall performance for the test is \n {kst}') - flag_klg = True - logger.info('---------------------------------------------------------') - - # Jensen-Shannon Divergence - dict_js = ob.jensen_shannon() - logger.info('Jensen-Shannon Divergence was calculated') - print('The result of the Jensen-Shannon Divergence is:', dict_js) - flag_js = False - - jsd = deepcopy(dict_js) - - for key in list(dict_js): - if (dict_js[key] < 0.50) & (key != 'CONTENT_ID'): + +if __name__ == "__main__": + + logging.basicConfig(filename='evaluation.log', + format='%(asctime)s %(message)s', + filemode='w') + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + + ob = eval_metrics(r, ra) + + # euclidean distance + flag_eucl = False + eucl, eumatr = ob.euclidean_dist() + logger.info('Euclidean distance was calculated') + print('The calculated euclidean distance is: ', eucl) + print('The calculated euclidean distance matrix is:', eumatr) + if eucl > 14: + logger.error(f'The calculated Euclidean distance value between the two correlation matrices is too high it should be \ + less than 14. The current value is {eucl}') + logger.info(f'The Euclidean distance matrix is \n {eumatr}') + else: + logger.info('The dataset satisfies the criteria for the euclidean distance.') + logger.info(f'The calculated Euclidean distance value is \n {eucl}') + logger.info(f'The Euclidean distance matrix is \n {eumatr}') + flag_eucl = True + logger.info('---------------------------------------------------------') + + # 2 sample Kolmogorov-Smirnov test + kst = ob.kolmogorov() + + p_value = 0.05 + flag_klg = False + logger.info('Kolmogorov-Smirnov test was performed') + print('The results of the Kolmogorov-Smirnov test is:', kst) + rejected = {} + for col in kst: + if kst[col]['p-value'] < p_value: + rejected[col] = kst[col] + if rejected: + logger.info('The dataset did not pass the Kolmogorov-Smirnov test') + logger.info(f'The columns that did not pass the test are \n {rejected}') + logger.info(f'The overall performance for the test is \n {kst}') + else: + logger.info('The dataset passed the Kolmogorov-Smirnov test') + logger.info(f'The overall performance for the test is \n {kst}') + flag_klg = True + logger.info('---------------------------------------------------------') + + # Jensen-Shannon Divergence + dict_js = ob.jensen_shannon() + logger.info('Jensen-Shannon Divergence was calculated') + print('The result of the Jensen-Shannon Divergence is:', dict_js) + flag_js = False + + jsd = deepcopy(dict_js) + + for key in list(dict_js): + if (dict_js[key] < 0.50) & (key not in ['GENRE', 'PROGRAM_TITLE']): + del dict_js[key] + if key == 'GENRE': + if (dict_js[key] < 0.59): del dict_js[key] - if key == 'CONTENT_ID': - if (dict_js[key] < 0.75): - del dict_js[key] - - if dict_js: - logger.info('The dataset did not pass the Jensen-Shannon Divergence test') - for key in dict_js.keys(): - logger.info(f'The Jensen-Shannon Divergence value for the column {key} was {dict_js[key]}') - logger.info(f'The overall performance for each column is summarized below: \n {jsd}') - else: - logger.info('The dataset passed the Jensen-Shannon Divergence test') - logger.info(f'The overall performance for each column is summarized below: \n {jsd}') - flag_js = True - logger.info('---------------------------------------------------------') - - # KL divergence - dict_kl = ob.kl_divergence() - logger.info('KL divergence was calculated') - print('The result of the KL divergence is', dict_kl) - flag_kl = False - - kl = deepcopy(dict_kl) - - for key in list(dict_kl): - if dict_kl[key] < 2.20: - del dict_kl[key] - - if dict_kl: - logger.info('The dataset did not pass the KL divergence evaluation test') - for key in dict_kl.keys(): - logger.info(f'The KL divergence value for the column {key} was {dict_kl[key]}') - logger.info(f'The overall for the KL divergence performance for each column is summarized below: \n {kl}') - else: - logger.info('The dataset passed the KL divergence evaluation test') - logger.info(f'The overall performance for the KL divergence for each column is summarized below: \n {kl}') - flag_kl = True - logger.info('---------------------------------------------------------') - - # pairwise correlation difference - pair_corr_diff, pcd_matr = ob.pairwise_correlation_difference() - logger.info('Pairwise correlation difference was calculated') - print('The calculated Pairwise correlation difference was', pair_corr_diff) - print('The calculated Pairwise correlation difference matrix was', pcd_matr) - - flag_pcd = False - if pair_corr_diff > 2.4: - logger.error(f'The calculated Euclidean distance value between the two correlation matrices is too high it should be \ - less than 14. The current value is {pair_corr_diff}') - logger.info(f'The Pairwise distance distance matrix is \n {pcd_matr}') - else: - logger.info('The dataaset satisfies the criteria for the Pairwise Correlation Difference.') - logger.info(f'The Pairwise distance distance value is \n {pair_corr_diff}') - logger.info(f'The Pairwise distance distance matrix is \n {pcd_matr}') - flag_pcd = True - - if (flag_eucl & flag_js & flag_klg & flag_kl & flag_pcd): - logger.info('The dataaset satisfies the minimum evaluation criteria.') - else: - logger.info('The dataaset does not satisfy the minimum evaluation criteria.') - logger.info('Plese check the previous log messages.') \ No newline at end of file + if key == 'PROGRAM_TITLE': + if (dict_js[key] < 0.69): + del dict_js[key] + + if dict_js: + logger.info('The dataset did not pass the Jensen-Shannon Divergence test') + for key in dict_js.keys(): + logger.info(f'The Jensen-Shannon Divergence value for the column {key} was {dict_js[key]}') + logger.info(f'The overall performance for each column is summarized below: \n {jsd}') + else: + logger.info('The dataset passed the Jensen-Shannon Divergence test') + logger.info(f'The overall performance for each column is summarized below: \n {jsd}') + flag_js = True + logger.info('---------------------------------------------------------') + + # KL divergence + dict_kl = ob.kl_divergence() + logger.info('KL divergence was calculated') + print('The result of the KL divergence is', dict_kl) + flag_kl = False + + kl = deepcopy(dict_kl) + + for key in list(dict_kl): + if dict_kl[key] < 2.20: + del dict_kl[key] + + if dict_kl: + logger.info('The dataset did not pass the KL divergence evaluation test') + for key in dict_kl.keys(): + logger.info(f'The KL divergence value for the column {key} was {dict_kl[key]}') + logger.info(f'The overall for the KL divergence performance for each column is summarized below: \n {kl}') + else: + logger.info('The dataset passed the KL divergence evaluation test') + logger.info(f'The overall performance for the KL divergence for each column is summarized below: \n {kl}') + flag_kl = True + logger.info('---------------------------------------------------------') + + # pairwise correlation difference + pair_corr_diff, pcd_matr = ob.pairwise_correlation_difference() + logger.info('Pairwise correlation difference was calculated') + print('The calculated Pairwise correlation difference was', pair_corr_diff) + print('The calculated Pairwise correlation difference matrix was', pcd_matr) + + flag_pcd = False + if pair_corr_diff > 2.4: + logger.error(f'The calculated Euclidean distance value between the two correlation matrices is too high it should be \ + less than 14. The current value is {pair_corr_diff}') + logger.info(f'The Pairwise distance distance matrix is \n {pcd_matr}') + else: + logger.info('The dataaset satisfies the criteria for the Pairwise Correlation Difference.') + logger.info(f'The Pairwise distance distance value is \n {pair_corr_diff}') + logger.info(f'The Pairwise distance distance matrix is \n {pcd_matr}') + flag_pcd = True + + if (flag_eucl & flag_js & flag_klg & flag_kl & flag_pcd): + logger.info('The dataaset satisfies the minimum evaluation criteria.') + else: + logger.info('The dataaset does not satisfy the minimum evaluation criteria.') + logger.info('Plese check the previous log messages.') \ No newline at end of file