From 6469898b93f7b66269dba0dc878adc4a79771677 Mon Sep 17 00:00:00 2001
From: Borovits <nemania.borovits@kpn.com>
Date: Wed, 24 Feb 2021 15:12:36 +0100
Subject: [PATCH] Added channel and program title to the columns to be encoded
 as categorical. Added channel and program title to be calculated in Jensen
 Shannon and KL divergence. Removed the calculation of CONTENT_ID for KL
 divergence. For the selected columns in Jensen Shannon and KL divergence the
 same unique values as the original dataset need to be generated (try/except
 code)

---
 evaluation.py | 284 +++++++++++++++++++++++++++-----------------------
 1 file changed, 156 insertions(+), 128 deletions(-)

diff --git a/evaluation.py b/evaluation.py
index ef6daf3..713f34a 100644
--- a/evaluation.py
+++ b/evaluation.py
@@ -24,6 +24,8 @@ def __init__(self, origdst, synthdst):
     def to_cat(dtr, dts):
 
         target_cols = list(dtr.columns[11:-3])
+        target_cols.insert(0, dtr.columns[1])  # channel
+        target_cols.insert(0, dtr.columns[2])  # program_title
         target_cols.insert(0, dtr.columns[3])  # genre
 
         #         flag_same_demographic_column_values = True
@@ -118,17 +120,28 @@ def jensen_shannon(self):
         real_cat, synth_cat = self.to_cat(self.origdst, self.synthdst)
 
         target_columns = list(self.origdst.columns[11:-3])
+        target_columns.append(self.origdst.columns[1])  # channel
+        target_columns.append(self.origdst.columns[2])  # program_title
         target_columns.append(self.origdst.columns[3])  # genre
 
         js_dict = {}
 
         for col in target_columns:
-            col_counts_orig = real_cat[col].value_counts(normalize=True).sort_index(ascending=True)
-            col_counts_synth = synth_cat[col].value_counts(normalize=True).sort_index(ascending=True)
 
-            js = distance.jensenshannon(asarray(col_counts_orig.tolist()), asarray(col_counts_synth.tolist()), base=2)
+            try:
+                col_counts_orig = real_cat[col].value_counts(normalize=True).sort_index(ascending=True)
+                col_counts_synth = synth_cat[col].value_counts(normalize=True).sort_index(ascending=True)
 
-            js_dict[col] = js
+                js = distance.jensenshannon(asarray(col_counts_orig.tolist()), asarray(col_counts_synth.tolist()),
+                                            base=2)
+
+                js_dict[col] = js
+
+            except:
+
+                print('For the column ', col, ' you must generate the same unique values as the real dataset.')
+                print('The number of unique values than you should generate for column ', col, 'is ',
+                      len(self.origdst[col].unique()))
 
         return js_dict
 
@@ -139,17 +152,28 @@ def kl_divergence(self):
         The threshold limit for this metric is a value below 2"""
 
         target_columns = list(self.origdst.columns[11:-3])
-        target_columns.append(self.origdst.columns[4])  # content_id
+        target_columns.append(self.origdst.columns[1])  # channel
+        target_columns.append(self.origdst.columns[2])  # program_title
+        target_columns.append(self.origdst.columns[3])  # genre
 
         kl_dict = {}
 
         for col in target_columns:
-            col_counts_orig = self.origdst[col].value_counts(normalize=True).sort_index(ascending=True)
-            col_counts_synth = self.synthdst[col].value_counts(normalize=True).sort_index(ascending=True)
 
-            kl = sum(rel_entr(col_counts_orig.tolist(), col_counts_synth.tolist()))
+            try:
+
+                col_counts_orig = self.origdst[col].value_counts(normalize=True).sort_index(ascending=True)
+                col_counts_synth = self.synthdst[col].value_counts(normalize=True).sort_index(ascending=True)
+
+                kl = sum(rel_entr(col_counts_orig.tolist(), col_counts_synth.tolist()))
 
-            kl_dict[col] = kl
+                kl_dict[col] = kl
+
+            except:
+
+                print('For the column ', col, ' you must generate the same unique values as the real dataset.')
+                print('The number of unique values than you should generate for column ', col, 'is ',
+                      len(self.origdst[col].unique()))
 
         return kl_dict
 
@@ -176,123 +200,127 @@ def pairwise_correlation_difference(self):
 
         return prwcrdst, substract_m
 
-    if __name__ == "__main__":
-
-        logging.basicConfig(filename='evaluation.log',
-                            format='%(asctime)s %(message)s',
-                            filemode='w')
-
-        logger = logging.getLogger()
-        logger.setLevel(logging.INFO)
-
-        ob = eval_metrics(r, ra)
-
-        # euclidean distance
-        flag_eucl = False
-        eucl, eumatr = ob.euclidean_dist()
-        logger.info('Euclidean distance was calculated')
-        print('The calculated euclidean distance is: ', eucl)
-        print('The calculated euclidean distance matrix is:', eumatr)
-        if eucl > 14:
-            logger.error(f'The calculated Euclidean distance value between the two correlation matrices is too high it should be \
-            less than 14. The current value is {eucl}')
-            logger.info(f'The Euclidean distance matrix is \n {eumatr}')
-        else:
-            logger.info('The dataset satisfies the criteria for the euclidean distance.')
-            logger.info(f'The calculated Euclidean distance value is \n {eucl}')
-            logger.info(f'The Euclidean distance matrix is \n {eumatr}')
-            flag_eucl = True
-        logger.info('---------------------------------------------------------')
-
-        # 2 sample Kolmogorov-Smirnov test
-        kst = ob.kolmogorov()
-
-        p_value = 0.05
-        flag_klg = False
-        logger.info('Kolmogorov-Smirnov test was performed')
-        print('The results of the Kolmogorov-Smirnov test is:', kst)
-        rejected = {}
-        for col in kst:
-            if kst[col]['p-value'] < p_value:
-                rejected[col] = kst[col]
-        if rejected:
-            logger.info('The dataset did not pass the Kolmogorov-Smirnov test')
-            logger.info(f'The columns that did not pass the test are \n {rejected}')
-            logger.info(f'The overall performance for the test is \n {kst}')
-        else:
-            logger.info('The dataset passed the Kolmogorov-Smirnov test')
-            logger.info(f'The overall performance for the test is \n {kst}')
-            flag_klg = True
-        logger.info('---------------------------------------------------------')
-
-        # Jensen-Shannon Divergence
-        dict_js = ob.jensen_shannon()
-        logger.info('Jensen-Shannon Divergence was calculated')
-        print('The result of the Jensen-Shannon Divergence is:', dict_js)
-        flag_js = False
-
-        jsd = deepcopy(dict_js)
-
-        for key in list(dict_js):
-            if (dict_js[key] < 0.50) & (key != 'CONTENT_ID'):
+
+if __name__ == "__main__":
+
+    logging.basicConfig(filename='evaluation.log',
+                        format='%(asctime)s %(message)s',
+                        filemode='w')
+
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+
+    ob = eval_metrics(r, ra)
+
+    # euclidean distance
+    flag_eucl = False
+    eucl, eumatr = ob.euclidean_dist()
+    logger.info('Euclidean distance was calculated')
+    print('The calculated euclidean distance is: ', eucl)
+    print('The calculated euclidean distance matrix is:', eumatr)
+    if eucl > 14:
+        logger.error(f'The calculated Euclidean distance value between the two correlation matrices is too high it should be \
+        less than 14. The current value is {eucl}')
+        logger.info(f'The Euclidean distance matrix is \n {eumatr}')
+    else:
+        logger.info('The dataset satisfies the criteria for the euclidean distance.')
+        logger.info(f'The calculated Euclidean distance value is \n {eucl}')
+        logger.info(f'The Euclidean distance matrix is \n {eumatr}')
+        flag_eucl = True
+    logger.info('---------------------------------------------------------')
+
+    # 2 sample Kolmogorov-Smirnov test
+    kst = ob.kolmogorov()
+
+    p_value = 0.05
+    flag_klg = False
+    logger.info('Kolmogorov-Smirnov test was performed')
+    print('The results of the Kolmogorov-Smirnov test is:', kst)
+    rejected = {}
+    for col in kst:
+        if kst[col]['p-value'] < p_value:
+            rejected[col] = kst[col]
+    if rejected:
+        logger.info('The dataset did not pass the Kolmogorov-Smirnov test')
+        logger.info(f'The columns that did not pass the test are \n {rejected}')
+        logger.info(f'The overall performance for the test is \n {kst}')
+    else:
+        logger.info('The dataset passed the Kolmogorov-Smirnov test')
+        logger.info(f'The overall performance for the test is \n {kst}')
+        flag_klg = True
+    logger.info('---------------------------------------------------------')
+
+    # Jensen-Shannon Divergence
+    dict_js = ob.jensen_shannon()
+    logger.info('Jensen-Shannon Divergence was calculated')
+    print('The result of the Jensen-Shannon Divergence is:', dict_js)
+    flag_js = False
+
+    jsd = deepcopy(dict_js)
+
+    for key in list(dict_js):
+        if (dict_js[key] < 0.50) & (key not in ['GENRE', 'PROGRAM_TITLE']):
+            del dict_js[key]
+        if key == 'GENRE':
+            if (dict_js[key] < 0.59):
                 del dict_js[key]
-            if key == 'CONTENT_ID':
-                if (dict_js[key] < 0.75):
-                    del dict_js[key]
-
-        if dict_js:
-            logger.info('The dataset did not pass the Jensen-Shannon Divergence test')
-            for key in dict_js.keys():
-                logger.info(f'The Jensen-Shannon Divergence value for the column {key} was {dict_js[key]}')
-            logger.info(f'The overall performance for each column is summarized below: \n {jsd}')
-        else:
-            logger.info('The dataset passed the Jensen-Shannon Divergence test')
-            logger.info(f'The overall performance for each column is summarized below: \n {jsd}')
-            flag_js = True
-        logger.info('---------------------------------------------------------')
-
-        # KL divergence
-        dict_kl = ob.kl_divergence()
-        logger.info('KL divergence was calculated')
-        print('The result of the KL divergence is', dict_kl)
-        flag_kl = False
-
-        kl = deepcopy(dict_kl)
-
-        for key in list(dict_kl):
-            if dict_kl[key] < 2.20:
-                del dict_kl[key]
-
-        if dict_kl:
-            logger.info('The dataset did not pass the KL divergence evaluation test')
-            for key in dict_kl.keys():
-                logger.info(f'The KL divergence value for the column {key} was {dict_kl[key]}')
-            logger.info(f'The overall for the KL divergence performance for each column is summarized below: \n {kl}')
-        else:
-            logger.info('The dataset passed the KL divergence evaluation test')
-            logger.info(f'The overall performance for the KL divergence for each column is summarized below: \n {kl}')
-            flag_kl = True
-        logger.info('---------------------------------------------------------')
-
-        # pairwise correlation difference
-        pair_corr_diff, pcd_matr = ob.pairwise_correlation_difference()
-        logger.info('Pairwise correlation difference was calculated')
-        print('The calculated Pairwise correlation difference was', pair_corr_diff)
-        print('The calculated Pairwise correlation difference matrix was', pcd_matr)
-
-        flag_pcd = False
-        if pair_corr_diff > 2.4:
-            logger.error(f'The calculated Euclidean distance value between the two correlation matrices is too high it should be \
-            less than 14. The current value is {pair_corr_diff}')
-            logger.info(f'The Pairwise distance distance matrix is \n {pcd_matr}')
-        else:
-            logger.info('The dataaset satisfies the criteria for the Pairwise Correlation Difference.')
-            logger.info(f'The Pairwise distance distance value is \n {pair_corr_diff}')
-            logger.info(f'The Pairwise distance distance matrix is \n {pcd_matr}')
-            flag_pcd = True
-
-        if (flag_eucl & flag_js & flag_klg & flag_kl & flag_pcd):
-            logger.info('The dataaset satisfies the minimum evaluation criteria.')
-        else:
-            logger.info('The dataaset does not satisfy the minimum evaluation criteria.')
-            logger.info('Plese check the previous log messages.')
\ No newline at end of file
+        if key == 'PROGRAM_TITLE':
+            if (dict_js[key] < 0.69):
+                del dict_js[key]
+
+    if dict_js:
+        logger.info('The dataset did not pass the Jensen-Shannon Divergence test')
+        for key in dict_js.keys():
+            logger.info(f'The Jensen-Shannon Divergence value for the column {key} was {dict_js[key]}')
+        logger.info(f'The overall performance for each column is summarized below: \n {jsd}')
+    else:
+        logger.info('The dataset passed the Jensen-Shannon Divergence test')
+        logger.info(f'The overall performance for each column is summarized below: \n {jsd}')
+        flag_js = True
+    logger.info('---------------------------------------------------------')
+
+    # KL divergence
+    dict_kl = ob.kl_divergence()
+    logger.info('KL divergence was calculated')
+    print('The result of the KL divergence is', dict_kl)
+    flag_kl = False
+
+    kl = deepcopy(dict_kl)
+
+    for key in list(dict_kl):
+        if dict_kl[key] < 2.20:
+            del dict_kl[key]
+
+    if dict_kl:
+        logger.info('The dataset did not pass the KL divergence evaluation test')
+        for key in dict_kl.keys():
+            logger.info(f'The KL divergence value for the column {key} was {dict_kl[key]}')
+        logger.info(f'The overall for the KL divergence performance for each column is summarized below: \n {kl}')
+    else:
+        logger.info('The dataset passed the KL divergence evaluation test')
+        logger.info(f'The overall performance for the KL divergence for each column is summarized below: \n {kl}')
+        flag_kl = True
+    logger.info('---------------------------------------------------------')
+
+    # pairwise correlation difference
+    pair_corr_diff, pcd_matr = ob.pairwise_correlation_difference()
+    logger.info('Pairwise correlation difference was calculated')
+    print('The calculated Pairwise correlation difference was', pair_corr_diff)
+    print('The calculated Pairwise correlation difference matrix was', pcd_matr)
+
+    flag_pcd = False
+    if pair_corr_diff > 2.4:
+        logger.error(f'The calculated Euclidean distance value between the two correlation matrices is too high it should be \
+        less than 14. The current value is {pair_corr_diff}')
+        logger.info(f'The Pairwise distance distance matrix is \n {pcd_matr}')
+    else:
+        logger.info('The dataaset satisfies the criteria for the Pairwise Correlation Difference.')
+        logger.info(f'The Pairwise distance distance value is \n {pair_corr_diff}')
+        logger.info(f'The Pairwise distance distance matrix is \n {pcd_matr}')
+        flag_pcd = True
+
+    if (flag_eucl & flag_js & flag_klg & flag_kl & flag_pcd):
+        logger.info('The dataaset satisfies the minimum evaluation criteria.')
+    else:
+        logger.info('The dataaset does not satisfy the minimum evaluation criteria.')
+        logger.info('Plese check the previous log messages.')
\ No newline at end of file