In [77]:
import os
import sys
import platform
import timeit
import numpy as np
import pandas as pd
OS = platform.system()   # Host operating system name as reported by platform.system(); data_download() compares it with "Linux" before unzipping

#**Data Download**

In [None]:
# Clone the Synthetic_Data repository into the working directory; the dataset paths
# configured later in this notebook point into ./Synthetic_Data.
# The value returned by os.system is shown as the cell result (0 in the saved output).
os.system('git clone https://github.com/istat-methodology/Synthetic_Data.git')

0

In [None]:
# Install third-party packages through the shell's `pip` from inside the notebook session.
# NOTE(review): sdgym is installed without a version pin while matplotlib is pinned to 3.1.3;
# the sdv imports used by this notebook presumably arrive as a dependency of sdgym — confirm.
os.system('pip install sdgym')
os.system('pip install matplotlib==3.1.3')

0

In [78]:
from sdv.evaluation import evaluate
from sdv.metrics.tabular import CSTest, KSTest                                      # Statistical Metrics
from sdv.metrics.tabular import BNLikelihood, BNLogLikelihood, GMLogLikelihood      # Likelihood Metrics¶
from sdv.metrics.tabular import LogisticDetection, SVCDetection                     # Detection Metrics
from sdv.metrics.tabular import MulticlassDecisionTreeClassifier                    # Machine Learning Efficacy Metrics¶
from sdv.metrics.tabular import NumericalLR 

In [79]:
# Raise pandas' display limits so frames with up to 500 columns / 500 rows print untruncated.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, 500)

In [80]:
# Excel workbook holding the original telephony dataset (path is relative to the working
# directory and points into the ./Synthetic_Data folder).
orig_data_path = './Synthetic_Data/Datasets/syntetic_telephony.xlsx'
# Excel workbook holding the generated (synthetic) dataset that is loaded as dest_data.
# NOTE(review): the method name, total time and score are only part of the file name;
# nothing in this notebook section parses them.
dest_data_path = './Synthetic_Data/Output/Telephony Synt Results/telephony_synth_data_generated_by_method_copulagantotal_time_2863.58_score_0.45.xlsx'

In [81]:
def _print_section(title, build):
  """Print `title`, then the value returned by `build()` (if any).

  A failure inside `build()` is reported on stdout instead of being raised, so one
  broken summary does not prevent the remaining sections from being printed.
  """
  print(title)
  try:
    result = build()
  except Exception as exc:  # best-effort report: show why the section is missing
    print("  [section skipped: {}: {}]".format(type(exc).__name__, exc))
    return
  # DataFrame.info() prints by itself and returns None; everything else is printed here.
  if result is not None:
    print(result)


def _null_counts(data):
  """Return one "<column>: <number of nulls>" line per column, joined by newlines."""
  # str.format() copes with non-string column labels (ints, tuples, ...).
  return "\n".join("{}: {}".format(column, count)
                   for column, count in data.isna().sum().items())


def explore_data(data): 
  """Print a textual overview of a pandas DataFrame.

  Sections, in order: head, tail, shape, ``info()``, dtypes, summary of the numeric
  columns, summary of the object (categorical) columns, and the null count per column.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame to describe. It is only read, never modified.

  Returns
  -------
  None
      Everything is written to stdout.

  Notes
  -----
  The head/tail/shape lines are not guarded and raise if `data` lacks those members.
  Every later section is best-effort: if it fails (for example ``describe`` when no
  column of the requested kind exists) a one-line notice is printed in its place.
  """
  print("\nHead of Data: \n", data.head())
  print("\nTail of Data: \n", data.tail())
  print("\nShape of Data: ", data.shape)
  _print_section("\nInformation about Data: \n", lambda: data.info())
  _print_section("\nTypes of Data attributes: \n", lambda: data.dtypes)
  _print_section("\nSummary of all numerical fields in the dataset: \n",
                 lambda: data.describe(include = [np.number]))
  _print_section("\nSummary of all categorical fields in the dataset: \n",
                 lambda: data.describe(include = ['O']))
  _print_section("\nLoop Through Each Column and Check for nulls: \n",
                 lambda: _null_counts(data))

def data_download(file_to_download, gdrive_code, OS, uncompress = True):
  """Download a file with ``gdown`` unless it already exists, optionally unzipping it.

  Parameters
  ----------
  file_to_download : str
      Path the download is written to; also the archive handed to ``unzip``.
  gdrive_code : str
      Identifier passed to ``gdown --id``.
  OS : str
      Operating system name; the archive is only extracted when this equals "Linux".
  uncompress : bool, default True
      Whether to run ``unzip`` on the downloaded file (Linux only).

  Returns
  -------
  True or None
      ``None`` when `file_to_download` already exists (nothing is run), ``True`` when
      the download was attempted. ``True`` does not mean the commands succeeded: their
      exit status is not inspected.
  """
  # Local import: the notebook's import cell does not load subprocess.
  import subprocess

  if os.path.exists(file_to_download):
    return None

  # Argument lists are used instead of a concatenated shell string so that paths and
  # ids containing spaces or shell metacharacters reach the tools unmodified.
  commands = [['gdown', '--id', gdrive_code, '--output', file_to_download]]
  if OS == "Linux" and uncompress:
    commands.append(['unzip', '-o', '-n', './' + file_to_download, '-d', './'])

  for command in commands:
    try:
      subprocess.run(command, check=False)
    except OSError as exc:
      # e.g. the tool is not installed; report it and keep going rather than raising.
      print("Could not run {!r}: {}".format(command[0], exc))
  return True

In [113]:
# Parse every sheet of the original workbook into a {sheet name: DataFrame} mapping.
# The context manager closes the workbook handle as soon as parsing is finished
# instead of leaving it open for the rest of the kernel session.
with pd.ExcelFile(orig_data_path) as xl_file:
    dfs = {sheet_name: xl_file.parse(sheet_name) for sheet_name in xl_file.sheet_names}
# Only the sheet named 'Sheet1' is used as the original dataset.
orig_data = dfs['Sheet1']

In [114]:
# Renaming Columns into English
orig_data.rename(columns = {'COD_CELLA_CHIAMATA':'CELL_CALL_CODE', 
                      'DATA_CHIAMATA':'CALL_DATE',
                      'CHIAVE_NUM_CHIAMANTE':'NUM_CALLER_KEY',
                      'ORA_MIN_CHIAMATA':'TIME_MIN_CALL'}, inplace = True)

# Casting Datatypes  
orig_data.CELL_CALL_CODE=orig_data.CELL_CALL_CODE.astype("int64").astype("str")

orig_data.CALL_DATE=pd.to_datetime(orig_data.CALL_DATE, format='%Y%m%d')
orig_data.NUM_CALLER_KEY=orig_data.NUM_CALLER_KEY.astype("int64").astype("str")

orig_data.TIME_MIN_CALL=orig_data.TIME_MIN_CALL.astype("str").str.pad(width=6, side='left', fillchar='0')
orig_data.TIME_MIN_CALL =pd.to_datetime(orig_data.TIME_MIN_CALL , format='%H%M%S').dt.time.astype("str")
orig_data.TIME_MIN_CALL =pd.to_datetime(orig_data.TIME_MIN_CALL)

explore_data(orig_data)


Head of Data: 
       NUM_CALLER_KEY  CALL_DATE       TIME_MIN_CALL CELL_CALL_CODE
0  14615194667453690 2017-01-22 2022-05-19 15:49:53              0
1  14615194667453690 2017-01-29 2022-05-19 12:40:34              1
2  14615194667453690 2017-01-29 2022-05-19 12:41:19              1
3  14615194667453690 2017-01-22 2022-05-19 15:51:02              1
4  14615194667453690 2017-01-22 2022-05-19 15:51:38              1

Tail of Data: 
          NUM_CALLER_KEY  CALL_DATE       TIME_MIN_CALL CELL_CALL_CODE
9995  14615368752090240 2017-01-12 2022-05-19 12:09:10            272
9996  14615368752090240 2017-02-12 2022-05-19 18:26:01            272
9997   1508338528891430 2017-01-04 2022-05-19 09:26:27            272
9998  14615368752090240 2017-02-10 2022-05-19 15:11:58            272
9999  14615337422429130 2017-01-30 2022-05-19 09:09:03            272

Shape of Data:  (10000, 4)

Information about Data: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (

In [115]:
# Parse every sheet of the synthetic-data workbook into a {sheet name: DataFrame} mapping.
# The context manager closes the workbook handle as soon as parsing is finished
# instead of leaving it open for the rest of the kernel session.
with pd.ExcelFile(dest_data_path) as xl_file:
    dfs = {sheet_name: xl_file.parse(sheet_name) for sheet_name in xl_file.sheet_names}
# Only the sheet named 'Sheet1' is used as the synthetic dataset.
dest_data = dfs['Sheet1']

In [116]:
# Renaming Columns into English
dest_data.rename(columns = {'COD_CELLA_CHIAMATA':'CELL_CALL_CODE', 
                      'DATA_CHIAMATA':'CALL_DATE',
                      'CHIAVE_NUM_CHIAMANTE':'NUM_CALLER_KEY',
                      'ORA_MIN_CHIAMATA':'TIME_MIN_CALL'}, inplace = True)

# Removing F character from Dataframe
dest_data["CELL_CALL_CODE"] = dest_data["CELL_CALL_CODE"].str.replace("F","")
dest_data["NUM_CALLER_KEY"] = dest_data["NUM_CALLER_KEY"].str.replace("F","")

# Casting Datatypes  
dest_data.CELL_CALL_CODE=dest_data.CELL_CALL_CODE.astype("int64").astype("str")

dest_data.CALL_DATE=pd.to_datetime(dest_data.CALL_DATE, format='%Y%m%d')
dest_data.NUM_CALLER_KEY=dest_data.NUM_CALLER_KEY.astype("int64").astype("str")

dest_data.TIME_MIN_CALL=dest_data.TIME_MIN_CALL.astype("str").str.pad(width=6, side='left', fillchar='0')
dest_data.TIME_MIN_CALL =pd.to_datetime(dest_data.TIME_MIN_CALL).dt.time.astype("str")
dest_data.TIME_MIN_CALL =pd.to_datetime(dest_data.TIME_MIN_CALL)

explore_data(dest_data)


Head of Data: 
    Unnamed: 0     NUM_CALLER_KEY  CALL_DATE       TIME_MIN_CALL CELL_CALL_CODE
0           0  14615366112382500 2017-01-30 2022-05-19 15:54:19            258
1           1  14615362915496210 2017-01-14 2022-05-19 11:54:30            781
2           2  14615430050428180 2017-01-20 2022-05-19 21:25:36             18
3           3  14615339806425970 2017-01-09 2022-05-19 17:36:41            311
4           4  14615390824437770 2017-01-06 2022-05-19 13:20:43            565

Tail of Data: 
       Unnamed: 0     NUM_CALLER_KEY  CALL_DATE       TIME_MIN_CALL  \
9995        9995  14615362915496210 2017-01-04 2022-05-19 12:00:18   
9996        9996  14615224339316630 2017-01-05 2022-05-19 08:12:24   
9997        9997  14615228723636630 2017-01-02 2022-05-19 16:51:31   
9998        9998  14615362915496210 2017-01-25 2022-05-19 21:06:11   
9999        9999  14615420085225040 2017-02-06 2022-05-19 15:19:53   

     CELL_CALL_CODE  
9995            107  
9996            231  
9997 

In [None]:
# Applies pd.to_datetime to TIME_MIN_CALL once more; the same statement already ends the
# casting cell for dest_data. NOTE(review): looks redundant — confirm before removing.
dest_data.TIME_MIN_CALL =pd.to_datetime(dest_data.TIME_MIN_CALL)

In [None]:
# Print the dest_data overview again; the saved output matches the earlier
# explore_data(dest_data) output.
explore_data(dest_data)


Head of Data: 
    Unnamed: 0     NUM_CALLER_KEY  CALL_DATE       TIME_MIN_CALL CELL_CALL_CODE
0           0  14615366112382500 2017-01-30 2022-05-19 15:54:19            258
1           1  14615362915496210 2017-01-14 2022-05-19 11:54:30            781
2           2  14615430050428180 2017-01-20 2022-05-19 21:25:36             18
3           3  14615339806425970 2017-01-09 2022-05-19 17:36:41            311
4           4  14615390824437770 2017-01-06 2022-05-19 13:20:43            565

Tail of Data: 
       Unnamed: 0     NUM_CALLER_KEY  CALL_DATE       TIME_MIN_CALL  \
9995        9995  14615362915496210 2017-01-04 2022-05-19 12:00:18   
9996        9996  14615224339316630 2017-01-05 2022-05-19 08:12:24   
9997        9997  14615228723636630 2017-01-02 2022-05-19 16:51:31   
9998        9998  14615362915496210 2017-01-25 2022-05-19 21:06:11   
9999        9999  14615420085225040 2017-02-06 2022-05-19 15:19:53   

     CELL_CALL_CODE  
9995            107  
9996            231  
9997 