# Count the synthetic data generated by Stefan

In [1]:
import pickle
import sys
import warnings

# Make the project-local `src` package importable before anything else is imported.
sys.path.insert(0, '..')

import pandas as pd
import matplotlib.pyplot as plt

from src import Flag, AbstractTask, AbstractFlag, Pipeline

warnings.filterwarnings("ignore")

In [10]:
# Project directories, given relative to the notebook's working directory.
# Every value ends with "/" because file names are appended by plain string concatenation.
dataPath = r"../data/csv/"  # default prefix for the CSV names passed to filesToDataFrame
picklePath = r"../pickle_objects/modeling_pipeline/"  # pickled pipeline and flag objects
# The three paths below are not referenced in the cells shown here; presumably they are
# output locations for plots, tables, and saved models/scalers - confirm before removing.
plotPath = r"../plots/modeling/"
tablePath = r"../tables/modeling/"
modelPath = r"../pickle_objects/models_and_scalers/"

In [11]:
# Restore the pickled modeling pipeline object.
# SECURITY: pickle.load can execute arbitrary code while unpickling, so only load
# pickle files that were produced by this project.
with open(picklePath + 'Modeling Pipeline.pickle', 'rb') as f:
    ModelingPipeline = pickle.load(f)

# Print the pipeline's task tree, including priorities and flags, as a sanity check
# that the expected pipeline was restored.
ModelingPipeline.print(priority = True, flags = True)

Modeling Pipeline
	10 Pre Processing: Pre Processing
		10 Select and order columns
		20 Company ID and UUID
			10 Fill Company ID with None
			20 Fill UUID with None
			30 Replace Company IDs
			40 Replace Company UUIDs
		30 MS Data Processing
			10 Fill MS Data with empty String
			20 Length of MS Data
			30 Continuity Type
			40 Drop MS Data Column
		40 Service Data Processing
			10 Fill Service Data with empty String
			20 Length of Service Data
			30 Samsung Type
			40 Drop Service Data Column
		50 Fill Numeric NA with 0
		60 Fill String NA with None
		70 Datetime conversion
		80 Clean PDU
		90 Order DataFrame
		100 Convert object type to string
	20 Dummy Processing: Dummy Processing
		10 Dummies Channel
		20 Dummies AD Type
		30 Dummies Company
		40 Dummies UUID
		50 Dummies PDU Type
		60 Dummies Continuity Type
		70 Dummies SmartTag Type
	30 Labeling: Labeling
		10 Labeling auto: Labeling auto
			Label FindMy Tracker and iDevice: Label FindMy Tracker and iDevice
			Label Tile and

In [12]:
def _loadFlag(flagName: str):
    """Unpickle the object stored as '<flagName>.pickle' inside `picklePath`.

    Parameters
    ----------
    flagName : str
        File name without the '.pickle' extension.

    Returns
    -------
    object
        Whatever object the pickle file contains (used as a pipeline flag below).
    """
    # SECURITY: pickle.load can execute arbitrary code; only load trusted project files.
    with open(picklePath + flagName + '.pickle', 'rb') as f:
        return pickle.load(f)


# One flag per pickle file; the variable names are referenced by later cells.
flag_preProcessing = _loadFlag('Pre Processing')
flag_modeling = _loadFlag('Modeling')
flag_dummy = _loadFlag('Dummy Processing')
flag_dropColumns = _loadFlag('Drop Columns')
flag_label_SmartTag_and_Other = _loadFlag('Label SmartTag and Other')
flag_state_nearby = _loadFlag('State nearby')
flag_drop_label_Other = _loadFlag('Drop other Device')
flag_label_other_Device = _loadFlag('Label other Device')

In [13]:
def filesToDataFrame(filesDict: dict[str, AbstractFlag], filePath:str = dataPath, config:AbstractTask = ModelingPipeline, ) -> pd.DataFrame:
    """Run the pipeline on every file in `filesDict` and stack the results.

    Parameters
    ----------
    filesDict : dict[str, AbstractFlag]
        Maps a file name (appended to `filePath`) to the flag passed to
        `Pipeline.run` for that file.
    filePath : str
        Prefix put in front of every file name; defaults to `dataPath`.
    config : AbstractTask
        Task handed to `Pipeline.setTask`; defaults to `ModelingPipeline`.

    Returns
    -------
    pd.DataFrame
        The per-file results concatenated in dictionary order with a fresh
        0..n-1 index. An empty DataFrame is returned for an empty `filesDict`.
    """
    # Collect all per-file frames first and concatenate once; growing a DataFrame
    # with pd.concat inside the loop re-copies all previous rows on every iteration.
    frames = [
        Pipeline().setPath(filePath + fileName).setTask(config).loadData().run(flag=fileFlag)
        for fileName, fileFlag in filesDict.items()
    ]

    # pd.concat raises on an empty list, so keep the original "no files" result.
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames).reset_index(drop = True)

def prepareDataset(dataset:pd.DataFrame,  balance:bool = False, shuffle:bool = False, fraction:float = 1, labelColumn:str='Label') -> pd.DataFrame:
    """Optionally balance, subsample and shuffle a labelled dataset.

    Parameters
    ----------
    dataset : pd.DataFrame
        Input rows; the caller's object is never modified.
    balance : bool
        If True, draw the same number of rows for every value of `labelColumn`:
        ``int(smallest_class_size * fraction)`` rows per label.
    shuffle : bool
        If True, the returned rows are randomly permuted (seeded). If False, the
        returned rows keep the relative order they had in `dataset`, even when
        rows were sampled by `balance` or `fraction`.
    fraction : float
        With `balance=True`, scales the per-label sample size. Otherwise, when
        smaller than 1, the fraction of all rows to keep.
    labelColumn : str
        Name of the column holding the class label.

    Returns
    -------
    pd.DataFrame
        A new DataFrame with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        Raised by pandas when the requested per-label sample is larger than the
        smallest class (e.g. `balance=True` with `fraction > 1`).
    """
    # Work on a positionally indexed copy: the index then records each row's original
    # position, and the caller's DataFrame is left untouched.
    prepared = dataset.reset_index(drop = True)

    if balance:
        min_count = prepared.groupby(labelColumn).size().min()
        # sample(n=...) only accepts integers, so truncate fractional sizes.
        per_label = int(min_count * fraction)
        prepared = prepared.groupby(labelColumn).sample(n = per_label, random_state=0)

    elif fraction < 1:
        prepared = prepared.sample(frac = fraction, random_state=0)

    if shuffle:
        prepared = prepared.sample(frac = 1, random_state=0)
    else:
        # Sampling returns rows in drawn order; restore the original row order so that
        # shuffle=False really means "not shuffled".
        prepared = prepared.sort_index()

    return prepared.reset_index(drop = True)


In [14]:
def FilesDictFactory(BaseModelingFlag: AbstractFlag):
    """Build the mapping from data file name to the flag used when processing it.

    The only entry is the synthetic "other Device" CSV; its flag is a new
    `Flag` named "other Device" whose parents are `BaseModelingFlag` and the
    module-level `flag_label_other_Device`.
    """
    otherDeviceFlag = Flag("other Device", parents = [BaseModelingFlag, flag_label_other_Device])
    return {r"other Device/synthetic Data.csv": otherDeviceFlag}

In [16]:
# Combine the previously loaded flags into one base flag and build the file -> flag mapping.
flag = Flag("Base Modeling", parents = [flag_preProcessing, flag_dummy, flag_modeling, flag_dropColumns])
files = FilesDictFactory(flag)

# Run the modeling pipeline on every listed file and collect the rows in one DataFrame.
rawData = filesToDataFrame(files)

# shuffle=False only skips the explicit shuffle step in prepareDataset. The intent is to keep
# the rows in chronological (ascending) order, as produced by Modeling Pipeline step
# "90 Order DataFrame". NOTE(review): with balance=True the rows are drawn through a
# per-label sample, so verify that this order actually survives before relying on it.
data = prepareDataset(rawData, shuffle=False, balance=True)

In [17]:
# The "data" DataFrame will be used to generate synthetic data with a Markov model.
# It is the cell's last expression, so the notebook renders a preview of it.
data

Unnamed: 0,Length Packet,Length Header,Length MS Data,Length Service Data,CH 37,CH 38,CH 39,AD Manufacturer Specific,AD Flags,AD Tx Power Level,...,PDU ADV_NONCONN_IND,PDU ADV_SCAN_IND,PDU Other,CT 07,CT 12,CT Other,ST 3,ST 5,ST Other,Label
0,49,23,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,other Device
1,49,23,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,other Device
2,49,23,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,other Device
3,49,23,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,other Device
4,49,23,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,other Device
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599995,35,9,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,other Device
599996,49,23,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,other Device
599997,49,23,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,other Device
599998,51,25,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,other Device
