Create the mobile traffic chunks (called samples) of equal length (5 minutes, 4 minutes, 3 minutes, 2 minutes, or 1 minute)

## Import packages

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so the data
# folders used below are reachable through the regular file system.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pandas as pd
import warnings
# Suppress all Python warnings for the rest of the session
# (presumably to keep the cell output readable).
warnings.filterwarnings('ignore')

In [None]:
# Root data folder; this path depends on your setup.
# It must contain the 'sources' folder that holds the raw traffic captures.
root_path = '/content/drive/Shared drives/MAppGraph/data'

## Config setting

In [None]:
# Hyper-parameter grid: one (duration, overlap) tuple per setting.
# duration is the length of a sample in minutes; overlap is how many minutes
# two consecutive samples share.
params = [
  (5, 3),
  (4, 2),
  (3, 1),
  (2, 0),
  (1, 0),
]

## Create the mobile traffic chunks with the same length

In [None]:
# Folder that contains the source data: one sub-folder per app, each holding
# that app's CSV capture files.
sources_folder = os.path.join(root_path, 'sources')

In [None]:
# Generate the fixed-length samples for every (duration, overlap) setting.
# Both values are given in minutes and multiplied by 60 before being compared
# with the 'time' column (which implies 'time' is in seconds -- confirm
# against the source data).
for duration, overlap in params:
  # A non-positive stride (overlap >= duration) would make the window loop
  # below spin forever, so reject such a setting up front.
  if overlap >= duration:
    raise ValueError(
        'overlap (%d) must be smaller than duration (%d)' % (overlap, duration))

  # folder that contains the samples of one set of parameters
  param_folder = os.path.join(root_path, '%d_%d' % (duration, overlap))

  # An existing folder is treated as "already generated" and the whole
  # parameter set is skipped.
  # NOTE: a folder left behind by an interrupted run is skipped as well and
  # has to be deleted manually before re-running.
  if os.path.exists(param_folder):
    continue
  os.mkdir(param_folder)

  # create the folder that holds the samples of every app
  samples_folder = os.path.join(param_folder, 'samples')
  os.mkdir(samples_folder)

  # loop over each app to generate samples
  for app in os.listdir(sources_folder):
    print('App: ', app)
    app_sources_folder = os.path.join(sources_folder, app)

    # create the folder that holds the samples of this app
    app_samples_folder = os.path.join(samples_folder, app)
    if not os.path.exists(app_samples_folder):
      os.mkdir(app_samples_folder)

    for filename in os.listdir(app_sources_folder):
      print('Processing %s ...' % filename)

      file_path = os.path.join(app_sources_folder, filename)
      df = pd.read_csv(file_path, index_col=0)

      # A capture without rows has no first/last timestamp; skip it instead
      # of aborting the whole run with an IndexError on .iloc[0].
      if df.empty:
        print('Skipping %s (no rows)' % filename)
        continue

      # first and last timestamp of the capture
      base = df['time'].iloc[0]
      end = df['time'].iloc[-1]

      # The pieces of the source name do not change per sample, so split once.
      name_fields = filename.split('_')

      # index is the 1-based sample number. A window is only emitted while
      # its end lies strictly before the last timestamp of the capture.
      index = 1
      while ((index - 1)*(duration - overlap) + duration)*60 + base < end:
        start_time = base + (index-1)*(duration - overlap)*60
        end_time = start_time + duration*60
        # both window boundaries are inclusive
        df_ = df[(df['time'] >= start_time) & (df['time'] <= end_time)].reset_index(drop=True)

        # save a sample as csv file; the last '_'-separated field of the
        # source name is replaced by the sample index
        # (e.g. reddit_1_94.288m.csv -> reddit_1_<index>.csv)
        sample_filename = "_".join(name_fields[:-2]) + '_' + name_fields[-2] + '_' + str(index) + '.csv'
        sample_path = os.path.join(app_samples_folder, sample_filename)
        df_.to_csv(sample_path, index=True)

        index += 1

    print('...................................................')

App:  reddit
Processing reddit_1_94.288m.csv ...
Processing reddit_2_97.017m.csv ...
Processing reddit_3_96.49m.csv ...
Processing reddit_4_96.915m.csv ...
Processing reddit_5_97.594m.csv ...
Processing reddit_6_97.603m.csv ...
Processing reddit_7_97.124m.csv ...
Processing reddit_8_96.595m.csv ...
Processing reddit_9_137.378m.csv ...
Processing reddit_10_137.237m.csv ...
Processing reddit_11_136.941m.csv ...
Processing reddit_12_137.275m.csv ...
Processing reddit_13_136.872m.csv ...
Processing reddit_15_137.323m.csv ...
Processing reddit_14_137.06m.csv ...
Processing reddit_16_134.529m.csv ...
...................................................
App:  pinterest
Processing pinterest_1_112.41m.csv ...
Processing pinterest_2_44.063m.csv ...
Processing pinterest_3_113.315m.csv ...
Processing pinterest_4_17.171m.csv ...
Processing pinterest_5_18.382m.csv ...
Processing pinterest_6_37.66m.csv ...
Processing pinterest_7_181.553m.csv ...
Processing pinterest_8_41.332m.csv ...
Processing pinter