# GAN Model

The purpose of this notebook is to attempt to create, and then manipulate, platoon data. The method to accomplish this is tentitivley set to be in the form of a General Adversarial Network (GAN). 

In [1]:
import pandas as pd

# Preprocessing

This section takes the platoon data and creates features of equal length.

In [None]:
df = pd.read_pickle('CeneriData/2003_2019_platoon_h2.6_lane4.pkl')

In [None]:
df[df.Length > 1].Length.hist(bins=10)

In [None]:
df = df.drop(columns='Lane')

In [None]:
df.Length.value_counts()

In [None]:
df = df.drop(columns='Platoon')

In [None]:
df[df.CLASS == 1].CLASS.value_counts()

In [None]:
#df = df.drop(columns=['AX_W', 'AX_DIST']) #After expanding the features to be the same length, keeping these columns is too large

In [None]:
df.columns

In [None]:
df = df[df.Length < 10]

In [None]:
expand_list = ['CLASS', 'GW_TOT', 'LENTH', 'IVT', 'SPEED', 'AX']

In [None]:
dfs = []

In [None]:
df = df.reset_index(drop=True)

In [None]:
for expand in expand_list:
    df_temp = pd.DataFrame(df[expand].values.tolist())
    columns_tmp = []
    for i in range(1, 10):
        columns_tmp.append('{}_{}'.format(expand, i))
    df_temp.columns = columns_tmp
    dfs.append(df_temp)

In [None]:
ax_list = ['AX_W', 'AX_DIST']

In [None]:
df_small = df.drop(columns=expand_list)

In [None]:
df_small = df_small.drop(columns=ax_list)

In [None]:
dfs[0][dfs[0] == 0] = 99 #Replaces the zero with a 99 category

In [None]:
dfs[0]

In [None]:
dfs.append(df_small)

In [None]:
df_cat = pd.concat(dfs, axis= 1)

In [None]:
df_cat = df_cat.fillna(0)

In [None]:
df_ax =[]

In [None]:
for expand in ax_list:
    df_temp = pd.DataFrame(df[expand].values.tolist())
    columns_tmp = []
    for i in range(1, 10):
        columns_tmp.append('{}_{}'.format(expand, i))
    df_temp.columns = columns_tmp
    df_ax.append(df_temp)

In [None]:
zero_list = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [None]:
msk = df_ax[0].isna()

In [None]:
df_ax[0] = df_ax[0].where(~msk, other=pd.Series([zero_list]*df.shape[0]), axis=0)

In [None]:
msk = df_ax[1].isna()

df_ax[1] = df_ax[1].where(~msk, other=pd.Series([zero_list[:-1]]*df.shape[0]), axis=0)

In [None]:
df_ax = pd.concat(df_ax, axis= 1)

In [None]:
df_cat = pd.concat([df_ax, df_cat], axis= 1)

In [None]:
df_cat.AX_DIST_2

Finally, the Start and End variables will be replaced with day of week and time of day

In [None]:
df_cat['Weekday'] = df_cat.Start.dt.dayofweek

In [None]:
df_cat['Hour'] = df_cat.Start.dt.hour

In [None]:
df_cat = df_cat.drop(columns=['Start', 'End'])

In [None]:
df_cat.isna().sum()

In [None]:
pd.to_pickle(df_cat,'CeneriData/cleaned_2003_2019_platoon.pkl')

# CTGAN

This section will test to see if the PATE GAN can be applied to our data.

In [2]:
df = pd.read_pickle('CeneriData/cleaned_2003_2019_platoon.pkl')

Below is converting the df into one that includes all columns for axes weight and axes distances

In [3]:
old_ax = []
for ax in ax_list:
    for i in range(1, 10):
        old_ax.append('{}_{}'.format(ax, i))

NameError: name 'ax_list' is not defined

In [None]:
old_ax

In [None]:
df_cat = df_cat.drop(columns=old_ax)

In [None]:
pd.to_pickle(df_cat,'CeneriData/cleaned_2003_2019_platoon_fullax.zip')

In [None]:
df = pd.read_pickle('CeneriData/cleaned_2003_2019_platoon_fullax.zip')

The beginning of the CTGAN 

In [3]:
discrete_columns = ['CLASS_1', 'CLASS_2','CLASS_3','CLASS_4','CLASS_5','CLASS_6','CLASS_7','CLASS_8', 'CLASS_9', 'Length',
                    'Weekday','Hour','AX_1','AX_2','AX_3','AX_4','AX_5','AX_6','AX_7','AX_8','AX_9']

In [4]:
from ctgan import CTGANSynthesizer



In [None]:
ctgan = CTGANSynthesizer()

In [5]:
ax_cols = []
for i in range(1, 10):
    ax_cols.append('{}_{}'.format('AX_W', i))
    #for j in range(0, 10):
        #ax_cols.append('{}_{}_{}'.format('AX_W', i, j))

In [6]:
for i in range(1, 10):
    ax_cols.append('{}_{}'.format('AX_DIST', i))
    #for j in range(0, 9):
        #ax_cols.append('{}_{}_{}'.format('AX_DIST', i, j))

In [7]:
df_noax = df.drop(columns = ax_cols)

In [8]:
no_col = []
for col in df_noax.columns:
    if col[-1].isdigit() and int(col[-1]) > 5:
        no_col.append(col)

In [9]:
df_sm = df_noax.drop(columns=no_col)

In [10]:
df_sm = df_sm[df_sm.Length <= 5]

In [11]:
import random

In [12]:
df_sm

Unnamed: 0,CLASS_1,CLASS_2,CLASS_3,CLASS_4,CLASS_5,GW_TOT_1,GW_TOT_2,GW_TOT_3,GW_TOT_4,GW_TOT_5,...,SPEED_4,SPEED_5,AX_1,AX_2,AX_3,AX_4,AX_5,Length,Weekday,Hour
0,99,0.0,0.0,0.0,0.0,4045,0.0,0.0,0.0,0.0,...,0.0,0.0,2,0.0,0.0,0.0,0.0,1,2,0
1,11,0.0,0.0,0.0,0.0,14932,0.0,0.0,0.0,0.0,...,0.0,0.0,2,0.0,0.0,0.0,0.0,1,2,1
2,99,0.0,0.0,0.0,0.0,3973,0.0,0.0,0.0,0.0,...,0.0,0.0,2,0.0,0.0,0.0,0.0,1,2,1
3,11,0.0,0.0,0.0,0.0,15085,0.0,0.0,0.0,0.0,...,0.0,0.0,2,0.0,0.0,0.0,0.0,1,2,1
4,11,0.0,0.0,0.0,0.0,15535,0.0,0.0,0.0,0.0,...,0.0,0.0,2,0.0,0.0,0.0,0.0,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10302520,12,0.0,0.0,0.0,0.0,22580,0.0,0.0,0.0,0.0,...,0.0,0.0,3,0.0,0.0,0.0,0.0,1,3,23
10302521,11,0.0,0.0,0.0,0.0,12360,0.0,0.0,0.0,0.0,...,0.0,0.0,2,0.0,0.0,0.0,0.0,1,3,23
10302522,99,0.0,0.0,0.0,0.0,3670,0.0,0.0,0.0,0.0,...,0.0,0.0,2,0.0,0.0,0.0,0.0,1,3,23
10302523,99,0.0,0.0,0.0,0.0,4170,0.0,0.0,0.0,0.0,...,0.0,0.0,2,0.0,0.0,0.0,0.0,1,3,23


In [11]:
ivt_list = ['IVT_1', 'IVT_2', 'IVT_3', 'IVT_4', 'IVT_5']
df_int = df_5.drop(columns=ivt_list).astype(int)
df_sm = pd.concat([df_5[ivt_list], df_int], axis=1)

In [20]:
df_sm[df_sm.Length == 3].groupby(['Weekday']).Length.value_counts()

Weekday  Length
0        3         46379
1        3         55228
2        3         55363
3        3         44934
4        3         36267
5        3         16411
6        3           709
Name: Length, dtype: int64

Below is an attempt to parallelize different sizes of the dataset being used with the CTGAN

In [54]:
def parallelize_ctgan(df=None, func=None, n_cores=20, lengths=[]):
    df_split = []
    for length in lengths:
        tmp_df = df[df.Length == length]
        #Find all the columns that only have one value in this df and drop these columns
        if length == (1 or 2):
            for day in list(tmp_df.Weekday.unique()):
                day_df = tmp_df[tmp_df.Weekday == day]
                df_split.append(day_df)
        else:
            df_split.append(tmp_df)
    ctx = mp.get_context('spawn')
    pool = ctx.Pool(n_cores)
    pool.map(func, df_split)
    pool.close()
    pool.join()

In [58]:
def ctgan_300(df):
    from ctgan import CTGANSynthesizer
    import pandas as pd
    ctgan_300 = CTGANSynthesizer()
    print(df.Length.unique())
    length = df.Length.unique()[0]
    if length == (1 or 2):
        day = df.Weekday.unique()[0]
    else:
        day = "all"
    nunique = df.apply(pd.Series.nunique)
    cols_to_drop = nunique[nunique == 1].index
    df = df.drop(cols_to_drop, axis=1)
    discrete_columns = ['CLASS_1', 'CLASS_2','CLASS_3','CLASS_4','CLASS_5','CLASS_6','CLASS_7','CLASS_8', 'CLASS_9', 'Length',
                    'Weekday','Hour','AX_1','AX_2','AX_3','AX_4','AX_5','AX_6','AX_7','AX_8','AX_9']
    #Keep only discrete columns that are kept in this dataframe
    tmp_discrete_columns = list(set(discrete_columns).intersection(list(df.columns)))
    print('Starting {} length, day {} fit'.format(length, day))
    ctgan_300.fit(df, tmp_discrete_columns) #Go with the default of 
    ctgan_300.save('CTGAN_Models/ctgan_length{}_day{}_epoch300.pkl'.format(length, day))

In [56]:
import multiprocess as mp

In [59]:
#Take df_sm and parallelize it with different amounts of the 
parallelize_ctgan(df_sm, ctgan_300, n_cores=20, lengths=lengths)

KeyboardInterrupt: 

In [14]:
lengths = list(df_sm.Length.unique())

In [21]:
lengths

[1, 2, 3, 4, 5]

In [23]:
lengths = lengths[:-1]

In [51]:
df_sm[df_sm.Length == 1].Length.unique()[0]

1

In [27]:
ivt_list = ['IVT_1', 'IVT_2', 'IVT_3', 'IVT_4', 'IVT_5']
df_int = df_sm.drop(columns=ivt_list).astype(int)
df_sm = pd.concat([df_sm[ivt_list], df_int], axis=1)