In [90]:
import pandas as pd
import json
import os
import numpy as np
import pickle
import json
import scipy
from sklearn.preprocessing import OneHotEncoder

# ignore warnings jupyter notebook
import warnings
warnings.filterwarnings('ignore')

In [91]:
# @todo: modify data to include all the dates from start to end, fill missing values with 0 do not remove any rows

# --------- Functions ----------

In [92]:
# function to check if the data is complete
def check_hours(df):
    """Drop every clock hour that does not hold exactly 12 rows (twelve 5-minute samples).

    Rows are grouped by the calendar hour of ``df['timestamp']``. When the number
    of rows in an hour differs from 12 (missing or duplicated samples), all rows
    of that hour are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime64 ``timestamp`` column.

    Returns
    -------
    pandas.DataFrame
        The very same object that was passed in: incomplete hours are dropped
        in place, so the caller's frame is modified as well.

    Notes
    -----
    Hours that contain many zero counts are kept. Whether hours with more than
    3 zeroes in ``cars`` should also be removed (sparse data?) is still an open
    question and is deliberately not implemented here.
    """
    expected_rows = 12  # 60 minutes / 5-minute sampling interval

    # One pass: label each row with its hour and look up how many rows share that hour.
    hour_key = df['timestamp'].dt.floor('h')
    rows_in_hour = hour_key.map(hour_key.value_counts())

    # Rows with a missing timestamp get no count (NaN) and are left untouched.
    incomplete = (rows_in_hour.notna() & rows_in_hour.ne(expected_rows)).to_numpy()

    df.drop(df.index[incomplete], inplace=True)
    return df

In [93]:
# read folder with all csv files and create one df from it (one per intersection)
def read_folder(current_intersection, configs, trac, direc):
    """Read every CSV file of one intersection folder and return one cleaned frame.

    Parameters
    ----------
    current_intersection : str
        Intersection name. It is used as the sub-folder of ``configs['data_folder']``
        and as the name of the CSV column that is renamed to ``timestamp``.
    configs : dict
        Parsed configuration; ``configs['trajectories'][trac][direc][current_intersection]``
        must be the list of sensor column names to sum.
    trac, direc
        Trajectory and direction keys into ``configs['trajectories']``.

    Returns
    -------
    pandas.DataFrame
        Columns ``timestamp`` (datetime64) and ``cars`` (sum over the configured
        sensors), sorted by time, restricted to timestamps strictly between
        2014-12-31 and 2020-05-31, with incomplete hours removed by ``check_hours``.

    Notes
    -----
    The per-file frames are collected in a list and concatenated once. This avoids
    the quadratic cost of growing a frame inside the loop and avoids concatenating
    with an empty object-dtype frame, whose effect on result dtypes is deprecated.
    """
    print("Starting intersection: " + str(current_intersection))  # note which intersection its working on
    path = os.path.join(configs['data_folder'], current_intersection)  # folder holding the csv files
    print(path)

    sensors = configs["trajectories"][trac][direc][current_intersection]  # sensor columns from the config file
    cols = sensors + [current_intersection]  # sensors + the column named after the intersection (dates)

    monthly_frames = []
    # sorted() makes the processing order independent of the filesystem's listing order
    for file in sorted(os.listdir(path)):
        if not file.endswith(".csv"):
            continue

        current_month = pd.read_csv(os.path.join(path, file), delimiter=";")
        current_month = current_month[cols]  # only keep interesting columns
        # some cleaning:
        current_month = current_month[:-1]  # last row is totals (per the original author)
        current_month = current_month.fillna(0)  # fill NA values with 0

        # remove sensor errors:
        # Q: why keep range 0 to 600?
        # Q: why shift 4?
        for sensor in sensors:
            # readings above 600 are replaced by 0
            current_month[sensor] = current_month[sensor].apply(lambda x: x if x <= 600 else 0)
            # a reading identical to the one 4 rows earlier is blanked (NaN); the row sum below skips NaN
            current_month[sensor] = current_month[sensor].where(
                current_month[sensor].shift(4) != current_month[sensor]
            )

        # NOTE(review): the original comment said "clip values between 0 and 400",
        # but the bounds actually applied are -1 and 401 -- confirm which is intended.
        current_month[sensors] = current_month[sensors].clip(-1, 401)

        # sum all sensors:
        current_month['cars'] = current_month[sensors].sum(axis=1)
        current_month = current_month[[current_intersection, "cars"]]  # only keep dates and total amount of cars
        current_month.columns = ['timestamp', 'cars']  # rename to timestamp for general format

        monthly_frames.append(current_month)

    if monthly_frames:
        df = pd.concat(monthly_frames)
    else:
        # no csv file found: keep the empty two-column frame the rest of the pipeline expects
        df = pd.DataFrame(columns=['timestamp', 'cars'])

    df['timestamp'] = pd.to_datetime(df['timestamp'])  # format as dt
    df['cars'] = df['cars'].clip(-1, len(sensors) * 150)  # no intersection could be able to process sensors*150 cars
    df = df.sort_values(by='timestamp')  # sort by timestamp
    df = df.dropna()  # extra check to drop na values
    df = df.loc[(df['timestamp'] > '2014-12-31') & (df['timestamp'] < '2020-05-31')]  # delete faulty datapoints outside scope
    df = df.reset_index(drop=True)
    df = check_hours(df)  # drop hours that do not have exactly 12 samples
    return df



In [94]:
# function to save the processed data for GNN training in h5 format
def save_GNN_processed_data(raw_data,save_path):
    """Join all per-intersection frames on ``timestamp`` and write the result to HDF5.

    ``raw_data`` maps an intersection name to a frame with ``timestamp`` and ``cars``
    columns. Each frame's ``cars`` column is renamed to the intersection name and
    inner-joined onto the timestamps of the first intersection, so only timestamps
    present for every intersection remain. The merged frame is indexed by timestamp
    (index name cleared) and stored under key ``'df'`` at ``save_path``, overwriting
    any existing file.
    """
    names = list(raw_data.keys())

    # start from the timestamp column of the first intersection only
    merged = pd.DataFrame(raw_data[names[0]]['timestamp'])

    for name in names:
        per_intersection = pd.DataFrame(raw_data[name]).rename(columns={'cars': name})
        merged = merged.merge(per_intersection, on='timestamp', how='inner')

    # normalise the join key to datetime values before it becomes the index
    merged['timestamp'] = np.array(pd.to_datetime(merged['timestamp']))

    # timestamp becomes an unnamed index
    merged = merged.set_index('timestamp')
    merged.index.name = None

    merged.to_hdf(save_path, key='df', mode='w')

In [95]:
# function to aggregate data into fpds per hour, a new probability column is added with the probability of a car passing through the intersection per 5 minutes
def fpd(df, hours=1):
    """Add the per-window total and each sample's share of it to a traffic frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Exactly two columns, in this order: ``timestamp`` (datetime64) and the
        car count per 5-minute sample. The input frame is not modified.
    hours : int, default 1
        Length of the aggregation window in hours (12 * hours samples per window).

    Returns
    -------
    pandas.DataFrame
        Columns ``timestamp``, ``cars``, ``total`` and ``prob`` where
        ``prob = cars / total``.

    Notes
    -----
    The window total is attached to the row whose timestamp equals the window
    start and then forward-filled, so the input is expected to be sorted and to
    contain the first sample of every window (see ``check_hours``).
    NOTE(review): ``total`` is summed before negative counts are set to 0, so a
    window that contained negative values will not have probabilities summing
    to 1 -- confirm whether that is intended.
    """
    freq = str(hours) + "h"  # lowercase alias; the uppercase "H" alias is deprecated in recent pandas
    aggregate = df.groupby(pd.Grouper(freq=freq, key='timestamp')).sum()  # total per window
    df = pd.merge(df, aggregate, on='timestamp', how='left')  # total lands on the window's first row
    df = df.ffill()  # spread the total over the remaining rows of the window
    df.columns = ['timestamp', 'cars', 'total']

    # .loc writes into the frame itself; the chained form df['cars'][mask] = 0 can
    # end up writing to a temporary copy and leaving the frame unchanged.
    df.loc[df['cars'] < 0, 'cars'] = 0  # some inconsistencies in the data where cars could be negative
    df.loc[df['total'] <= 0, 'total'] = 1  # avoid division by zero / negative totals

    df['prob'] = df['cars'] / df['total']  # share of the window total
    return df

In [96]:
# weeks 7,
# hours 24,
# [ data, timesteps ]
# data -> example: FPDs (list of 12 points) of all mondays 00:00 to 01:00 for 4 years with the list of probabilities for each FPD
# timesteps -> example: date associated with each FPD of all mondays 00:00 to 01:00 for 4 years
def create_timeslot_array(data,ohe_intersection,window=12):
    """Group FPD probabilities by weekday and hour into feature arrays.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs a ``timestamp`` column and a ``prob`` column. Side effect: the
        ``timestamp`` column is converted to datetime and two helper columns,
        ``weekday`` and ``hour``, are added to the frame that was passed in.
    ohe_intersection : numpy.ndarray
        2-D array with one row (the one-hot code of the intersection); that row
        is repeated for every sample.
    window : int, default 12
        Number of consecutive values forming one FPD (60 min / 5 min = 12).

    Returns
    -------
    list
        ``data_array[weekday]`` is a list of ``[x, dates]`` pairs, one per hour
        that could be processed. ``x`` has one row per FPD: ``window``
        probabilities, then ``(hour + 1) / 24``, then ``(weekday + 1) / 7``, then
        the one-hot intersection code. ``dates`` is the sorted list of distinct
        hourly timestamps of that slot.

    Notes
    -----
    This is neccesary to create the bhattacharyya matrices. A slot whose number
    of values is not divisible by ``window`` (double timestamps or missing data)
    cannot be reshaped; the error is printed and the slot is skipped, so the
    entries that follow within that weekday shift by one position.
    """
    data['timestamp'] = pd.to_datetime(data['timestamp'])
    data['weekday'] = data['timestamp'].dt.weekday  # Monday == 0
    data['hour'] = data['timestamp'].dt.hour

    data_array = []
    for i in range(7):
        timeslots = []
        weekday_rows = data[data['weekday'] == i]
        for hour in range(24):
            try:
                datapoint = weekday_rows[weekday_rows['hour'] == hour]
                x = np.array(datapoint['prob'])
                x = x.reshape(len(x) // window, window)  # fails unless the slot is complete and divisible by window
                n_samples = x.shape[0]
                ohe_intersection_array = np.repeat(ohe_intersection, n_samples, axis=0)
                hour_array = np.full((n_samples, 1), (hour + 1) / 24)  # normalized hour of day
                week_array = np.full((n_samples, 1), (i + 1) / 7)  # normalized weekday
                x = np.concatenate((x, hour_array, week_array, ohe_intersection_array), axis=1)
                # lowercase 'h': the uppercase 'H' alias is deprecated in recent pandas
                dates = sorted(set(datapoint['timestamp'].dt.floor('h')))  # hourly timestamps of the slot
                timeslots.append([x, dates])
            except Exception as e:
                # best effort: report the failing slot and carry on with the next one
                print("Exception in create_timeslot_array: ", i, hour)
                print(e)
        data_array.append(timeslots)
    # output structure: data_array[7weekdays][24hours]; e.g. data_array[0][9] is data for monday mornings 9 am.
    return data_array

In [97]:
# function to save data to pickle file
def save_pickle(data, filename):
    """Serialize ``data`` with pickle into ``filename`` (binary mode, existing file is overwritten)."""
    with open(filename, mode='wb') as handle:
        pickle.Pickler(handle).dump(data)

In [98]:
# function to load data from pickle file
def load_pickle(filename):
    """Return the object stored in the pickle file ``filename``.

    Unpickling can execute arbitrary code, so only open files you trust.
    """
    with open(filename, mode='rb') as handle:
        return pickle.load(handle)

## -------- MAIN ----------

In [99]:
# load configs
with open(r"../utils/configs.json", 'r') as f:
    configs = json.load(f)

final_results = {}  # final_results[trajectory][direction] -> featured FPDs per intersection

# loop over all trajectories and directions separately
for trajectory in configs['trajectories']:

    # one hot encoder for the intersections of this trajectory
    # (the intersection names are taken from the 'North' direction entry)
    enc_intersection = OneHotEncoder(handle_unknown='ignore')
    intersections_list = list(configs['trajectories'][trajectory]['North'].keys())
    enc_intersection.fit(np.array(intersections_list).reshape(-1,1))

    final_results[trajectory] = {}
    for direction in configs['trajectories'][trajectory].keys():

        # 1. read the raw csv data of every intersection into a dictionary
        raw_data = {}
        for intersection in configs['trajectories'][trajectory][direction]:
            raw_data[intersection] = read_folder(intersection, configs, trajectory, direction)

        # write the merged raw data once per direction, after all intersections are read
        # (it used to be re-merged and rewritten after every single intersection)
        save_path = f"../data/hauge/processed/GNN_raw_data_{direction}_{trajectory}.h5" # h5 file with the raw data for GNNs
        if raw_data:
            save_GNN_processed_data(raw_data, save_path)

        # 2. create FPDs from dictionary
        fpds = {}
        fpd_hour = 1 # interval in hours to aggregate the data
        for intersection in raw_data:
            fpds[intersection] = fpd(raw_data[intersection], fpd_hour)

        # 3. create timeslot arrays for further processing:
        featured_fpds = {}  # per intersection: nested list [weekday][hour] of [features, dates]
        window_size = 12 # window size in 5 minute intervals
        for intersection in fpds:
            ohe_intersection = enc_intersection.transform(np.array([[intersection]])).toarray() # one hot encode the intersection
            fpds_processed = create_timeslot_array(fpds[intersection], ohe_intersection, window_size) # create timeslot array
            featured_fpds[intersection] = fpds_processed

        # 4. save the featured fpds to a pickle file
        featured_fpds_save_path = f"../data/hauge/processed/featured_fpds_{direction}_{trajectory}.pickle"
        save_pickle(featured_fpds, featured_fpds_save_path)

        # 5. save in final results dictionary
        final_results[trajectory][direction] = featured_fpds


Starting intersection: K502
../data/hauge/K502
Starting intersection: K504
../data/hauge/K504
Starting intersection: K503
../data/hauge/K503
Starting intersection: K263
../data/hauge/K263
Starting intersection: K556
../data/hauge/K556
Starting intersection: K557
../data/hauge/K557
Starting intersection: K559
../data/hauge/K559
Starting intersection: K561
../data/hauge/K561
Starting intersection: K198
../data/hauge/K198
Starting intersection: K502
../data/hauge/K502
Starting intersection: K504
../data/hauge/K504
Starting intersection: K503
../data/hauge/K503
Starting intersection: K263
../data/hauge/K263
Starting intersection: K556
../data/hauge/K556
Starting intersection: K557
../data/hauge/K557
Starting intersection: K559
../data/hauge/K559
Starting intersection: K561
../data/hauge/K561
Starting intersection: K198
../data/hauge/K198
Starting intersection: K704
../data/hauge/K704
Starting intersection: K702
../data/hauge/K702
Starting intersection: K703
../data/hauge/K703
Starting inte

# ========== EXTRA =========

## ====== 1. Data Exploration ========

In [20]:
# all dates are correct
# print the first and last timestamp of every intersection currently held in raw_data
for name, frame in raw_data.items():
    first_ts = frame['timestamp'].iloc[0]
    last_ts = frame['timestamp'].iloc[-1]
    print(name, '-->', first_ts, "---", last_ts)

K704 --> 2018-01-01 00:00:00 --- 2020-03-31 23:55:00
K702 --> 2016-01-12 16:00:00 --- 2020-03-31 23:55:00
K703 --> 2018-01-01 00:00:00 --- 2020-03-31 23:55:00
K159 --> 2018-01-01 01:00:00 --- 2020-03-31 23:55:00
K182 --> 2018-01-01 01:00:00 --- 2020-03-31 23:55:00
K183 --> 2018-01-01 01:00:00 --- 2020-03-31 23:55:00
K128 --> 2018-01-01 01:00:00 --- 2020-03-31 23:55:00
K139 --> 2018-01-01 01:00:00 --- 2020-03-31 23:55:00
K104 --> 2018-01-01 01:00:00 --- 2020-03-31 23:55:00
K101 --> 2018-01-01 01:00:00 --- 2020-03-31 23:55:00
K206 --> 2018-01-01 01:00:00 --- 2020-03-31 23:55:00
K074 --> 2018-01-01 00:00:00 --- 2020-03-31 23:55:00
K414 --> 2018-01-01 01:00:00 --- 2020-03-31 23:55:00
K415 --> 2018-01-01 01:00:00 --- 2020-03-31 23:55:00
K250 --> 2018-01-01 01:00:00 --- 2020-03-31 23:55:00


## ======= 2. METR-LA and PEMS-BAY data processing =======

In [200]:
# preview the first rows of `df`
# NOTE(review): this cell does not read the h5 file or build a dictionary itself;
# it relies on `df` having been loaded by another cell (the METR-LA read appears
# further down) -- confirm the intended cell order.

df.head()

Unnamed: 0,773869,767541,767542,717447,717446,717445,773062,767620,737529,717816,...,772167,769372,774204,769806,717590,717592,717595,772168,718141,769373
2012-03-01 00:00:00,64.375,67.625,67.125,61.5,66.875,68.75,65.125,67.125,59.625,62.75,...,45.625,65.5,64.5,66.428571,66.875,59.375,69.0,59.25,69.0,61.875
2012-03-01 00:05:00,62.666667,68.555556,65.444444,62.444444,64.444444,68.111111,65.0,65.0,57.444444,63.333333,...,50.666667,69.875,66.666667,58.555556,62.0,61.111111,64.444444,55.888889,68.444444,62.875
2012-03-01 00:10:00,64.0,63.75,60.0,59.0,66.5,66.25,64.5,64.25,63.875,65.375,...,44.125,69.0,56.5,59.25,68.125,62.5,65.625,61.375,69.857143,62.0
2012-03-01 00:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2012-03-01 00:20:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [215]:
# function to read the data from the h5 file and convert to dictionary with each column name as key
def read_h5(df):
    """Split a wide, time-indexed frame into one ``timestamp``/``cars`` frame per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``DatetimeIndex`` and one column per sensor. It is not
        modified, so the function can be called repeatedly on the same frame.

    Returns
    -------
    dict
        Maps every column name of ``df`` to a DataFrame with a fresh 0..n-1 index
        and the columns ``timestamp`` (the index values at whole-second
        resolution, without timezone) and ``cars`` (that column's values).

    Notes
    -----
    Earlier the ``timestamp`` helper column was added to ``df`` before looping
    over ``df.columns``, so the result carried an extra ``'timestamp'`` entry
    whose ``cars`` values were timestamps, and the caller's frame lost its
    datetime index. Both side effects are gone.
    """
    stamps = df.index
    if stamps.tz is not None:
        stamps = stamps.tz_localize(None)  # keep the wall-clock components only
    # whole seconds, i.e. the year/month/day/hour/minute/second components
    timestamps = pd.Series(stamps.floor('s'), name='timestamp')

    data_dict = {}
    for column in df.columns:
        data_dict[column] = pd.DataFrame({
            'timestamp': timestamps,
            'cars': df[column].to_numpy(),
        })

    return data_dict

In [228]:
# load the METR-LA frame (stored under key 'df') and set the output path for the converted pickle
load_path = '../data/METR-LA/metr-la.h5'
save_path = '../data/METR-LA/processed/OWRI_df_format.pickle'
df = pd.read_hdf(load_path, key='df')
df.head()

Unnamed: 0,773869,767541,767542,717447,717446,717445,773062,767620,737529,717816,...,772167,769372,774204,769806,717590,717592,717595,772168,718141,769373
2012-03-01 00:00:00,64.375,67.625,67.125,61.5,66.875,68.75,65.125,67.125,59.625,62.75,...,45.625,65.5,64.5,66.428571,66.875,59.375,69.0,59.25,69.0,61.875
2012-03-01 00:05:00,62.666667,68.555556,65.444444,62.444444,64.444444,68.111111,65.0,65.0,57.444444,63.333333,...,50.666667,69.875,66.666667,58.555556,62.0,61.111111,64.444444,55.888889,68.444444,62.875
2012-03-01 00:10:00,64.0,63.75,60.0,59.0,66.5,66.25,64.5,64.25,63.875,65.375,...,44.125,69.0,56.5,59.25,68.125,62.5,65.625,61.375,69.857143,62.0
2012-03-01 00:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2012-03-01 00:20:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [230]:
# split the loaded frame into the per-column dictionary format
OWRI_df_format = read_h5(df)
# persist the dictionary at save_path with the notebook's pickle helper
save_pickle(OWRI_df_format, save_path)

In [235]:
# load the PEMS-BAY frame and set the output path for the converted pickle
load_path = '../data/PEMS-BAY/pems-bay.h5'
save_path = '../data/PEMS-BAY/processed/OWRI_df_format.pickle'
df = pd.read_hdf(path_or_buf=load_path)
df.head()

sensor_id,400001,400017,400030,400040,400045,400052,400057,400059,400065,400069,...,409525,409526,409528,409529,413026,413845,413877,413878,414284,414694
2017-01-01 00:00:00,71.4,67.8,70.5,67.4,68.8,66.6,66.8,68.0,66.8,69.0,...,68.8,67.9,68.8,68.0,69.2,68.9,70.4,68.8,71.1,68.0
2017-01-01 00:05:00,71.6,67.5,70.6,67.5,68.7,66.6,66.8,67.8,66.5,68.2,...,68.4,67.3,68.4,67.6,70.4,68.8,70.1,68.4,70.8,67.4
2017-01-01 00:10:00,71.6,67.6,70.2,67.4,68.7,66.1,66.8,67.8,66.2,67.8,...,68.4,67.4,68.4,67.5,70.2,68.3,69.8,68.4,70.5,67.9
2017-01-01 00:15:00,71.1,67.5,70.3,68.0,68.5,66.7,66.6,67.7,65.9,67.8,...,68.5,67.5,68.5,67.5,70.4,68.7,70.2,68.4,70.8,67.6
2017-01-01 00:20:00,71.7,67.8,70.2,68.1,68.4,66.9,66.1,67.7,66.1,67.8,...,68.5,67.7,68.5,67.4,69.6,69.1,70.0,68.4,71.0,67.9


In [236]:
# split the loaded frame into the per-column dictionary format
OWRI_df_format = read_h5(df)
# persist the dictionary at save_path with the notebook's pickle helper
save_pickle(OWRI_df_format, save_path)

## ====== 3. Combining all hauge trajectory data =========

In [21]:
# read the four processed hauge frames; North columns get "_N", South columns get "_S"
def _read_with_suffix(path, suffix):
    """Read one processed h5 frame and append `suffix` to every column name."""
    frame = pd.read_hdf(path)
    frame.columns = [str(col) + suffix for col in frame.columns]
    return frame

load_path1 = '../data/hauge/processed/GNN_raw_data_North_T1.h5'
df1 = _read_with_suffix(load_path1, '_N')

load_path2 = '../data/hauge/processed/GNN_raw_data_North_T2.h5'
df2 = _read_with_suffix(load_path2, '_N')

load_path3 = '../data/hauge/processed/GNN_raw_data_South_T1.h5'
df3 = _read_with_suffix(load_path3, '_S')

load_path4 = '../data/hauge/processed/GNN_raw_data_South_T2.h5'
df4 = _read_with_suffix(load_path4, '_S')

In [57]:
# place the four frames side by side, aligned on their (timestamp) index
df = pd.concat((df1, df2, df3, df4), axis='columns')

In [58]:
# keep only the rows that have a value in every column
df = df.dropna(axis='index', how='any')

In [60]:
# (rows, columns) of the combined frame after dropping incomplete rows
df.shape

(55788, 48)

In [61]:
# fraction (0..1) of missing values in each column
df.isna().sum().div(len(df))

K502_N    0.0
K504_N    0.0
K503_N    0.0
K263_N    0.0
K556_N    0.0
K557_N    0.0
K559_N    0.0
K561_N    0.0
K198_N    0.0
K704_N    0.0
K702_N    0.0
K703_N    0.0
K159_N    0.0
K182_N    0.0
K183_N    0.0
K128_N    0.0
K139_N    0.0
K104_N    0.0
K101_N    0.0
K206_N    0.0
K074_N    0.0
K414_N    0.0
K415_N    0.0
K250_N    0.0
K502_S    0.0
K504_S    0.0
K503_S    0.0
K263_S    0.0
K556_S    0.0
K557_S    0.0
K559_S    0.0
K561_S    0.0
K198_S    0.0
K704_S    0.0
K702_S    0.0
K703_S    0.0
K159_S    0.0
K182_S    0.0
K183_S    0.0
K128_S    0.0
K139_S    0.0
K104_S    0.0
K101_S    0.0
K206_S    0.0
K074_S    0.0
K414_S    0.0
K415_S    0.0
K250_S    0.0
dtype: float64

In [87]:
# order the columns alphabetically by name
df = df.reindex(columns=sorted(df.columns))
df

Unnamed: 0,K074_N,K074_S,K101_N,K101_S,K104_N,K104_S,K128_N,K128_S,K139_N,K139_S,...,K559_N,K559_S,K561_N,K561_S,K702_N,K702_S,K703_N,K703_S,K704_N,K704_S
2018-01-01 01:00:00,6.0,0.0,59.0,85.0,56.0,52.0,31.0,57.0,30.0,51.0,...,28.0,41.0,24.0,50.0,24.0,21.0,54.0,49.0,28.0,41.0
2018-01-01 01:05:00,4.0,9.0,29.0,60.0,57.0,58.0,29.0,56.0,31.0,22.0,...,22.0,44.0,21.0,53.0,42.0,35.0,55.0,59.0,24.0,41.0
2018-01-01 01:10:00,6.0,12.0,33.0,69.0,48.0,63.0,25.0,50.0,22.0,51.0,...,24.0,28.0,17.0,36.0,41.0,34.0,52.0,26.0,40.0,40.0
2018-01-01 01:15:00,0.0,15.0,55.0,72.0,51.0,49.0,20.0,52.0,13.0,50.0,...,9.0,57.0,31.0,56.0,43.0,32.0,53.0,57.0,46.0,33.0
2018-01-01 01:20:00,0.0,10.0,57.0,75.0,9.0,63.0,34.0,46.0,34.0,45.0,...,47.0,35.0,43.0,63.0,46.0,39.0,60.0,68.0,6.0,44.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-03-31 23:35:00,3.0,3.0,21.0,12.0,14.0,20.0,10.0,12.0,19.0,10.0,...,21.0,11.0,20.0,17.0,4.0,6.0,9.0,5.0,4.0,7.0
2020-03-31 23:40:00,3.0,2.0,13.0,13.0,14.0,22.0,12.0,8.0,5.0,12.0,...,9.0,10.0,11.0,2.0,5.0,2.0,5.0,6.0,1.0,8.0
2020-03-31 23:45:00,3.0,2.0,12.0,23.0,17.0,14.0,6.0,5.0,8.0,8.0,...,20.0,9.0,16.0,15.0,7.0,6.0,4.0,11.0,8.0,2.0
2020-03-31 23:50:00,0.0,3.0,14.0,9.0,27.0,9.0,9.0,2.0,3.0,9.0,...,13.0,7.0,13.0,10.0,1.0,7.0,2.0,4.0,5.0,1.0


In [88]:
# write the combined frame to a single h5 file (key 'df', overwriting any existing file)
save_path = '../data/hauge/processed/GNN_raw_data.h5'
df.to_hdf(path_or_buf=save_path, key='df', mode='w')

## Data correction in Trajectory 2

In [111]:
# K159_path = '../data/hauge/K159/'
# correction_list = ['K159-2018-1-.csv','K159-2018-2-.csv','K159-2018-3-.csv','K159-2018-4-.csv','K159-2018-5-.csv','K159-2018-6-.csv','K159-2018-7-1.csv','K159-2018-7-2.csv']

In [112]:
# # loop over all files in the folder and correct the column names
# for ls in os.listdir(K159_path):
#     if ls in correction_list:
#         print(ls)
#         load_path = os.path.join(K159_path, ls)
#         df = pd.read_csv(load_path, sep=';')
#         df.rename(columns={'21': '021', '81': '081','51': '051', '711':'713'}, inplace=True)
#         df.to_csv(load_path, sep=';', index=False)

In [113]:
# # merge the two files for 2018-7 and save to a new file
# csv1 = pd.read_csv('../data/hauge/K159/K159-2018-7-1.csv', sep=';')
# csv2 = pd.read_csv('../data/hauge/K159/K159-2018-7-2.csv', sep=';')

In [114]:
# comb_csv = pd.concat([csv1, csv2]).reset_index(drop=True)

In [115]:
# comb_csv.to_csv('../data/hauge/K159/K159-2018-7-.csv', sep=';', index=False)