In [2]:
import pandas as pd

## Data: How is the .csv organized?

For each .csv file the following apply:
* each row represents the data of a single vehicle
* the first 10 columns in the 1st row include the columns’ names
* the first 4 columns include information about the trajectory like the unique trackID, the type of vehicle, the distance traveled in meters and the average speed of the vehicle in km/h
* the last 6 columns are then repeated every 6 columns based on the time frequency. For example, column_5 contains the latitude of the vehicle at time column_10, and column_11 contains the latitude of the vehicle at time column_16.
* Speed is in km/h, longitudinal and lateral acceleration in m/s², and time in seconds.

In [3]:
# 1. Let's blindly try to read the data into a dataframe.
#
# This attempt is expected to fail: the rows of the file do not all have the
# same number of fields, so the parser raises a ParserError. The error is
# caught and printed (instead of left uncaught) so that the notebook still
# runs top to bottom under "Restart Kernel -> Run All".
try:
    df = pd.read_csv("../data/test_data.csv", sep=";")
except pd.errors.ParserError as err:
    print(f"ParserError: {err}")

ParserError: Error tokenizing data. C error: Expected 3011 fields in line 3, saw 10511


The error is caused by the fact that the rows do **not** all have an equal number of columns.

In [4]:
# Read the raw file line by line (one string per line, newline included),
# bypassing pandas because the rows have different numbers of fields.
with open("../data/test_data.csv", mode="r") as file:
    lines = list(file)

In [5]:
# Report how many raw lines (header included) were read from the file.
line_count = len(lines)
print(f"The number of rows/lines is {line_count}")

The number of rows/lines is 853


In [6]:
# The first line of the file holds the column names.
header_line = lines[0]
print(header_line) # column names
# Drop surrounding whitespace and any leading/trailing ';', then split on ';'.
print(header_line.strip().strip(';').split(';')) # columns names as a list

track_id; type; traveled_d; avg_speed; lat; lon; speed; lon_acc; lat_acc; time

['track_id', ' type', ' traveled_d', ' avg_speed', ' lat', ' lon', ' speed', ' lon_acc', ' lat_acc', ' time']


In [7]:
# Turn every raw line into a list of its ';'-separated fields.
lines_as_lists = []
for line in lines:
    # Remove surrounding whitespace and any leading/trailing ';' before
    # splitting, so a trailing separator does not create an empty field.
    fields = line.strip().strip(';').split(';')
    lines_as_lists.append(fields)
len(lines_as_lists)

853

In [9]:
# Compare the number of fields in the first two data rows.
n_fields_row1 = len(lines_as_lists[1])
n_fields_row2 = len(lines_as_lists[2])
print(f"the number of fields in row 1 is {n_fields_row1}, row 2 is {n_fields_row2}")

the number of fields in row 1 is 3010, row 2 is 10510


The number of fields in each row is 4 + 6 * n

* The first 4 unique values are: track_id; type; traveled_d; avg_speed
* The remaining fields are repeated sequences of: 
  * lat_1; lon_1; speed_1; lon_acc_1; lat_acc_1; time_1
  * lat_2; lon_2; speed_2; lon_acc_2; lat_acc_2; time_2
  * ...
  * lat_n; lon_n; speed_n; lon_acc_n; lat_acc_n; time_n

* n depends on the time frequency. It is different for each row.


In [12]:
# The widest row determines the largest number of time steps n,
# since every row has 4 + 6 * n fields.
no_field_max = max((len(row) for row in lines_as_lists), default=0)
print(f"the maximum number of fields is {no_field_max}")

# Invert fields = 4 + 6 * n to recover n for the widest row.
largest_n = int((no_field_max - 4) / 6)
print(f"the largest n = {largest_n}")

the maximum number of fields is 98410
the largest n = 16401


We can divide each row into 2 parts:
1.  A list of the first 4 values. 
    - These will correspond to ['track_id', ' type', ' traveled_d', ' avg_speed']
2.  A matrix of size n × 6 (n rows, 6 columns) built from the remaining values.
    - These will correspond to a table with columns [' lat', ' lon', ' speed', ' lon_acc', ' lat_acc', ' time'] and n rows.
    - We will append the 'track_id' value to each row to keep track of the vehicle identity.

In [13]:
# Separate the header row (column names) from the data rows.
#
# The header is parsed from the raw `lines`, which is never mutated, and it is
# removed from `lines_as_lists` only while it is still the first row. A bare
# `lines_as_lists.pop(0)` would drop a data row (and bind `cols` to vehicle
# data) every time this cell is re-run; this version is safe to re-run.
cols = lines[0].strip('\n').strip().strip(';').split(';')
if lines_as_lists and lines_as_lists[0] == cols:
    lines_as_lists.pop(0)

In [14]:
# Column names for the per-vehicle summary table: the first four header entries.
track_cols = cols[:4]
# Column names for the per-time-step table: the remaining header entries,
# prefixed with 'track_id' so each sample can be traced back to its vehicle.
trajectory_cols = ['track_id', *cols[4:]]

print(track_cols, trajectory_cols, sep='\n')


['track_id', ' type', ' traveled_d', ' avg_speed']
['track_id', ' lat', ' lon', ' speed', ' lon_acc', ' lat_acc', ' time']


In [ ]:
# Split every data row into:
#   * track_info:      one entry per vehicle holding its first 4 values
#   * trajectory_info: one entry per time step, [track_id, <next 6 values>]
track_info = []
trajectory_info = []

for row in lines_as_lists:
    track_id = row[0]

    # add the first 4 values to track_info
    track_info.append(row[:4])

    remaining_values = row[4:]
    # reshape the flat list into chunks of 6 values and prepend track_id
    trajectory_matrix = [
        [track_id] + remaining_values[i:i + 6]
        for i in range(0, len(remaining_values), 6)
    ]
    # Append the chunks in place. Rebuilding the list with
    # `trajectory_info = trajectory_info + trajectory_matrix` copies everything
    # accumulated so far on every iteration, which is quadratic in the total
    # number of samples; `extend` only touches the new chunks.
    trajectory_info.extend(trajectory_matrix)


In [17]:
# Per-vehicle summary table: one row per track, labelled with the four
# summary column names.
df_track = pd.DataFrame(track_info, columns=track_cols)

df_track.head(n=20)

Unnamed: 0,track_id,type,traveled_d,avg_speed
0,1,Car,134.88,24.278704
1,2,Car,426.97,21.958339
2,3,Car,206.58,8.263246
3,4,Car,261.45,30.361735
4,5,Taxi,264.12,16.979263
5,6,Car,251.9,28.516707
6,7,Car,245.85,30.731733
7,8,Motorcycle,240.66,33.322447
8,9,Taxi,186.73,26.465008
9,10,Motorcycle,234.34,35.150658


In [18]:
# Per-time-step table: one row per sample, each carrying the track_id of the
# vehicle it belongs to.
df_trajectory = pd.DataFrame(trajectory_info, columns=trajectory_cols)

df_trajectory.head()

Unnamed: 0,track_id,lat,lon,speed,lon_acc,lat_acc,time
0,1,37.979513,23.736025,35.3204,-0.2996,-0.0175,14.0
1,1,37.97951,23.736027,35.299,-0.2788,-0.0188,14.04
2,1,37.979508,23.73603,35.28,-0.2656,-0.02,14.08
3,1,37.979505,23.736032,35.2624,-0.2589,-0.0213,14.12
4,1,37.979502,23.736035,35.246,-0.246,-0.0225,14.16
