In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pickle
import pandas as pd
import seaborn as sns
import math

def haversine_distance(lat1, lon1, lat2, lon2, radius=6371.0):
    """
    Calculate the great circle distance between two points on a sphere,
    specified in decimal degrees, using the Haversine formula.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in decimal degrees.
    radius : float, optional
        Radius of the sphere; the result is expressed in the same unit.
        Defaults to 6371 (radius of earth in kilometers). Use 3956 for miles.

    Returns
    -------
    float
        Great circle distance between the two points, in the unit of `radius`.
    """
    # Convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(math.radians, [lon1, lat1, lon2, lat2])

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally above 1 for
    # (near-)antipodal points, which would make math.asin raise ValueError.
    a = min(1.0, a)
    c = 2 * math.asin(math.sqrt(a))
    return c * radius

def calculate_distance(positions):
    """
    Calculate the total path length along a sequence of positions.

    Each position is a tuple (latitude, longitude). The distance between
    every pair of consecutive positions is computed with haversine_distance
    and the segment lengths are added up in order.

    Only the total is returned; normalized cumulative distances (s values)
    are not computed here.

    Returns 0 when `positions` holds fewer than two points.
    """
    total_distance = 0

    # Walk consecutive (previous, current) pairs; accumulate left to right so
    # the floating-point result does not depend on the summation routine.
    for (lat1, lon1), (lat2, lon2) in zip(positions, positions[1:]):
        total_distance += haversine_distance(lat1, lon1, lat2, lon2)

    return total_distance


# Load the pickled DataFrame used by every cell below.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# open pickles that come from a trusted source.
with open('datasets/teconer_helsinki_jan2018_df.pkl', 'rb') as pkl_file:
    df = pickle.load(pkl_file)

# Show which columns are available.
print(df.columns)


Index(['UnixTime', 'Timestamp', 'Latitude', 'Longitude', 'Height', 'Speed',
       'Direction', 'Ta', 'Tsurf', 'Distance', 'S1', 'S2', 'S3', 'S9', 'S10',
       'S11', 'Hour', 'Month', 'Day', 'VehicleID', 'TripID', 'Friction'],
      dtype='object')


In [9]:
# Chronologically ordered view of the full dataset (no sampling is applied).
df__ = df.sort_values(by='UnixTime')

# Display only the columns of interest.
preview_columns = [
    'UnixTime', 'Latitude', 'Longitude', 'Height', 'Speed', 'Direction',
    'Ta', 'Tsurf', 'Hour', 'VehicleID', 'TripID', 'Friction',
]
df__[preview_columns]

Unnamed: 0,UnixTime,Latitude,Longitude,Height,Speed,Direction,Ta,Tsurf,Hour,VehicleID,TripID,Friction
1763676,1514777137,60.299898,24.959475,35.0,82.4,197.6,1.1,0.40,3,0209,17776,0.29
1763677,1514777138,60.299700,24.959338,35.0,82.6,197.2,1.1,0.22,3,0209,17776,0.30
1763678,1514777139,60.299504,24.959214,35.0,81.8,196.2,1.1,0.13,3,0209,17776,0.31
1763679,1514777140,60.299310,24.959099,36.0,81.0,195.3,1.1,0.27,3,0209,17776,0.30
1763680,1514777142,60.299116,24.958995,36.0,80.1,194.0,1.1,0.42,3,0209,17776,0.31
...,...,...,...,...,...,...,...,...,...,...,...,...
421299,1517442029,60.273652,24.801921,27.0,87.0,251.4,-0.5,-0.90,23,2302,8348,0.68
421300,1517442030,60.273586,24.801490,27.0,87.5,252.3,-0.5,-0.63,23,2302,8348,0.68
421301,1517442031,60.273527,24.801079,28.0,85.6,253.9,-0.5,-0.72,23,2302,8348,0.68
421302,1517442033,60.273472,24.800667,29.0,85.3,254.1,-0.5,-0.87,23,2302,8348,0.68


In [19]:
# Train a decision tree model on the data, predict the 'Friction' values on a
# held-out split and report error metrics.
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Features: every column except the target ('Friction') and the columns
# listed below; the target is kept separately in y.
X = df.drop(columns=['Friction','S1', 'S2', 'S3', 'S9', 'S10','S11','Month', 'Day', 'VehicleID', 'TripID','Distance']).to_numpy(dtype=np.float32)
y = df['Friction']

# Split the data into training (90%) and testing (10%) sets.
# NOTE(review): this is a random row-level split; rows of the same TripID look
# like consecutive samples, so neighbouring rows can land on both sides of the
# split. Consider a split grouped by TripID -- left unchanged here, confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Train the model. The estimator is seeded as well: scikit-learn permutes the
# features at each split, so ties between equally good splits are otherwise
# broken differently from run to run.
model = DecisionTreeRegressor(max_depth=5, min_samples_leaf=5, random_state=42)
model.fit(X_train, y_train)

# Predict the 'Friction' values for the held-out rows
y_pred = model.predict(X_test)

# Error metrics on the held-out rows
mse = mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)
mae = np.mean(np.abs(y_test - y_pred))
r2 = model.score(X_test, y_test)  # coefficient of determination


print(f'Root Mean Squared Error: {rmse}')
print(f'Mean Absolute Error: {mae}')
print(f'R^2: {r2}')




Root Mean Squared Error: 0.08872895519133454
Mean Absolute Error: 0.056397897936266095
R^2: 0.7197101356771137


In [3]:
# Summary statistics for the whole dataset, collected in one dict.
info = {}
info['Number of Trips'] = len(df['TripID'].unique())
info['Number of Records'] = len(df)
info['Friction Mean'] = df['Friction'].mean()
info['Friction Std'] = df['Friction'].std()
info['Average Speed'] = df['Speed'].mean()


# Per-trip quantities: path length plus first and last UnixTime.
grouped = df.groupby('TripID')
trip_lengths = []
# Placeholder for per-trip s values; nothing in this cell fills it.
s_values = []
start_abs_times = []
end_abs_times = []

for name, group in grouped:
    # Order the trip chronologically so that the path length and the
    # start/end times do not depend on the row order of df. The stable sort
    # keeps the existing order of rows that share a UnixTime.
    group = group.sort_values('UnixTime', kind='stable')
    # calculate the distance travelled along the trip
    dist = calculate_distance(list(zip(group['Latitude'], group['Longitude'])))
    trip_lengths.append(dist)
    # store trip start and end time
    start_abs_times.append(group['UnixTime'].iloc[0])
    end_abs_times.append(group['UnixTime'].iloc[-1])

info['Average Trip Length'] = np.mean(trip_lengths)

print(info)

{'Number of Trips': 876, 'Number of Records': 1845547, 'Friction Mean': 0.6343880378012585, 'Friction Std': 0.1673198325555451, 'Average Speed': 20.843659684635504, 'Average Trip Length': 14.67026014138912}


In [14]:
# Average number of trips that are active at the same time, sampled on a
# fixed 60-second grid.
from_time = min(start_abs_times)
to_time = max(end_abs_times)

# Sampling instants: every 60 s from the earliest trip start up to (but not
# including) the latest trip end.
sample_times = np.arange(from_time, to_time, 60)

# A trip is active at instant t when start <= t <= end. Adding one vectorized
# mask per trip gives the same counts as testing every (instant, trip) pair
# in pure Python, at a fraction of the cost.
counts = np.zeros(len(sample_times), dtype=np.int64)
for trip_start, trip_end in zip(start_abs_times, end_abs_times):
    counts += (sample_times >= trip_start) & (sample_times <= trip_end)

# Instants without any active trip are excluded from the average.
active_trip_counts = counts[counts > 0].tolist()

# print the average number of trips that are active at the same time
print(np.mean(active_trip_counts))



1.3465666391471338


26649

In [30]:
%matplotlib qt
plt.close('all')

# randomly sample 10K records
# sample = df.sample(1000000)
sample = df
# scatter plot the lat long or the records
# markers = [[60.235312, 24.899538],[60.237943, 25.040537], [60.209376, 24.815294]]
plt.figure(figsize=(10, 10))


plt.scatter(sample['Longitude'], sample['Latitude'], s=0.1, alpha=0.02)
# plt.scatter([m[1] for m in markers], [m[0] for m in markers], c='red', s=100, marker='x')
# mark the city Espoo with 60.205620, 24.656452
# plt.scatter(24.947722,60.167789, c='green', s=100, marker='x')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.grid(True)
# make the plot square
plt.gca().set_aspect('equal', adjustable='box')

In [49]:
%matplotlib qt
plt.close('all')

# for i in range(20):
# select samples that are within the first week of January
sample = df[(df['Timestamp'] >= '2018-01-03') & (df['Timestamp'] < '2018-01-04')]


# scatter plot the lat long or the records
# markers = [[60.235312, 24.899538],[60.237943, 25.040537], [60.209376, 24.815294]]
plt.figure(figsize=(12, 5))
plt.scatter(sample['Longitude'], sample['Latitude'], c=sample['Friction'], cmap='viridis', s=0.05, alpha=0.9, vmin=0.2, vmax=0.8)
plt.title('helsinki_day_4')
# place colorbar
plt.colorbar(label='Friction')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.grid(True)
# make the plot square
plt.gca().set_aspect('equal', adjustable='box')
# save the plot
plt.savefig('helsinki_day_4_bar.png')