In [84]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# Raw training data: one row per timestamp, one column per station.
train_csv_path = './podatki/bicikelj_train.csv'
df_raw = pd.read_csv(train_csv_path)

# Quick look at the first rows (head() defaults to 5).
display(df_raw.head())

Unnamed: 0,timestamp,PREŠERNOV TRG-PETKOVŠKOVO NABREŽJE,POGAČARJEV TRG-TRŽNICA,KONGRESNI TRG-ŠUBIČEVA ULICA,CANKARJEVA UL.-NAMA,BREG,GRUDNOVO NABREŽJE-KARLOVŠKA C.,MIKLOŠIČEV PARK,BAVARSKI DVOR,TRG OF-KOLODVORSKA UL.,...,POVŠETOVA - KAJUHOVA,SOSESKA NOVO BRDO,TEHNOLOŠKI PARK,VOJKOVA - GASILSKA BRIGADA,GERBIČEVA - ŠPORTNI PARK SVOBODA,DOLENJSKA C. - STRELIŠČE,ROŠKA - STRELIŠKA,LEK - VEROVŠKOVA,VOKA - SLOVENČEVA,SUPERNOVA LJUBLJANA - RUDNIK
0,2022-08-02 13:04:00,17,17,20,26,10,10,16,7,12,...,9,7,5,4,2,9,4,8,3,1
1,2022-08-02 13:10:00,18,17,19,25,10,11,15,7,12,...,9,7,5,4,1,9,4,8,2,1
2,2022-08-02 13:14:00,17,18,19,25,9,11,16,7,12,...,9,7,5,4,1,9,4,8,2,1
3,2022-08-02 13:20:00,18,15,20,26,12,11,18,6,13,...,9,7,5,2,1,9,4,8,2,1
4,2022-08-02 13:25:00,20,16,19,23,12,10,17,6,15,...,9,7,5,2,1,9,4,8,2,1


In [85]:
def add_closest_times(df_ts, type, diff):
    """Merge the pre-generated lag columns for one horizon into ``df_ts``.

    Reads ``./generiraj/closest_{diff}h_{type}.csv``, appends the suffix
    ``_closest_{diff}h`` to every column except the first, and outer-joins the
    result onto ``df_ts`` using the ``timestamp`` column.

    Args:
        df_ts: Frame containing a ``timestamp`` column.
        type: Data set label used in the file name (the callers pass
            ``"train"`` or ``"test"``).
        diff: Horizon in hours, used in both the file name and the suffix.

    Returns:
        A new merged DataFrame; ``df_ts`` itself is not modified.
    """
    suffix = f'_closest_{diff}h'
    lagged = pd.read_csv(f'./generiraj/closest_{diff}h_{type}.csv')

    # The first column is left alone (it is expected to be the join key).
    renamed = {name: name + suffix for name in lagged.columns[1:]}

    return df_ts.merge(lagged.rename(columns=renamed), on='timestamp', how='outer')

In [86]:
def add_previous_times(df_ts, df_prev):
    """Attach the values in ``df_prev`` to ``df_ts`` as ``_closest_1h`` columns.

    Every column of ``df_prev`` except the first gets the suffix
    ``_closest_1h``; the renamed frame is then outer-joined onto ``df_ts``
    using the ``timestamp`` column.

    Args:
        df_ts: Frame containing a ``timestamp`` column.
        df_prev: Frame whose first column is left unrenamed and which also
            contains ``timestamp`` (the join key).

    Returns:
        A new merged DataFrame; neither input is modified.
    """
    suffix = '_closest_1h'
    renamed = dict((name, name + suffix) for name in df_prev.columns[1:])
    lagged = df_prev.rename(columns=renamed)

    return df_ts.merge(lagged, on='timestamp', how='outer')

In [87]:
def add_is_rainy(df, threshold=5):
    """Add a binary ``is_rainy`` column based on daily precipitation totals.

    Precipitation is read from ``./podatki/export.csv`` (columns ``date`` and
    ``prcp``) and summed per calendar day. A day counts as rainy when its
    total is strictly greater than ``threshold``.

    Args:
        df: Frame with a ``timestamp`` column (datetime values or strings that
            ``pd.to_datetime`` can parse). The ``is_rainy`` column is added to
            this frame in place.
        threshold: Daily precipitation total above which a day is rainy.

    Returns:
        The same ``df`` object, now carrying an integer 0/1 ``is_rainy``
        column. Timestamps whose day has no precipitation record get 0.
    """
    df_precipitation = pd.read_csv('./podatki/export.csv').loc[:, ['date', 'prcp']]

    # Key both sides on midnight-normalized datetime64 values so the lookup
    # never has to compare datetime.date objects against Timestamps.
    precipitation_day = pd.to_datetime(df_precipitation['date']).dt.normalize()
    daily_total = df_precipitation['prcp'].groupby(precipitation_day).sum()
    rainy_by_day = (daily_total > threshold).astype(int)

    timestamp_day = pd.to_datetime(df['timestamp'])
    if timestamp_day.dt.tz is not None:
        # Drop the zone but keep the local wall-clock date.
        timestamp_day = timestamp_day.dt.tz_localize(None)
    timestamp_day = timestamp_day.dt.normalize()

    # Days missing from the precipitation file map to NaN; treat them as
    # "not rainy" instead of failing in the integer cast.
    df['is_rainy'] = timestamp_day.map(rainy_by_day).fillna(0).astype(int)
    return df

In [88]:
# Fresh working copy so re-running this cell always starts from the raw data.
df = df_raw.copy(deep=True)
def preprocess_data(df, type, diff):
    """Turn a raw station frame into the model's feature/target table.

    For ``type == "train"`` the 1h and 2h lag columns are merged in here via
    ``add_closest_times`` and rows containing any NaN are dropped at the end.
    For any other ``type`` the lag columns are expected to be present already
    (the prediction cell merges them before calling this function) and no
    rows are dropped.

    Added feature columns, in this order: ``is_august``, ``is_weekend``,
    ``hour_0`` .. ``hour_23``, ``dayofweek_0`` .. ``dayofweek_6``, ``is_rainy``.
    The ``timestamp`` column is removed from the result.

    Args:
        df: Frame with a ``timestamp`` column plus the station columns.
        type: ``"train"`` enables lag merging and NaN-row removal.
        diff: Currently unused; kept so existing callers keep working.

    Returns:
        A new DataFrame; the caller's frame is left unmodified.
    """
    if type == "train":
        df = add_closest_times(df, type, 1)
        df = add_closest_times(df, type, 2)
    else:
        # The merges above already produce a new frame; do the same here so
        # the column assignments below never touch the caller's object.
        df = df.copy()

    df['timestamp'] = pd.to_datetime(df['timestamp'])
    hour = df['timestamp'].dt.hour
    dayofweek = df['timestamp'].dt.dayofweek

    df['is_august'] = (df['timestamp'].dt.month == 8).astype(int)
    df['is_weekend'] = dayofweek.isin([5, 6]).astype(int)

    for hour_value in range(24):
        df[f'hour_{hour_value}'] = (hour == hour_value).astype(int)

    # Always emit all seven weekday indicators. A data-dependent encoding
    # would drop columns for weekdays absent from a small frame and shift the
    # positional layout the fitted models rely on.
    for day_value in range(7):
        df[f'dayofweek_{day_value}'] = (dayofweek == day_value).astype(int)

    df = add_is_rainy(df)

    df = df.drop(columns=['timestamp'])

    if type == "train":
        df = df.dropna()

    return df

    
# Build the training table (targets followed by features).
df = preprocess_data(df, type="train", diff=1)

# Keep a copy on disk for inspection.
df.to_csv('./test/test.csv', index=False)


In [89]:
# NORMALIZACIJA
from sklearn.preprocessing import Normalizer
    

def normalizacija_train(X):
    """Fit an L2 ``Normalizer`` on ``X`` and return the transformed frame.

    Args:
        X: Feature DataFrame.

    Returns:
        Tuple ``(X_normalized, normalizer)``. ``X_normalized`` keeps the
        column names and the row index of ``X`` so it stays aligned with the
        matching target rows; ``normalizer`` is the fitted transformer, to be
        reused by ``normalizacija_test``.
    """
    normalizer = Normalizer(norm='l2')
    normalized_data = normalizer.fit_transform(X)
    # Preserve the original index: after dropna() it is no longer 0..n-1.
    X = pd.DataFrame(normalized_data, columns=X.columns, index=X.index)
    return X, normalizer

def normalizacija_test(X, normalizer):
    """Apply an already fitted normalizer to ``X``.

    Args:
        X: Feature DataFrame.
        normalizer: Fitted transformer exposing ``transform`` (for example the
            one returned by ``normalizacija_train``).

    Returns:
        DataFrame of transformed values with the same columns and the same
        row index as ``X``.
    """
    normalized_data = normalizer.transform(X)
    # Preserve the original index so rows stay aligned with other frames.
    return pd.DataFrame(normalized_data, columns=X.columns, index=X.index)

In [90]:
# STANDARDIZACIJA
from sklearn.discriminant_analysis import StandardScaler



def standardizacija_train(X):
    """Fit a ``StandardScaler`` on ``X`` and return the scaled frame.

    Args:
        X: Feature DataFrame.

    Returns:
        Tuple ``(X_scaled, scaler)``. ``X_scaled`` keeps the column names and
        the row index of ``X`` so it stays aligned with the matching target
        rows; ``scaler`` is the fitted transformer, to be reused by
        ``standardizacija_test``.
    """
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(X)
    # Preserve the original index: after dropna() it is no longer 0..n-1.
    X = pd.DataFrame(df_scaled, columns=X.columns, index=X.index)
    return X, scaler

def standardizacija_test(X, scaler):
    """Apply an already fitted scaler to ``X`` (no re-fitting happens here).

    Args:
        X: Feature DataFrame.
        scaler: Fitted transformer exposing ``transform`` (for example the one
            returned by ``standardizacija_train``).

    Returns:
        DataFrame of scaled values with the same columns and the same row
        index as ``X``.
    """
    df_scaled = scaler.transform(X)
    # Preserve the original index so rows stay aligned with other frames.
    return pd.DataFrame(df_scaled, columns=X.columns, index=X.index)

In [91]:
def split_x_y(df, n_targets=83):
    """Split a preprocessed frame into features and targets by position.

    The first ``n_targets`` columns are treated as targets and every column
    after them as features.

    Args:
        df: Frame whose leading columns are the target columns.
        n_targets: Number of leading target columns. Defaults to 83, the
            value this notebook was written with.

    Returns:
        Tuple ``(X, y)``: the feature columns and the target columns.
    """
    X = df.iloc[:, n_targets:]
    y = df.iloc[:, :n_targets]

    return X, y

#X = standardizacija_train(X)
#X = normalizacija_train(X)
# Features (X) and per-station targets (Y) from the preprocessed table.
X, Y = split_x_y(df)

# Show the last rows of each (tail() defaults to 5).
display(X.tail(), Y.tail())

Unnamed: 0,PREŠERNOV TRG-PETKOVŠKOVO NABREŽJE_closest_1h,POGAČARJEV TRG-TRŽNICA_closest_1h,KONGRESNI TRG-ŠUBIČEVA ULICA_closest_1h,CANKARJEVA UL.-NAMA_closest_1h,BREG_closest_1h,GRUDNOVO NABREŽJE-KARLOVŠKA C._closest_1h,MIKLOŠIČEV PARK_closest_1h,BAVARSKI DVOR_closest_1h,TRG OF-KOLODVORSKA UL._closest_1h,MASARYKOVA DDC_closest_1h,...,hour_22,hour_23,dayofweek_0,dayofweek_1,dayofweek_2,dayofweek_3,dayofweek_4,dayofweek_5,dayofweek_6,is_rainy
7734,18.0,13.0,5.0,15.0,12.0,6.0,13.0,20.0,17.0,0.0,...,0,0,0,0,0,0,0,1,0,0
7735,20.0,16.0,7.0,14.0,12.0,5.0,13.0,16.0,17.0,1.0,...,0,0,0,0,0,0,0,1,0,0
7736,20.0,17.0,7.0,13.0,12.0,5.0,13.0,16.0,16.0,1.0,...,0,0,0,0,0,0,0,1,0,0
7737,20.0,15.0,7.0,12.0,12.0,5.0,13.0,16.0,17.0,1.0,...,0,0,0,0,0,0,0,1,0,0
7738,20.0,15.0,7.0,13.0,12.0,4.0,13.0,16.0,17.0,1.0,...,0,0,0,0,0,0,0,1,0,0


Unnamed: 0,PREŠERNOV TRG-PETKOVŠKOVO NABREŽJE,POGAČARJEV TRG-TRŽNICA,KONGRESNI TRG-ŠUBIČEVA ULICA,CANKARJEVA UL.-NAMA,BREG,GRUDNOVO NABREŽJE-KARLOVŠKA C.,MIKLOŠIČEV PARK,BAVARSKI DVOR,TRG OF-KOLODVORSKA UL.,MASARYKOVA DDC,...,POVŠETOVA - KAJUHOVA,SOSESKA NOVO BRDO,TEHNOLOŠKI PARK,VOJKOVA - GASILSKA BRIGADA,GERBIČEVA - ŠPORTNI PARK SVOBODA,DOLENJSKA C. - STRELIŠČE,ROŠKA - STRELIŠKA,LEK - VEROVŠKOVA,VOKA - SLOVENČEVA,SUPERNOVA LJUBLJANA - RUDNIK
7734,18,17,14,7,7,5,10,20,25,4,...,18,3,8,3,2,17,13,13,11,7
7735,20,17,13,7,7,5,11,19,24,4,...,18,2,8,3,2,17,13,13,11,7
7736,19,18,14,5,8,3,10,20,23,5,...,17,3,8,2,2,17,13,13,11,7
7737,19,17,16,4,6,3,9,17,21,5,...,17,3,8,1,1,17,13,13,11,7
7738,20,17,15,3,6,6,9,16,21,5,...,17,3,8,2,1,19,13,13,11,7


In [92]:
from sklearn.model_selection import train_test_split

def split_train_test(X, y, test_size=0.2, random_state=42):
    """Split features and targets into train and test parts.

    Thin wrapper around ``train_test_split``.

    Args:
        X: Features.
        y: Targets.
        test_size: Passed through to ``train_test_split`` (default 0.2).
        random_state: Passed through to ``train_test_split`` (default 42).

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    # Hand the four parts back to the caller.
    return X_train, X_test, y_train, y_test
    
#X_train, X_test, y_train, y_test = split_train_test(X, y)

In [93]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import cross_validate
from scipy import stats

def cross_validation(model, X, y, n_folds=5):
    """Cross-validate ``model`` and summarise its mean absolute error.

    Runs ``cross_val_score`` with ``cv=n_folds`` and the
    ``neg_mean_absolute_error`` scorer.

    Args:
        model: Estimator handed to ``cross_val_score``.
        X: Features.
        y: Target(s).
        n_folds: Value passed as ``cv``.

    Returns:
        One-row DataFrame with columns ``MAE mean`` and ``MAE std`` (both
        rounded to 3 decimals) computed over the fold scores.
    """
    fold_scores = cross_val_score(
        model, X, y, cv=n_folds, scoring='neg_mean_absolute_error')

    # The scorer reports negated errors; flip the sign of the (rounded) mean.
    mae_mean = -(np.mean(fold_scores, axis=0).round(3))
    mae_std = np.std(fold_scores, axis=0).round(3)

    return pd.DataFrame(
        np.array([[mae_mean, mae_std]]),
        columns=['MAE mean', 'MAE std'],
    )

In [94]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def display_cv_scores(scores, n_folds=5):
    """Summarise, display and return MSE/MAE statistics of CV results.

    Args:
        scores: Mapping with the keys ``test_neg_mean_squared_error`` and
            ``test_neg_mean_absolute_error`` holding negated per-fold errors
            (values must support unary minus, e.g. NumPy arrays).
        n_folds: Not used by this function.

    Returns:
        One-row DataFrame with columns ``MSE mean``, ``MSE std``,
        ``MAE mean`` and ``MAE std``, each rounded to 4 decimals. The same
        frame is also shown via ``display``.
    """
    def _mean_and_std(key):
        # Stored values are negated errors; flip them back before aggregating.
        errors = -scores[key]
        return np.mean(errors).round(4), np.std(errors).round(4)

    mse_mean, mse_std = _mean_and_std('test_neg_mean_squared_error')
    mae_mean, mae_std = _mean_and_std('test_neg_mean_absolute_error')

    scores_df = pd.DataFrame(
        np.array([[mse_mean, mse_std, mae_mean, mae_std]]),
        columns=['MSE mean', 'MSE std', 'MAE mean', 'MAE std'],
    )
    display(scores_df)

    return scores_df

### Linear regression

In [95]:
# from sklearn.linear_model import LinearRegression
# from sklearn.model_selection import cross_val_score
# from sklearn.multioutput import MultiOutputRegressor

# # Create a multi-output regression model
# model = MultiOutputRegressor(LinearRegression())

# score = cross_validation(model, X, y)

# model.fit(X,y)
    


### Ridge

In [96]:
# from sklearn.linear_model import LinearRegression, Ridge
# from sklearn.model_selection import cross_val_score
# from sklearn.multioutput import MultiOutputRegressor

# # Create a multi-output regression model
# model = MultiOutputRegressor(Ridge())

# score = cross_validation(model, X, y)

# model.fit(X,y)
# print(model.get_params())

### Lasso

In [97]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import cross_val_score
from sklearn.multioutput import MultiOutputRegressor

# models = []
# # Create a multi-output regression model

# for X, y in [[X_1, y_1], [X_2, y_2]]:
#     model = MultiOutputRegressor(Lasso())

#     score = cross_validation(model, X, y)

#     model.fit(X,y)
#     print(model.get_params())
#     models.append(model)

### Single output (one Ridge model per station)

In [98]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import cross_val_score
from sklearn.multioutput import MultiOutputRegressor


# One independent Ridge model per station (target column). Models are stored
# in the order of Y.columns so they can be paired back up by position later.
single_models = []
score = pd.DataFrame(np.array([[0, 0]]), columns=['MAE mean','MAE std'])

for i, column in enumerate(Y.columns):
    # Keep the columns whose name starts with this station's name, plus
    # every column whose name does not end in 'h'.
    feature_columns = [
        col for col in X.columns
        if col.startswith(column) or not col.endswith('h')
    ]
    x = X[feature_columns]
    y = Y[column]

    model = Ridge()

    # Accumulate the cross-validated MAE summary, then fit on all rows.
    single_score = cross_validation(model, x, y)
    score += single_score
    model.fit(x, y)

    single_models.append(model)

# Average the accumulated summaries over all target columns.
score = score / Y.shape[1]
display(score)

# feature_names = scalers[0][0].get_feature_names_out()
# print(123, feature_names)


Unnamed: 0,MAE mean,MAE std
0,1.565687,0.15306


### KNN

In [99]:
# from sklearn.linear_model import LinearRegression, Ridge, Lasso
# from sklearn.model_selection import cross_val_score
# from sklearn.multioutput import MultiOutputRegressor
# from sklearn.neighbors import KNeighborsRegressor

# models = []
# # Create a multi-output regression model

# for X, y in [[X_1, y_1], [X_2, y_2]]:
#     model = MultiOutputRegressor(KNeighborsRegressor())

#     score = cross_validation(model, X, y)

#     model.fit(X,y)
#     print(model.get_params())
#     models.append(model)

### Predict on final test set

In [100]:
# Test set for which predictions are produced in this cell.
test_csv_path = './podatki/bicikelj_test.csv'
df_test = pd.read_csv(test_csv_path)

# test_df_1h = test_df[::2]
# test_df_2h = test_df[1::2] 

# test_dfs = [test_df_1h, test_df_2h]

#display(test_df_1h)

# def predict(df, model):
    
#     X_test, _ = split_x_y(df)
#     display(X_test.head(40))

#     column_names = y.columns

#     pred = model.predict(X_test).clip(min=0)
#     #pred =  np.round(pred).clip(min=0).astype(int)

#     y_pred_df = pd.DataFrame(pred, index=df.index, columns=column_names)
    
#     # write the modified DataFrame back to a CSV file
#     return y_pred_df



def predict(df, single_models):
    """Predict every target column with its own fitted model.

    ``df`` is split with ``split_x_y``; model ``i`` predicts target column
    ``i``. Each model receives the feature columns whose name starts with its
    target column's name plus every feature column whose name does not end in
    ``'h'`` (the same selection used when the models were trained).
    Predictions are clipped at 0 and rounded to whole numbers.

    Args:
        df: Preprocessed frame laid out as ``split_x_y`` expects.
        single_models: Fitted models, ordered like the target columns.

    Returns:
        DataFrame shaped like the target part of ``df`` with the first
        ``len(single_models)`` columns replaced by predictions. ``df`` itself
        is not modified.
    """
    X_test, y_pred = split_x_y(df)

    # Write into an explicit copy: the target part returned by split_x_y is a
    # slice of df, and assigning into it could leak into the caller's frame
    # (or trigger SettingWithCopyWarning).
    y_pred = y_pred.copy()

    for i, model in enumerate(single_models):
        target_name = y_pred.columns[i]
        feature_columns = [
            col for col in X_test.columns
            if col.startswith(target_name) or not col.endswith('h')
        ]

        pred = model.predict(X_test[feature_columns]).clip(min=0).round(0)

        y_pred.iloc[:, i] = pred

    return y_pred

pred_dfs = []

# PREDICTION FOR 1H
X_ts = df_test.iloc[:, 0]

test_df = df_test.copy()

test_df = add_closest_times(test_df, "test", 1)
test_df = add_closest_times(test_df, "test", 2)
# Pass the horizon as a literal: the previous `i + 1` only worked because the
# loop variable `i` leaked out of the training cell. preprocess_data does not
# use this argument, so the result is unchanged.
test_df = preprocess_data(test_df, "test", 1)

# Overwrite every odd row with the values of the even row before it.
# NOTE(review): presumably rows come in (1h, 2h) pairs and this lets the 1h
# model also produce a value for the 2h row — confirm.
assert len(test_df) % 2 == 0, "expected an even number of test rows"
test_df[1::2] = test_df[::2].values
y_pred_df_1 = predict(test_df, single_models)
pred_df_1 = pd.concat([X_ts, y_pred_df_1], axis=1)
pred_dfs.append(pred_df_1)


# ---------------------- #

# PREDICTION FOR 2H
X_ts = df_test.iloc[:, 0]

test_df = df_test.copy()

# The first-pass predictions become the `_closest_1h` inputs of this pass.
test_df = add_previous_times(test_df, pred_df_1)
test_df = add_closest_times(test_df, "test", 2)

test_df = preprocess_data(test_df, "test", 2)
y_pred_df_2 = predict(test_df, single_models)
pred_df_2 = pd.concat([X_ts, y_pred_df_2], axis=1)

# SELECT EVERY SECOND ROW
# Even positions are taken from the first pass, odd positions from the second.
pred_df_1 = pred_df_1.drop(index=pred_df_1.index[1::2])
pred_df_2 = pred_df_2.drop(index=pred_df_2.index[::2])

# Combine the two subsets into a new DataFrame, ordered by time
new_df = pd.concat([pred_df_1, pred_df_2], ignore_index=True)
new_df['timestamp'] = pd.to_datetime(new_df['timestamp'])
final_df = new_df.sort_values(by='timestamp')
display(final_df.head(40))

final_df.to_csv('./output/bicikelj_test_oddaja.csv', index=False)

Unnamed: 0,timestamp,PREŠERNOV TRG-PETKOVŠKOVO NABREŽJE,POGAČARJEV TRG-TRŽNICA,KONGRESNI TRG-ŠUBIČEVA ULICA,CANKARJEVA UL.-NAMA,BREG,GRUDNOVO NABREŽJE-KARLOVŠKA C.,MIKLOŠIČEV PARK,BAVARSKI DVOR,TRG OF-KOLODVORSKA UL.,...,POVŠETOVA - KAJUHOVA,SOSESKA NOVO BRDO,TEHNOLOŠKI PARK,VOJKOVA - GASILSKA BRIGADA,GERBIČEVA - ŠPORTNI PARK SVOBODA,DOLENJSKA C. - STRELIŠČE,ROŠKA - STRELIŠKA,LEK - VEROVŠKOVA,VOKA - SLOVENČEVA,SUPERNOVA LJUBLJANA - RUDNIK
0,2022-08-04 23:24:00,6.0,8.0,0.0,0.0,0.0,8.0,5.0,1.0,1.0,...,5.0,3.0,2.0,3.0,5.0,2.0,4.0,0.0,4.0,8.0
20,2022-08-05 00:24:00,4.0,7.0,1.0,1.0,0.0,5.0,4.0,1.0,1.0,...,6.0,4.0,3.0,4.0,8.0,2.0,4.0,1.0,5.0,8.0
1,2022-08-08 04:58:00,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,11.0,13.0,5.0,8.0,19.0,6.0,6.0,2.0,9.0,0.0
21,2022-08-08 05:58:00,1.0,0.0,0.0,2.0,0.0,0.0,1.0,1.0,2.0,...,12.0,13.0,5.0,8.0,19.0,6.0,6.0,2.0,9.0,0.0
2,2022-08-11 21:13:00,6.0,5.0,5.0,12.0,18.0,6.0,3.0,4.0,2.0,...,15.0,12.0,1.0,3.0,7.0,7.0,2.0,7.0,3.0,4.0
22,2022-08-11 22:13:00,4.0,4.0,3.0,8.0,14.0,4.0,3.0,4.0,2.0,...,15.0,12.0,1.0,4.0,8.0,7.0,3.0,6.0,4.0,4.0
3,2022-08-16 07:10:00,2.0,3.0,6.0,6.0,1.0,1.0,4.0,5.0,3.0,...,13.0,8.0,8.0,11.0,16.0,0.0,12.0,4.0,5.0,6.0
23,2022-08-16 08:10:00,4.0,6.0,8.0,9.0,3.0,2.0,5.0,8.0,4.0,...,12.0,6.0,9.0,9.0,14.0,0.0,11.0,6.0,3.0,6.0
4,2022-08-18 17:30:00,4.0,2.0,4.0,5.0,5.0,6.0,9.0,3.0,19.0,...,8.0,7.0,5.0,1.0,10.0,7.0,2.0,0.0,4.0,4.0
24,2022-08-18 18:27:00,6.0,4.0,5.0,8.0,5.0,6.0,8.0,5.0,17.0,...,8.0,7.0,5.0,2.0,10.0,8.0,2.0,1.0,5.0,4.0
