In [4]:
import pandas as pd
import numpy as np
from api_local.ai_service import price_pred
from api_local.stats_service import city2city_info, state2state_info
from api_local.utils_service import load_data, query_type
import warnings

# NOTE(review): with no category or module argument this silences every
# warning for the rest of the kernel session, so warnings raised by the
# pandas/numpy code in the cells below are hidden as well.
warnings.filterwarnings("ignore")

In [5]:
class Validation:
    """Score the price-prediction models on newly inserted rows.

    On construction the class reads an "old" and a "new" CSV file, keeps the
    rows of the new file whose ``inserted_at`` is later than the newest
    ``inserted_at`` of the old file, predicts a price for each remaining row
    and stores the enriched frame in ``self.results``.

    Parameters
    ----------
    old_path : str, default 'base_data_0.csv'
        CSV with the previously known rows.
    new_path : str, default 'new_data_0.csv'
        CSV with the candidate rows to validate.
    """

    # Model prefixes, in the order in which price_pred's return value is unpacked.
    _MODELS = ('lr', 'svr', 'rnn', 'ensamble', 'new')

    # Keys read from the dictionaries returned by city2city_info / state2state_info.
    _STAT_KEYS = (
        'mean_price', 'range_min', 'range_max', 'count_segments',
        'price_mille_median', 'last_trip', 'last_price',
        'mean_price_ten', 'price_mille_ten',
    )

    def __init__(self, old_path='base_data_0.csv', new_path='new_data_0.csv'):
        df_old = pd.read_csv(old_path)
        df_new = pd.read_csv(new_path)

        # Convert to datetime so the timestamps compare chronologically
        df_old['inserted_at'] = pd.to_datetime(df_old['inserted_at'])
        df_new['inserted_at'] = pd.to_datetime(df_new['inserted_at'])

        df_new = self.__new_instances(df_old, df_new)
        self.results = self.__predict_price(df_new)

    @staticmethod
    def _relative_diff(predicted, actual):
        """Return ``(predicted - actual) / actual``, using 0 where ``actual == 0``."""
        return np.where(actual == 0, 0, (predicted - actual) / actual)

    def __new_instances(self, df_old, df_new):
        """Return the rows of ``df_new`` inserted after the newest row of ``df_old``.

        Prints a short summary of the selection. The result is an independent
        copy, so columns can be added to it without touching ``df_new``.
        """
        last_old = df_old['inserted_at'].max()
        # .copy() detaches the filtered rows from df_new; __predict_price
        # assigns new columns to the frame returned here.
        df_new = df_new[df_new['inserted_at'] > last_old].copy()

        print(f"Last update in old dataset: {last_old}")
        print(f"Last update in new dataset: {df_new['inserted_at'].max()}")
        print(f"First update in new dataset: {df_new['inserted_at'].min()}")
        print(f"New instances in new dataset: {df_new.shape[0]}")

        s2s_route_new = df_new['s2s_route'].unique()
        print(f"Unique s2s_route in new dataset: {len(s2s_route_new)}")
        return df_new

    def __rmse(self, data):
        """Return a copy of ``data`` with per-model error columns added.

        For every model prefix ``m`` this (re)computes:

        * ``m_diff`` - relative difference between ``m_price`` and
          ``price_usd``; 0 where ``price_usd`` is 0, so a zero price never
          produces inf/NaN.
        * ``m_rmse`` - per-row absolute error ``|m_price - price_usd|``. This
          is the square root of the squared error of a single row, not an
          RMSE aggregated over rows; the column name is kept so existing
          consumers of the frame keep working.
        """
        results = data.copy()
        # Same column order as before: all *_diff columns first, then all *_rmse.
        model_order = ('lr', 'svr', 'rnn', 'new', 'ensamble')

        # Relative difference between the predicted and the original price
        for model in model_order:
            results[f'{model}_diff'] = self._relative_diff(
                results[f'{model}_price'], results['price_usd']
            )

        # Per-row absolute error (square root of the row's squared error)
        for model in model_order:
            results[f'{model}_rmse'] = (
                results[f'{model}_price'] - results['price_usd']
            ).abs()

        return results

    def __predict_price(self, df_new):
        """Predict a price for every row of ``df_new`` and attach error metrics.

        Adds the model predictions, the city-to-city and state-to-state
        statistics and the relative differences to ``df_new`` in place, then
        returns the frame produced by ``__rmse`` (a copy with the ``*_rmse``
        columns added). An empty ``df_new`` yields an empty frame with the
        same output columns and no call to the prediction services.
        """
        output_columns = (
            [f'{model}_price' for model in self._MODELS]
            + [f'c2c_{key}' for key in self._STAT_KEYS]
            + [f's2s_{key}' for key in self._STAT_KEYS]
        )

        if df_new.empty:
            # Nothing to predict: still expose the full output schema.
            for column in output_columns:
                df_new[column] = np.nan
        else:
            # load_data is called with constant arguments, so load once instead
            # of once per row. NOTE(review): assumes query_type and the *_info
            # helpers do not modify segment_data - confirm.
            segment_data, _ = load_data(flag=0, broker_id=0)

            def predict_row(row):
                route_s2s = row['s2s_route']
                route_c2c = row['c2c_route']
                print(f"Predicting price for route {route_s2s}")

                query = query_type(route_c2c, route_s2s, segment_data)

                lr_price, svr_price, rnn_price, ensamble_price, new_price = price_pred(
                    broker_id=0,
                    route_s2s=route_s2s,
                    query=query,
                    distance=row['distance'],
                    month=row['month']
                )

                if query == 'city2city':
                    c2c_stats = city2city_info(route_c2c, row['distance'], segment_data)
                else:
                    # No city-to-city lookup for this query: fill its stats with zeros.
                    c2c_stats = dict.fromkeys(self._STAT_KEYS, 0)
                s2s_stats = state2state_info(route_s2s, row['distance'], segment_data)

                return pd.Series(
                    [lr_price, svr_price, rnn_price, ensamble_price, new_price]
                    + [c2c_stats[key] for key in self._STAT_KEYS]
                    + [s2s_stats[key] for key in self._STAT_KEYS]
                )

            # Apply the function to every row; values map to columns by position
            df_new[output_columns] = df_new.apply(predict_row, axis=1)

        # Relative differences, guarded against division by zero
        for model in self._MODELS:
            df_new[f'{model}_diff'] = self._relative_diff(
                df_new[f'{model}_price'], df_new['price_usd']
            )

        df_new['s2s_price_diff'] = self._relative_diff(
            df_new['s2s_mean_price'], df_new['price_usd']
        )

        return self.__rmse(df_new)

In [6]:
# Instantiating Validation runs the whole pipeline in its constructor: it reads
# both CSV files, keeps the newly inserted rows and predicts a price for each.
val = Validation()

Last update in old dataset: 2025-01-16 22:32:45
Last update in new dataset: 2025-02-26 14:34:10
First update in new dataset: 2025-01-17 01:50:52
New instances in new dataset: 340
Unique s2s_route in new dataset: 37
Predicting price for route nuevo leon - texas


FileNotFoundError: [Errno 2] No such file or directory: '../api_local/segments_data.csv'

In [None]:
# Preview the first rows of the validation results.
val.results.head()

In [5]:
#val.results.to_csv('validation_results.csv',index=False)

In [6]:
# Work on a copy so that changes made to df do not affect val.results.
df = val.results.copy()

In [None]:
# For every model, report the share of rows whose relative error is above
# +ERROR_THRESHOLD, below -ERROR_THRESHOLD, or in between.
N = df.shape[0]
if N == 0:
    # Fail with a clear message instead of a ZeroDivisionError in the loop below.
    raise ValueError("No validation results to summarise: df is empty")

# Tolerance on the relative difference (0.1 means 10 % of the real price).
ERROR_THRESHOLD = 0.1

models = ['lr_diff', 'svr_diff', 'rnn_diff','ensamble_diff', 'new_diff']
labels = ['Linear Regression', 'Support Vector Machine', 'Recurrent Neural Network', 'Ensamble Model','RANSAC Regression']

for model,label in zip(models,labels):
    print('============================================')
    print(label)

    # Rows where the model over-estimates the price by more than the tolerance
    up_data = df[df[model] > ERROR_THRESHOLD]
    value1 = up_data.shape[0]/N
    print(f'Porcentaje de rutas con error mayor a {ERROR_THRESHOLD}: {round(value1*100,2)}%')

    # Rows where the model under-estimates the price by more than the tolerance
    down_data = df[df[model] < -ERROR_THRESHOLD]
    value2 = down_data.shape[0]/N
    print(f'Porcentaje de rutas con error menor a -{ERROR_THRESHOLD}: {round(value2*100,2)}%')

    # Remainder of the two shares above; rows whose diff is NaN match neither
    # comparison and are therefore counted here as well.
    print(f'Porcentaje de rutas con error dentro del rango: {round(100-(value1*100+value2*100),2)}%')

In [None]:
import pandas as pd
import numpy as np
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Tukey HSD: pairwise comparison of the mean relative difference of each model.
results = df[['price_usd','lr_diff', 'svr_diff', 'rnn_diff', 'ensamble_diff', 'new_diff', 's2s_price_diff']]

# Only the relative-difference columns are compared. price_usd stays available
# in `results`, but it is an absolute price, not a model's relative difference:
# melting it in as one more group would put values on a different scale into
# the same test and distort every pairwise comparison.
diff_columns = ['lr_diff', 'svr_diff', 'rnn_diff', 'ensamble_diff', 'new_diff', 's2s_price_diff']

long_df = pd.melt(
    results,
    value_vars=diff_columns,
    var_name='model',   # column holding the group / model name
    value_name='diff'   # column holding the relative difference
)

# Drop NaN / inf differences so that only finite values are passed to the test.
long_df = long_df[np.isfinite(long_df['diff'].astype(float))]

tukey = pairwise_tukeyhsd(
    endog=long_df['diff'],   # quantitative values being compared (differences)
    groups=long_df['model'], # grouping factor (model: lr, svr, etc.)
    alpha=0.05               # significance level
)

print(tukey)

