# Predicción de demanda con SVR

In [26]:
# Load Libraries
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import pandas as pd 

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
import timeit

# Lectura de datos

- **ant**: antelación hasta el vuelo.

- **ruta**: trayecto de ida y vuelta ordenado alfabéticamente.
        
- **aeropuerto_origen**: aeropuerto donde despega el avión.

- **aeropuerto_destino**: aeropuerto donde aterriza el avión.
    
- **fecha_salida**: fecha de despegue del avión.
    
- **num_vuelo_operador**: identificador de número de vuelo.

- **month**: mes de vuelo.

- **weekday**: día de la semana de vuelo.
    
- **timezone**: franja horaria de vuelo.
    
- **year**: año de vuelo.
    
- **capacidad**: número máximo de pax en cada vuelo, puede variar en cada vuelo.
    
- **demand**: billetes vendidos en cada vuelo.
    
- **first**: código del principal competidor de la ruta del vuelo.
    
- **second**: código del segundo competidor de la ruta del vuelo.
    
- **first_ratio**: 
    
- **second_ratio**:

- **hit**:

- **first_weight**: peso de importancia del primer competidor en la ruta del vuelo respecto a I2.

- **second_weight**: peso de importancia del segundo competidor en la ruta del vuelo respecto a I2.

- **global_first_weight**: 

- **global_second_weight**: 

- **first_p**:

- **second_p**:

In [2]:
# Load the dataset 
datos = pd.read_csv("datasets/datos_pred_demanda.csv", sep=';', decimal=',')
datos.head()

Unnamed: 0,ant,ruta,aeropuerto_origen,aeropuerto_destino,fecha_salida,num_vuelo_operador,month,weekday,timezone,year,...,second,first_ratio,second_ratio,hit,first_weight,second_weight,global_first_ratio,global_second_ratio,first_p,second_p
0,11,ACEMAD,ACE,MAD,2017-02-02,3857,2,Thursday,Mediodia,2017,...,UX,3.14034,0.0,2,0.0,0.0,2.520623,0.0,0.623683,0.0
1,11,ACEMAD,ACE,MAD,2017-02-03,3857,2,Friday,Mediodia,2017,...,UX,3.61,0.0,2,0.0,0.0,2.210075,0.0,0.846071,0.0
2,11,ACEMAD,ACE,MAD,2017-02-04,3857,2,Saturday,Mediodia,2017,...,UX,0.0,3.252252,1,0.0,1.0,0.0,1.595346,0.0,0.981601
3,11,ACEMAD,ACE,MAD,2017-02-05,3857,2,Sunday,Mediodia,2017,...,UX,3.094286,3.252252,2,0.0,1.0,0.0,1.485467,0.658558,0.955942
4,11,ACEMAD,ACE,MAD,2017-02-06,3857,2,Monday,Mediodia,2017,...,UX,2.022857,0.0,2,0.0,0.0,2.648854,0.0,0.387667,0.0


In [3]:
datos.shape

(943794, 24)

In [4]:
datos["ruta"].unique()

array(['ACEMAD', 'LGWMAD', 'MADPMI', 'MADSCQ', 'MADTXL'], dtype=object)

In [5]:
datos = datos[datos.ant == 11]
datos.shape

(34242, 24)

In [6]:
datos = datos.sort_values(by=["fecha_salida"], ascending=True)

In [7]:
del datos["num_vuelo_operador"]

In [8]:
datos.dtypes

ant                      int64
ruta                    object
aeropuerto_origen       object
aeropuerto_destino      object
fecha_salida            object
month                    int64
weekday                 object
timezone                object
year                     int64
capacidad                int64
demand                   int64
nombre_blackout         object
first                   object
second                  object
first_ratio            float64
second_ratio           float64
hit                      int64
first_weight           float64
second_weight          float64
global_first_ratio     float64
global_second_ratio    float64
first_p                float64
second_p               float64
dtype: object

In [14]:
categorical_vars = ['ruta', 'aeropuerto_origen', 'aeropuerto_destino', 'month', 'weekday', 'year', 
                    'nombre_blackout', 'first', 'second', 'timezone']

categorical_vars

['ruta',
 'aeropuerto_origen',
 'aeropuerto_destino',
 'month',
 'weekday',
 'year',
 'nombre_blackout',
 'first',
 'second',
 'timezone']

In [15]:
numerical_vars = list(set(datos.columns) - set(categorical_vars))
numerical_vars

['global_first_ratio',
 'ant',
 'global_second_ratio',
 'fecha_salida',
 'hit',
 'second_ratio',
 'second_p',
 'demand',
 'capacidad',
 'first_weight',
 'first_p',
 'first_ratio',
 'second_weight']

In [16]:
from sklearn.preprocessing import OneHotEncoder

In [17]:
ohe = OneHotEncoder(sparse = False)
ohe_fit = ohe.fit(datos[categorical_vars])
X_ohe = pd.DataFrame(ohe.fit_transform(datos[categorical_vars]))
X_ohe.columns = pd.DataFrame(ohe_fit.get_feature_names())

In [20]:
X_ohe.head()

Unnamed: 0,"(x0_ACEMAD,)","(x0_LGWMAD,)","(x0_MADPMI,)","(x0_MADSCQ,)","(x0_MADTXL,)","(x1_ACE,)","(x1_LGW,)","(x1_MAD,)","(x1_PMI,)","(x1_SCQ,)",...,"(x8_D8,)","(x8_UNKNOWN,)","(x8_UX,)","(x9_Manana,)","(x9_Matutina,)","(x9_Mediodia,)","(x9_Noche,)","(x9_Nocturna,)","(x9_Tarde,)","(x9_Tarde_Noche,)"
0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [21]:
X = pd.concat((X_ohe, datos[numerical_vars].reset_index()), axis=1)

In [22]:
X.shape

(34242, 73)

In [24]:
X.columns[X.dtypes == object]

Index(['fecha_salida'], dtype='object')

In [25]:
fecha_salida_values = X['fecha_salida']
y = X['demand']
del X['fecha_salida']

In [27]:
sca = StandardScaler()
X_sca = sca.fit_transform(X)

In [30]:
datos = X_sca
X['fecha_salida'] = fecha_salida_values
X['demand'] = y

In [31]:
perc_values = [0.7, 0.15, 0.15];

In [32]:
y = X['demand']
X = X.drop('demand', axis=1)

In [33]:
from sklearn.model_selection import train_test_split

In [34]:
X_train_rand, X_valtest_rand, y_train_rand, y_valtest_rand = train_test_split(X, y, test_size=perc_values[1] + perc_values[2], random_state=1);

X_val_rand, X_test_rand, y_val_rand, y_test_rand = train_test_split(X_valtest_rand, y_valtest_rand, test_size= perc_values[2] / (perc_values[1] + perc_values[2]), random_state=1)

In [35]:
# dimensiones de los conjuntos de train y test
n_train = int(X.shape[0] * perc_values[0])
n_val = int(X.shape[0] * perc_values[1])
n_test = int(X.shape[0] * perc_values[2])

# selección del conjunto de train
X_train = X.iloc[:n_train]
y_train = y.iloc[:n_train]

# selección del conjunto de validación
X_val = X.iloc[(n_train):(n_train+n_val)]
y_val = y.iloc[(n_train):(n_train+n_val)]

# selección del conjunto de test
X_test = X.iloc[(n_train+n_val):]
y_test = y.iloc[(n_train+n_val):]

In [37]:
del X_train['fecha_salida']
del X_val['fecha_salida']
del X_test['fecha_salida']

In [38]:
from sklearn import svm

In [39]:
model = svm.SVR(kernel='rbf')

In [41]:
model.fit(X_train, y_train)

SVR()

In [42]:
pred_train = model.predict(X_train)
pred_val = model.predict(X_val)
pred_test = model.predict(X_test)

In [43]:
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse

In [44]:
mae_train = mae(y_train, pred_train);
mae_val = mae(y_val, pred_val);
mae_test = mae(y_test, pred_test);

mse_train = mse(y_train, pred_train);
mse_val = mse(y_val, pred_val);
mse_test = mse(y_test, pred_test);

In [45]:
print('Train: MAE = ' + str(mae_train) + ' - MSE = '  + str(mse_train))
print('Validation: MAE = ' + str(mae_val) + ' - MSE = '  + str(mse_val))
print('Test: MAE = ' + str(mae_test) + ' - MSE = '  + str(mse_test))

Train: MAE = 25.36409235071978 - MSE = 1075.8039281150598
Validation: MAE = 22.522499203986698 - MSE = 830.6228464762571
Test: MAE = 26.829238245390673 - MSE = 1216.9320245629806


In [46]:
def mape(y_true, y_pred):
    return np.mean(np.abs(y_true - y_pred)/y_pred) * 100

In [47]:
print('Train: MAPE = ' + str(mape(y_train, pred_train)))
print('Validation: MAPE = ' + str(mape(y_val, pred_val)))
print('Test: MAPE = ' + str(mape(y_test, pred_test)))

Train: MAPE = 18.98355363519623
Validation: MAPE = 16.885448452897815
Test: MAPE = 20.13704125640554


In [48]:
param_grid = [
  {'C': [0.1, 1], 'gamma': [0.01],'epsilon': [1],  'kernel': ['rbf']}
 ]