# LightGBM

Dokumentacja: https://lightgbm.readthedocs.io/en/stable/

Zbiór danych: https://www.kaggle.com/datasets/kukuroo3/flight-price-predict-competition-format

Cel: Przewidzenie ceny biletu lotniczego. Taki model może być przydatny w kilku sytuacjach. Na przykład przy tworzeniu aplikacji, w której użytkownicy będą sprawdzać, jaka powinna być cena biletu lub kiedy go kupić, aby cena była najkorzystniejsza. Może być też użyty przez linie lotnicze do ustalania cen na podstawie rekomendacji modelu lub prawdopodobnych cen konkurencji.

In [2]:
# instalacja
#pip install lightgbm

In [1]:
import pandas as pd
import os
import lightgbm as lgb
from itertools import product
from sklearn.metrics import r2_score

In [2]:
# Move the working directory one level up: to the main folder,
# out of "2_machine_learning".
os.chdir('../')

In [3]:
# Run this cell only if
# you are executing the file from the solutions folder
# while the data frame lives in the "data" folder.
# NOTE(review): the preceding cell also calls os.chdir('../'); running both
# moves the working directory up two levels - confirm that is intended.
import os 
os.chdir('../')

In [4]:
# Load the train/test feature and target CSV files (paths are relative to
# the current working directory). The files are read in the order listed.
train_x, train_y, test_x, test_y = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/flights_prices/X_train.csv',
        'data/flights_prices/y_train.csv',
        'data/flights_prices/X_test.csv',
        'data/flights_prices/y_test.csv',
    )
)

In [4]:
# Preprocessing of the training features.
# Binary flags: 1 when the airline is 'Vistara' / when the stops value is 'zero'.
train_x['is_vistara'] = train_x['airline'].eq('Vistara').astype(int)
train_x['is_zero_stops'] = train_x['stops'].eq('zero').astype(int)
# Fold the 'Late_Night' departure slot into 'Night'.
train_x['departure_time'] = train_x['departure_time'].replace('Late_Night', 'Night')
# One-hot encode the time/city columns as 0/1 integers; the first level of
# each column is dropped (drop_first=True).
train_x = pd.get_dummies(
    data=train_x,
    columns=['departure_time', 'source_city', 'arrival_time', 'destination_city'],
    drop_first=True,
    dtype=int,
)


In [5]:
# Preprocessing of the test features (same steps as for the training set).
# NOTE(review): the test set is one-hot encoded independently of the training
# set, so the dummy columns match only if both splits contain the same
# category levels - confirm for this dataset.
test_x['is_vistara'] = test_x['airline'].eq('Vistara').astype(int)
test_x['is_zero_stops'] = test_x['stops'].eq('zero').astype(int)
# Fold the 'Late_Night' departure slot into 'Night'.
test_x['departure_time'] = test_x['departure_time'].replace('Late_Night', 'Night')
# One-hot encode the time/city columns as 0/1 integers; the first level of
# each column is dropped (drop_first=True).
test_x = pd.get_dummies(
    data=test_x,
    columns=['departure_time', 'source_city', 'arrival_time', 'destination_city'],
    drop_first=True,
    dtype=int,
)

## Modelowanie funkcją LGBMRegressor

In [6]:
# Numerical and categorical feature names fed to the models.
# The order matters: models receive cat_features + numerical_features.
numerical_features = ['duration', 'days_left']
cat_features = (
    # binary flags
    ['is_vistara', 'is_zero_stops']
    # departure-time dummies
    + [
        'departure_time_Early_Morning',
        'departure_time_Evening',
        'departure_time_Morning',
        'departure_time_Night',
    ]
    # source-city dummies
    + [
        'source_city_Chennai',
        'source_city_Delhi',
        'source_city_Hyderabad',
        'source_city_Kolkata',
        'source_city_Mumbai',
    ]
    # arrival-time dummies
    + [
        'arrival_time_Early_Morning',
        'arrival_time_Evening',
        'arrival_time_Late_Night',
        'arrival_time_Morning',
        'arrival_time_Night',
    ]
    # destination-city dummies
    + [
        'destination_city_Chennai',
        'destination_city_Delhi',
        'destination_city_Hyderabad',
        'destination_city_Kolkata',
        'destination_city_Mumbai',
    ]
)

In [None]:
# Model estimation: fit an LGBMRegressor with default hyperparameters on the
# selected features, declaring the dummy/flag columns as categorical.
model_1 = lgb.LGBMRegressor().fit(
    X=train_x[[*cat_features, *numerical_features]],
    y=train_y['price'],
    categorical_feature=cat_features,
)

In [None]:
# Feature importances of the fitted model_1
model_1.feature_importances_


In [None]:
# Names of the input features seen by model_1 during fit
model_1.feature_names_in_


In [None]:
# Predictions of model_1 on the training features (displayed as cell output).
pred_train = model_1.predict(X=train_x[[*cat_features, *numerical_features]])
pred_train

## Modelowanie funkcją train

In [15]:
# Wrap the training data in lgb.Dataset (author's note: this optimises the
# computations; the standard approach with pandas frames can be used as well).
# free_raw_data=False asks LightGBM not to free the raw data after the
# internal Dataset has been constructed.
train_lgb = lgb.Dataset(
    data=train_x[[*cat_features, *numerical_features]],
    label=train_y['price'],
    categorical_feature=cat_features,
    free_raw_data=False,
)

In [None]:
# Model estimation with the functional API; an empty params dict means
# LightGBM's default parameters are used.
model_2 = lgb.train(
    params={},
    train_set=train_lgb,
)

In [None]:
# Methods and attributes of the trained booster: feature importance
model_2.feature_importance()

In [None]:
# Feature names used by the trained booster
model_2.feature_name()

### Optymalizacja hiperparametrów i kroswalidacja

Zoptymalizujemy model ze względu na parametry objective, learning_rate oraz max_depth (maksymalną głębokość drzewa).

In [21]:
# Hyperparameter grids explored in the search below.
# Objective functions:
obj = [
    'regression',
    'regression_l1',
    'huber',
    'mape',
    'tweedie',
]
# Learning rates:
lr = [0.01, 0.3, 0.8]
# Maximum tree depths:
max_depth = [3, 10, 15]

In [23]:
# Evaluation function
def lgb_r2(preds, eval_data):
    """Custom LightGBM evaluation function reporting the R^2 score.

    Args:
        preds: Predictions passed in by LightGBM.
        eval_data: Evaluation dataset; its labels are read with ``get_label()``.

    Returns:
        Tuple ``('r2', score, True)``: the metric name, its value, and a flag
        telling LightGBM that a higher value is better.
    """
    score = r2_score(eval_data.get_label(), preds)
    return 'r2', score, True

In [None]:
# Grid search over the Cartesian product of objectives, learning rates and
# maximum depths. Three parallel lists are filled, one entry per combination:
#   r2     - mean validation R^2 taken from the last entry of the CV history
#   params - the [objective, learning_rate, max_depth] combination
#   models - the full lgb.cv result (including the CV boosters)
r2 = []
params = []
models = []
for o, l, m in product(obj, lr, max_depth):
    print(f'create model for {o}, lr: {l}, max depth: {m}')
    cv_params = {
        'objective': o,
        'learning_rate': l,
        'max_depth': m,
        'num_leaves': 100,
        'n_estimators': 200,
    }
    # 3-fold, non-stratified cross-validation scored with the custom R^2 metric
    cv_result = lgb.cv(
        params=cv_params,
        train_set=train_lgb,
        nfold=3,
        stratified=False,
        feval=lgb_r2,
        return_cvbooster=True,
    )
    r2.append(cv_result['valid r2-mean'][-1])
    params.append([o, l, m])
    models.append(cv_result)

In [None]:
# Find the best model: print the highest mean validation R^2 in the grid
print(max(r2))

In [27]:
# Pick the lgb.cv result whose score is the maximum of r2
# (r2 and models are filled in parallel, so they share indices).
model_final = models[r2.index(max(r2))]

In [None]:
# Best parameters: [objective, learning_rate, max_depth] of the top-scoring run
params[r2.index(max(r2))]

In [None]:
# Display the full lgb.cv result of the selected configuration
model_final

In [None]:
# CV booster object (present because lgb.cv was called with return_cvbooster=True)
model_final['cvbooster']

In [None]:
# The individual cross-validation models held by the CV booster
model_final['cvbooster'].boosters

In [32]:
# Predictions: one column per cross-validation booster, for both the
# training and the test features.
train_pred = pd.DataFrame()
test_pred = pd.DataFrame()
for i, booster in enumerate(model_final['cvbooster'].boosters):
    # Select the columns in the order the booster reports for its features
    booster_features = booster.feature_name()
    train_pred[f'pred_cv_{i}'] = booster.predict(train_x[booster_features])
    test_pred[f'pred_cv_{i}'] = booster.predict(test_x[booster_features])

In [33]:
# Display the per-booster predictions on the training set
train_pred

Unnamed: 0,pred_cv_0,pred_cv_1,pred_cv_2
0,69208.289515,69259.137899,66773.936838
1,52391.045490,52281.602145,52449.758523
2,64801.014723,62778.532570,63884.708055
3,49173.766594,51376.623041,53391.314664
4,52082.311820,44751.486064,49032.182149
...,...,...,...
5693,61467.985888,58014.931168,61037.387167
5694,59663.373066,60748.255038,58942.717578
5695,51766.791679,51699.694693,51236.734207
5696,44338.389878,46511.177904,50258.328529


In [34]:
# Final prediction: row-wise mean over the columns of each prediction frame
# (i.e. the average of the per-booster predictions).
train_pred['pred'] = train_pred.mean(axis='columns')
test_pred['pred'] = test_pred.mean(axis='columns')

In [35]:
# Evaluation: R^2 of the averaged prediction against the test-set prices
r2_score(y_true=test_y['price'], y_pred=test_pred['pred'])

0.7060464571993972