In [1]:
import datetime, warnings
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Lasso,LinearRegression,Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,BaggingRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

# Wczytanie danych 

In [16]:
flights=pd.read_csv("flights_explored.csv")

# Przygodowanie modeli

In [4]:
Las = Lasso()
LinR = LinearRegression()
Rid = Ridge()
Rfc = RandomForestRegressor(random_state=2)
Dtc = DecisionTreeRegressor(random_state = 2)
Boost_Lin = AdaBoostRegressor(base_estimator=LinR,random_state=2)
Boost_las = AdaBoostRegressor(base_estimator=Las,random_state=2)
Boost_rid = AdaBoostRegressor(base_estimator=Rid,random_state=2)
Bg_Lin = BaggingRegressor(base_estimator=LinR,random_state=2)
Bg_las = BaggingRegressor(base_estimator=Las,random_state=2)
Bg_rid = BaggingRegressor(base_estimator=Rid,random_state=2)

# Przygotowanie danych do modeli

In [17]:
le = LabelEncoder()

In [18]:
flights['AIRLINE']= le.fit_transform(flights['AIRLINE'])
flights['ORIGIN_AIRPORT'] = le.fit_transform(flights['ORIGIN_AIRPORT'])
flights['DESTINATION_AIRPORT'] = le.fit_transform(flights['DESTINATION_AIRPORT'])
#flights['DAY'] = le.fit_transform(flights['DAY'])

In [7]:
flights.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3026542 entries, 0 to 3026541
Data columns (total 17 columns):
MONTH                  int64
DAY_OF_WEEK            int64
TAIL_NUMBER            object
ORIGIN_AIRPORT         int64
DESTINATION_AIRPORT    int64
SCHEDULED_DEPARTURE    object
DEPARTURE_TIME         object
DEPARTURE_DELAY        float64
SCHEDULED_TIME         float64
ELAPSED_TIME           float64
AIR_TIME               float64
DISTANCE               int64
SCHEDULED_ARRIVAL      object
ARRIVAL_TIME           object
ARRIVAL_DELAY          float64
AIRLINE                int64
DELAYED                bool
dtypes: bool(1), float64(5), int64(6), object(5)
memory usage: 372.3+ MB


In [19]:
flights = flights.drop(['SCHEDULED_DEPARTURE','SCHEDULED_ARRIVAL'], axis = 1)

In [9]:
flights.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3026542 entries, 0 to 3026541
Data columns (total 15 columns):
MONTH                  int64
DAY_OF_WEEK            int64
TAIL_NUMBER            object
ORIGIN_AIRPORT         int64
DESTINATION_AIRPORT    int64
DEPARTURE_TIME         object
DEPARTURE_DELAY        float64
SCHEDULED_TIME         float64
ELAPSED_TIME           float64
AIR_TIME               float64
DISTANCE               int64
ARRIVAL_TIME           object
ARRIVAL_DELAY          float64
AIRLINE                int64
DELAYED                bool
dtypes: bool(1), float64(5), int64(6), object(3)
memory usage: 326.2+ MB


## Dane do trenowania modeli bez uwzgłędniania DEPARTURE_DELAY

In [20]:
X = flights.drop(['ARRIVAL_DELAY','TAIL_NUMBER','DEPARTURE_TIME','ARRIVAL_TIME'],axis = 1)
X.head()

Unnamed: 0,MONTH,DAY_OF_WEEK,ORIGIN_AIRPORT,DESTINATION_AIRPORT,DEPARTURE_DELAY,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,AIRLINE,DELAYED
0,1,4,313,571,-11.0,205.0,194.0,169.0,1448,0,False
1,1,4,573,311,-1.0,235.0,215.0,199.0,1448,0,False
2,1,4,313,571,-4.0,204.0,194.0,173.0,1448,0,False
3,1,4,313,530,-4.0,215.0,201.0,187.0,1542,0,False
4,1,4,404,571,-8.0,213.0,218.0,186.0,1533,0,False


In [21]:
y = flights['ARRIVAL_DELAY']
y.head()

0   -22.0
1   -21.0
2   -14.0
3   -18.0
4    -3.0
Name: ARRIVAL_DELAY, dtype: float64

In [22]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state = 2)

In [23]:
sc1=StandardScaler()
X_train_sc=sc1.fit_transform(X_train)
X_test_sc=sc1.transform(X_test)

## Dane do końcowego testu wybranego modelu uwzgłędniając DEPARTURE_DELAY

In [None]:
X_end = flights.drop(['ARRIVAL_DELAY','DEPARTURE_DELAY','TAIL_NUMBER','DEPARTURE_TIME','ARRIVAL_TIME'],axis = 1)
y_end = flights['ARRIVAL_DELAY']
X_train_end,X_test_end,y_train_end,y_test_end = train_test_split(X,y,test_size=0.3,random_state = 2)

In [None]:
X_train_sc_end=sc1.fit_transform(X_train)
X_test_sc_end=sc1.transform(X_test)

# Podstawowe regressory

In [None]:
for model, name in zip([Las,LinR,Rid,Dtc,Rfc], 
     ['Lasso','Linear Regression','Ridge','Random forest Regressor','Decision Tree Regressor']):
    model1 = model.fit(X_train_sc,y_train)
    Y_predict=model1.predict(X_test_sc)
    print(name)
    print('Mean Absolute Error:', mean_absolute_error(y_test, Y_predict))  
    print('Mean Squared Error:', mean_squared_error(y_test, Y_predict))  
    print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, Y_predict)))
    print('R2 : ',r2_score(y_test, Y_predict))
    print()


# Boosting

In [14]:
for model, name in zip([Boost_Lin,Boost_las,Boost_rid], 
     ['Boosted Linear','Boosted Lasso','Boosted Ridge']):
    model1 = model.fit(X_train_sc,y_train)
    Y_predict=model1.predict(X_test_sc)
    print(name)
    print('Mean Absolute Error:', mean_absolute_error(y_test, Y_predict))  
    print('Mean Squared Error:', mean_squared_error(y_test, Y_predict))  
    print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, Y_predict)))
    print('R2 : ',r2_score(y_test, Y_predict))
    print()

Lasso
Mean Absolute Error: 15.995184895038294
Mean Squared Error: 1152.186532701256
Root Mean Squared Error: 33.94387327193607
R2 :  0.2983182998897207

Linear Regression
Mean Absolute Error: 15.283811713379887
Mean Squared Error: 1127.2523119569842
Root Mean Squared Error: 33.574578358588276
R2 :  0.3135032425237465

Ridge
Mean Absolute Error: 15.28382683932877
Mean Squared Error: 1127.2522844988994
Root Mean Squared Error: 33.57457794967644
R2 :  0.31350325924572253

Random forest Regressor
Mean Absolute Error: 18.265640560243092
Mean Squared Error: 2187.7717014092723
Root Mean Squared Error: 46.77362185472996
R2 :  -0.3323531592571123





Decision Tree Regressor
Mean Absolute Error: 14.640164665955812
Mean Squared Error: 1190.8317930937176
Root Mean Squared Error: 34.50843075385662
R2 :  0.27478333289977264

Boosted Linear
Mean Absolute Error: 93.64773674741276
Mean Squared Error: 13977.838442142132
Root Mean Squared Error: 118.22790889693572
R2 :  -7.512504844987777



KeyboardInterrupt: 

# Bagging

In [1]:
for model, name in zip([Bg_Lin,Bg_las,Bg_rid], 
     ['Bagged Linear','Bagged Lasso','Bagged Ridge']):
    model1 = model.fit(X_train_sc,y_train)
    Y_predict=model1.predict(X_test_sc)
    print(name)
    print('Mean Absolute Error:', mean_absolute_error(y_test, Y_predict))  
    print('Mean Squared Error:', mean_squared_error(y_test, Y_predict))  
    print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, Y_predict)))
    print('R2 : ',r2_score(y_test, Y_predict))
    print()


NameError: name 'Bg_Lin' is not defined

# Końcowy wybór modelu regresyjnego

## Testowanie dla danych nie zawierających 

In [None]:
model="model"
nazwa='nazwa modelu'
model1 = model.fit(X_train_sc,y_train)
Y_predict=model1.predict(X_test_sc)
print(name)
print('Mean Absolute Error:', mean_absolute_error(y_test, Y_predict))  
print('Mean Squared Error:', mean_squared_error(y_test, Y_predict))  
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, Y_predict)))
print('R2 : ',r2_score(y_test, Y_predict))
print()
