In [2]:
import pandas as pd
from sklearn.svm import SVR
from sklearn.preprocessing import OneHotEncoder

In [3]:
# Load the flights dataset from a CSV file in the working directory.
df = pd.read_csv('flights_encoded.csv')

In [4]:
# Keep only rows whose arrival delay lies strictly between -56 and 213.
df = df.loc[df['arr_delay'].gt(-56) & df['arr_delay'].lt(213)]

In [5]:
# Discard columns that are not used as model inputs.
df = df.drop(columns=['origin_city_name', 'dest_city_name', 'year'])

In [6]:
def categorize(df, column):
    """Cast one column of ``df`` to the pandas ``category`` dtype.

    The frame is modified in place: ``df[column]`` is overwritten with its
    categorical version.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to convert.
    column : str
        Label of the column to convert.

    Returns
    -------
    pandas.Series
        The converted column, read back from ``df``.
    """
    as_category = df[column].astype('category')
    df[column] = as_category
    return df[column]

In [7]:
# Convert every categorical predictor to the pandas category dtype,
# in the same order as before.
for _col in (
    'scheduled_flight_hour_of_day',
    'day_of_the_week',
    'origin_conditions',
    'dest_conditions',
    'holiday',
    'month',
):
    df[_col] = categorize(df, _col)

In [8]:
# Target column, read from the frame before the categorical columns are split off.
y = df['arr_delay']

# Categorical predictors that get one-hot encoded.
col_names = ['scheduled_flight_hour_of_day', 'day_of_the_week', 'origin_conditions',
             'dest_conditions', 'holiday', 'month']
cat_vars_final = df.select_dtypes(['object', 'category'])
cat_vars_final = cat_vars_final.loc[:, col_names]

# One Hot Encoder
enc = OneHotEncoder().fit(cat_vars_final)

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out`; fall back to the old name only on releases
# that predate it, so the cell runs on both old and current installs.
if hasattr(enc, 'get_feature_names_out'):
    ohe_columns = enc.get_feature_names_out(cat_vars_final.columns.tolist())
else:
    ohe_columns = enc.get_feature_names(cat_vars_final.columns.tolist())

# Dense one-hot matrix wrapped in a frame that shares the source row index,
# so later column assignments from `df` line up row by row.
cat_vars_ohe_final = enc.transform(cat_vars_final).toarray()
cat_vars_ohe_final = pd.DataFrame(cat_vars_ohe_final, index=cat_vars_final.index,
                                  columns=ohe_columns)

In [9]:
# The categorical columns have been one-hot encoded; remove the originals from df.
df = df.drop(columns=col_names)

In [10]:
# Add the remaining columns of `df` (including the target `arr_delay`) to the one-hot frame.
# NOTE(review): the whole of `df` is assigned to these six labels, so this appears to rely on
# `df` holding exactly these six columns, in this order, after the earlier drops — confirm.
cat_vars_ohe_final[['distance','origin_wind_speed','origin_visibility','dest_wind_speed','dest_visibility','arr_delay']] = df

In [11]:
# Separate the target from the predictors.
y = cat_vars_ohe_final['arr_delay']
X = cat_vars_ohe_final.drop(columns=['arr_delay'])

In [12]:
# Display the dimensions of the predictor frame as (rows, columns).
X.shape

(203016, 57)

In [13]:
# Display the length of the target; its row count should match X above.
y.shape

(203016,)

In [14]:
from sklearn.model_selection import train_test_split
# Split predictors and target into train / test partitions with a fixed seed.
# NOTE(review): test_size=0.7 holds out 70% of the rows for testing and trains
# on the remaining 30% — confirm this ratio is intentional.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.7,
    random_state=5,
)

In [15]:
from sklearn.preprocessing import MinMaxScaler
# Fit the scaler on the training rows only, then apply that same mapping
# (target range -1..1) to both partitions.
scaling = MinMaxScaler(feature_range=(-1, 1))
scaling.fit(X_train)
X_train, X_test = scaling.transform(X_train), scaling.transform(X_test)

In [16]:
# Sanity check: the training predictors and the training target must have the
# same number of rows. Compare against y_train (not the unsplit y), and test
# for equality so that True means the partitions are consistent.
X_train.shape[0] == y_train.shape[0]

True

In [17]:
from sklearn.linear_model import Lasso
# Lasso regressor constructed with no arguments, i.e. the library's default settings.
lassoreg = Lasso()

In [18]:
# Train the Lasso model on the scaled training partition.
lassoreg.fit(X=X_train, y=y_train)

Lasso()

In [19]:
# Predict arrival delay for the held-out rows with the fitted Lasso model.
y_pred = lassoreg.predict(X=X_test)

In [20]:
from sklearn.metrics import r2_score
# R^2 of the Lasso predictions on the held-out partition.
r_2_score = r2_score(y_true=y_test, y_pred=y_pred)
print(r_2_score)

0.014969907477738409


In [27]:
from sklearn.linear_model import Ridge

In [28]:
# Ridge regressor constructed with no arguments, i.e. the library's default settings.
ridgereg = Ridge()

In [30]:
# Train the Ridge model on the scaled training partition.
ridgereg.fit(X=X_train, y=y_train)

Ridge()

In [31]:
# Predict arrival delay for the held-out rows with the fitted Ridge model.
y_pred = ridgereg.predict(X=X_test)

In [32]:
from sklearn.metrics import r2_score
# R^2 of the Ridge predictions on the held-out partition.
r_2_score = r2_score(y_true=y_test, y_pred=y_pred)
print(r_2_score)

0.0675358455069367


In [34]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over regularisation strength, intercept handling and solver
# for a Ridge regressor, fitted on the training partition.
ridge_reg = Ridge()
params_Ridge = {
    'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 0],
    "fit_intercept": [True, False],
    "solver": ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
}
Ridge_GS = GridSearchCV(estimator=ridge_reg, param_grid=params_Ridge)
Ridge_GS.fit(X=X_train, y=y_train)

# Show the winning parameter combination.
Ridge_GS.best_params_

{'alpha': 1, 'fit_intercept': True, 'solver': 'lsqr'}

In [None]:
# The grid search was already fitted in the cell that built Ridge_GS, and no
# `refit` argument was passed there, so GridSearchCV's default applies and
# best_estimator_ is already trained on the full training partition.
# Re-running Ridge_GS.fit here would repeat the entire search without
# affecting best_model, so it is omitted.
best_model = Ridge_GS.best_estimator_
best_model

In [None]:
# Predict arrival delay for the held-out rows with the tuned Ridge model.
y_pred = best_model.predict(X=X_test)

In [None]:
from sklearn.metrics import r2_score
# R^2 of the tuned Ridge predictions on the held-out partition.
r_2_score = r2_score(y_true=y_test, y_pred=y_pred)
print(r_2_score)