In [22]:
import pandas as pd
from sklearn.svm import SVR
from sklearn.preprocessing import OneHotEncoder

In [23]:
# Load the pre-encoded flights table from the working directory.
df = pd.read_csv(filepath_or_buffer='flights_encoded.csv')

In [24]:
# Discard the columns that are not used as model inputs.
df = df.drop(columns=['origin_city_name', 'dest_city_name', 'year'])

In [25]:
def categorize(df, column):
    """Convert one column of ``df`` to pandas ``category`` dtype.

    The conversion is written back into ``df`` (the frame is modified in
    place) and the converted column is also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to convert.
    column : str
        Label of the column to convert.

    Returns
    -------
    pandas.Series
        The converted column, read back from ``df``.
    """
    as_category = df[column].astype('category')
    df[column] = as_category
    return df[column]

In [26]:
# Cast every discrete feature to the pandas 'category' dtype, one column at a time.
for _cat_col in (
    'scheduled_flight_hour_of_day',
    'day_of_the_week',
    'origin_conditions',
    'dest_conditions',
    'holiday',
    'month',
):
    df[_cat_col] = categorize(df, _cat_col)

In [27]:
# Target: the 'arr_delay' column.
y = df['arr_delay']

# Categorical features that get one-hot encoded.
col_names = ['scheduled_flight_hour_of_day', 'day_of_the_week', 'origin_conditions',
             'dest_conditions', 'holiday', 'month']
cat_vars_final = df.loc[:, col_names]

# One Hot Encoder
enc = OneHotEncoder().fit(cat_vars_final)

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out` is its replacement. Fall back to the old method only
# when running on a release that predates the new one.
_input_features = cat_vars_final.columns.tolist()
if hasattr(enc, 'get_feature_names_out'):
    _ohe_columns = enc.get_feature_names_out(_input_features)
else:
    _ohe_columns = enc.get_feature_names(_input_features)

# The encoder returns a sparse matrix; densify it and keep the original row
# index so the encoded frame lines up with `df`.
cat_vars_ohe_final = enc.transform(cat_vars_final).toarray()
cat_vars_ohe_final = pd.DataFrame(cat_vars_ohe_final,
                                  index=cat_vars_final.index,
                                  columns=_ohe_columns)

In [28]:
# The categorical columns now live in cat_vars_ohe_final; remove them from df.
df = df.drop(columns=col_names)

In [29]:
# Show the columns that remain in df after col_names was dropped.
df

Unnamed: 0,distance,origin_wind_speed,origin_visibility,dest_wind_speed,dest_visibility,arr_delay
0,1620.0,14.8,9.9,12.3,9.1,-36.0
1,641.0,15.0,9.9,12.3,9.1,-21.0
2,1235.0,15.0,9.9,5.6,4.2,-14.0
3,1199.0,12.3,9.1,14.6,9.9,-23.0
4,1750.0,5.6,4.2,12.7,9.9,-18.0
...,...,...,...,...,...,...
204671,641.0,11.8,9.9,6.1,9.9,-15.0
204672,601.0,25.3,7.9,15.3,8.7,37.0
204673,544.0,19.3,9.6,13.2,9.9,-28.0
204674,612.0,15.3,8.7,25.3,7.9,-8.0


In [30]:
# Append the remaining columns of df (numeric features plus the target) to the
# one-hot encoded frame. Labels are listed in the same order as df's columns.
_numeric_cols = [
    'distance',
    'origin_wind_speed',
    'origin_visibility',
    'dest_wind_speed',
    'dest_visibility',
    'arr_delay',
]
cat_vars_ohe_final[_numeric_cols] = df

In [31]:
# Separate the target column from the feature matrix.
y = cat_vars_ohe_final['arr_delay']
X = cat_vars_ohe_final.drop(columns=['arr_delay'])

In [33]:
from sklearn.model_selection import train_test_split

# NOTE(review): test_size=0.7 holds out 70% of the rows for testing and trains
# on the remaining 30% — confirm this ratio is intentional.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.7,
    random_state=5,
)

In [34]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature to [-1, 1]. The scaler is fitted on the training split
# only and then applied unchanged to the test split.
scaling = MinMaxScaler(feature_range=(-1, 1))
X_train = scaling.fit_transform(X_train)
X_test = scaling.transform(X_test)

In [36]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter search space for the SVR, split per kernel so that each
# parameter is only varied for kernels that actually use it:
#   * 'degree' only affects the 'poly' kernel,
#   * 'gamma' affects 'rbf', 'poly' and 'sigmoid' but not 'linear'.
# A single flat dict would cross every value with every kernel and refit many
# functionally identical models (128 candidates instead of 68).
_C_VALUES = [0.1, 1, 10, 100]
_GAMMA_VALUES = [1, 0.1, 0.01, 0.001]
param_grid = [
    {'kernel': ['linear'], 'C': _C_VALUES},
    {'kernel': ['rbf', 'sigmoid'], 'C': _C_VALUES, 'gamma': _GAMMA_VALUES},
    {'kernel': ['poly'], 'C': _C_VALUES, 'gamma': _GAMMA_VALUES, 'degree': [3, 8]},
]
svr = SVR()

In [None]:
# Build a 5-fold cross-validated grid search over param_grid for the SVR.
svr_reg = GridSearchCV(estimator=svr, param_grid=param_grid, cv=5)

# Run the search on the training split.
svr_reg.fit(X_train, y_train)

# Report the winning estimator.
print(svr_reg.best_estimator_)

In [None]:
# Best model found by the grid search.
# GridSearchCV is used with its default refit=True, so best_estimator_ has
# already been refitted on the whole training split; fitting it again would
# only repeat an expensive SVR fit and yield the same model.
best_model = svr_reg.best_estimator_
best_model

In [None]:
# Predict the target for the held-out (scaled) test split.
y_pred = best_model.predict(X=X_test)

In [None]:
from sklearn.metrics import r2_score

# Coefficient of determination of the predictions on the test split.
r_2_score = r2_score(y_true=y_test, y_pred=y_pred)
print(r_2_score)