# Imports

In [None]:
#Import Data Exploration Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Show every column when a DataFrame is displayed (None removes the column cap)
pd.options.display.max_columns = None

# Data Gathering

In [None]:
# Relative path to the CSV that the following cell reads with pd.read_csv
file = "../Data/Capstone_Data.csv"

In [None]:
# Load the CSV at `file` into the working DataFrame used by the rest of the notebook
oil_df = pd.read_csv(file)

In [None]:
# Preview the first rows of the raw data
oil_df.head()

# Explore Data Set

In [None]:
# Print the column names, dtypes and non-null counts of the raw data
oil_df.info()

In [None]:
# Summary statistics of the raw data (pandas describes numeric columns by default)
oil_df.describe()

In [None]:
# Draw one histogram per column of the raw frame, each on its own figure
for col in oil_df.columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(oil_df[col])
    ax.set_title(col, fontsize=16, fontweight="bold")
    ax.set_ylabel("Count")
    ax.set_xlabel(col)
    plt.show()

# Data Cleaning

In [None]:
#One Hot Encoding
# Expand the Quarter column into one indicator column per distinct value and
# append those columns to the right of the original frame.
dummies = pd.get_dummies(oil_df["Quarter"])
oil_encode_df = pd.concat([oil_df, dummies], axis=1)
oil_encode_df.head()

In [None]:
#Scaling
from sklearn.preprocessing import MinMaxScaler

# Everything except the target (US_Rigs) and the raw Quarter column is scaled
features = oil_encode_df.drop(columns=["US_Rigs", "Quarter"])

# Learn each column's min/max and rescale in a single step
x_scaler = MinMaxScaler()
features_scaled = x_scaler.fit_transform(features)

# Rebuild a labelled frame from the scaled array, then re-attach the unscaled target
df_scaled = pd.DataFrame(features_scaled, columns=features.columns)
df_scaled["US_Rigs"] = oil_encode_df["US_Rigs"]
df_scaled.head()

# Re-Inspect Data

In [None]:
# Column names, dtypes and non-null counts after one-hot encoding
# (this is the encoded frame, not the scaled df_scaled)
oil_encode_df.info()

In [None]:
# Summary statistics of the encoded (unscaled) frame
oil_encode_df.describe()

In [None]:
# Draw one histogram per column of the scaled frame, each on its own figure
for col in df_scaled.columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(df_scaled[col])
    ax.set_title(col, fontsize=16, fontweight="bold")
    ax.set_ylabel("Count")
    ax.set_xlabel(col)
    plt.show()

# Correlations

In [None]:
# Pairwise correlation matrix of the scaled features and the US_Rigs target
df_scaled.corr()

In [None]:
# Heatmap of the pairwise correlations of the scaled frame.
# The original referenced `final_df`, which is never defined in this notebook;
# df_scaled is the frame whose correlations are tabulated in the previous cell.
plt.figure(figsize=(18,12))
sns.heatmap(df_scaled.corr())
plt.show()

# Prep for Machine Learning Analysis

In [None]:
#Get Highly Correlated Features
# Absolute correlation of every column with the target, sorted ascending
corrs = df_scaled.corr()["US_Rigs"].abs().sort_values()

# Keep the columns whose |correlation| exceeds 0.05, in ascending order.
# Boolean indexing replaces Series.iteritems(), which was removed in pandas 2.0.
# NaN correlations compare False and are dropped, exactly as before.
# The target itself (correlation 1.0 with itself) stays in the list, as in the
# original loop.
predictive_cols = corrs[corrs > .05].index.tolist()

predictive_cols

In [None]:
#Create our feature and target sets
# Predictor columns, in the order they are fed to the models.
# NOTE(review): this looks like a hand-copied snapshot of `predictive_cols`
# without the target - confirm it is still in sync with the data.
feature_cols = [
    "Mexico_S", "Other_S", "US_S", "Europe_D",
    "Soviet_D", "US_D", "Europe_Rigs", "Canada_S",
    "Japan_D", "China_D", "Other_S.1", "Asia_D",
    "Africa_Rigs", "Total_World_S", "Total_World_D", "Year",
    "China_S", "Middle_East_Rigs", "Other_D", "Canada_D",
    "North_Sea_S", "Canada_Rigs", "OPEC_Crude_Oil_Portion", "Soviet_S",
    "Total_OPEC", "OPEC_S", "OPEC_Non_Crude_Portion", "Total_Intl_Rigs",
    "Asia_Pacific_Rigs", "Brent_Crude_Price", "Latin_America_Rigs",
]

# Column the models are trained to predict
target_col = "US_Rigs"

# Split the scaled frame into the predictor matrix and the target series
features = df_scaled.loc[:, feature_cols]
target = df_scaled.loc[:, target_col]

In [None]:
#Train Test Split
from sklearn.model_selection import train_test_split

# Plain NumPy arrays for the estimators
X, y = features.to_numpy(), target.to_numpy()

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Machine Learning

In [None]:
#Imports
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import r2_score, mean_squared_error

# Linear Models

In [None]:
#Initialize Linear Regression Model
# fit() returns the estimator itself, so construction and training are chained
reg = LinearRegression().fit(X_train, y_train)

# Predictions on the training rows (in sample) and the held-out rows (out sample)
in_preds, out_preds = reg.predict(X_train), reg.predict(X_test)

# Report R2 and MSE for both splits
print("Model Evaluation Report")
print(f"The In Sample R2 Score: {r2_score(y_train, in_preds)}")
print(f"The In Sample MSE: {mean_squared_error(y_train, in_preds)}")
print()
print(f"The Out Sample R2 Score: {r2_score(y_test, out_preds)}")
print(f"The Out Sample MSE: {mean_squared_error(y_test, out_preds)}")

In [None]:
#Initialize Ridge Model
# Ridge regression with scikit-learn's default settings; fit() returns the estimator
ridge = Ridge().fit(X_train, y_train)

# Predictions on the training rows (in sample) and the held-out rows (out sample)
in_preds, out_preds = ridge.predict(X_train), ridge.predict(X_test)

# Report R2 and MSE for both splits
print("Model Evaluation Report")
print(f"The In Sample R2 Score: {r2_score(y_train, in_preds)}")
print(f"The In Sample MSE: {mean_squared_error(y_train, in_preds)}")
print()
print(f"The Out Sample R2 Score: {r2_score(y_test, out_preds)}")
print(f"The Out Sample MSE: {mean_squared_error(y_test, out_preds)}")

In [None]:
#Initialize Lasso Model
# Lasso regression with scikit-learn's default settings; fit() returns the estimator
lasso = Lasso().fit(X_train, y_train)

# Predictions on the training rows (in sample) and the held-out rows (out sample)
in_preds, out_preds = lasso.predict(X_train), lasso.predict(X_test)

# Report R2 and MSE for both splits
print("Model Evaluation Report")
print(f"The In Sample R2 Score: {r2_score(y_train, in_preds)}")
print(f"The In Sample MSE: {mean_squared_error(y_train, in_preds)}")
print()
print(f"The Out Sample R2 Score: {r2_score(y_test, out_preds)}")
print(f"The Out Sample MSE: {mean_squared_error(y_test, out_preds)}")

In [None]:
#Initialize ElasticNet Model
# ElasticNet regression with scikit-learn's default settings; fit() returns the estimator
en = ElasticNet().fit(X_train, y_train)

# Predictions on the training rows (in sample) and the held-out rows (out sample)
in_preds, out_preds = en.predict(X_train), en.predict(X_test)

# Report R2 and MSE for both splits
print("Model Evaluation Report")
print(f"The In Sample R2 Score: {r2_score(y_train, in_preds)}")
print(f"The In Sample MSE: {mean_squared_error(y_train, in_preds)}")
print()
print(f"The Out Sample R2 Score: {r2_score(y_test, out_preds)}")
print(f"The Out Sample MSE: {mean_squared_error(y_test, out_preds)}")

# Tree Models

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
#Initialize Decision Tree Model
# Seeded so the report is reproducible, matching the seeded train/test split
dt = DecisionTreeRegressor(random_state=42)

# fit
dt.fit(X_train, y_train)

# predict on the training rows (in sample) and the held-out rows (out sample)
in_preds = dt.predict(X_train)
out_preds = dt.predict(X_test)

#evaluate
# Both splits report RMSE so the in/out-of-sample errors are directly
# comparable (the out-of-sample line previously printed MSE).
print("Model Evaluation Report")
print(f"The In Sample R2 Score: {r2_score(y_train, in_preds)}")
print(f"The In Sample RMSE: {np.sqrt(mean_squared_error(y_train, in_preds))}")
print()
print(f"The Out Sample R2 Score: {r2_score(y_test, out_preds)}")
print(f"The Out Sample RMSE: {np.sqrt(mean_squared_error(y_test, out_preds))}")

# make Plot
# Scatter of held-out predictions against actuals; the plotted y = x line marks
# a perfect prediction.
plt.figure(figsize=(10,6))
plt.scatter(y_test, out_preds)
plt.plot(y_test, y_test)
plt.xlabel("Actual")
plt.ylabel("Predictions")
plt.title("Actual vs Predicted", fontsize=18, fontweight="bold")
plt.show()

In [None]:
#Initialize Random Forest Model
# Seeded so the report is reproducible, matching the seeded train/test split
rf = RandomForestRegressor(random_state=42)

# fit
rf.fit(X_train, y_train)

# predict on the training rows (in sample) and the held-out rows (out sample)
in_preds = rf.predict(X_train)
out_preds = rf.predict(X_test)

#evaluate
# Both splits report RMSE so the in/out-of-sample errors are directly
# comparable (the out-of-sample line previously printed MSE).
print("Model Evaluation Report")
print(f"The In Sample R2 Score: {r2_score(y_train, in_preds)}")
print(f"The In Sample RMSE: {np.sqrt(mean_squared_error(y_train, in_preds))}")
print()
print(f"The Out Sample R2 Score: {r2_score(y_test, out_preds)}")
print(f"The Out Sample RMSE: {np.sqrt(mean_squared_error(y_test, out_preds))}")

# make Plot
# Scatter of held-out predictions against actuals; the plotted y = x line marks
# a perfect prediction.
plt.figure(figsize=(10,6))
plt.scatter(y_test, out_preds)
plt.plot(y_test, y_test)
plt.xlabel("Actual")
plt.ylabel("Predictions")
plt.title("Actual vs Predicted", fontsize=18, fontweight="bold")
plt.show()

# Andrey, this is where we need to pick up the code again!

# Ada Boost Regressor

In [None]:
#initialize
# Seeded so the report is reproducible, matching the seeded train/test split
ada = AdaBoostRegressor(random_state=42)

# fit
ada.fit(X_train, y_train)

# predict on the training rows (in sample) and the held-out rows (out sample)
in_preds = ada.predict(X_train)
out_preds = ada.predict(X_test)

#evaluate
# Both splits report RMSE so the in/out-of-sample errors are directly
# comparable (the out-of-sample line previously printed MSE).
print("Model Evaluation Report")
print(f"The In Sample R2 Score: {r2_score(y_train, in_preds)}")
print(f"The In Sample RMSE: {np.sqrt(mean_squared_error(y_train, in_preds))}")
print()
print(f"The Out Sample R2 Score: {r2_score(y_test, out_preds)}")
print(f"The Out Sample RMSE: {np.sqrt(mean_squared_error(y_test, out_preds))}")

# make Plot
# Scatter of held-out predictions against actuals; the plotted y = x line marks
# a perfect prediction.
plt.figure(figsize=(10,6))
plt.scatter(y_test, out_preds)
plt.plot(y_test, y_test)
plt.xlabel("Actual")
plt.ylabel("Predictions")
plt.title("Actual vs Predicted", fontsize=18, fontweight="bold")
plt.show()

# Gradient Boost Regressor

In [None]:
#initialize
# Seeded so the report is reproducible, matching the seeded train/test split
gb = GradientBoostingRegressor(random_state=42)

# fit
gb.fit(X_train, y_train)

# predict on the training rows (in sample) and the held-out rows (out sample)
in_preds = gb.predict(X_train)
out_preds = gb.predict(X_test)

#evaluate
# Both splits report RMSE so the in/out-of-sample errors are directly
# comparable (the out-of-sample line previously printed MSE).
print("Model Evaluation Report")
print(f"The In Sample R2 Score: {r2_score(y_train, in_preds)}")
print(f"The In Sample RMSE: {np.sqrt(mean_squared_error(y_train, in_preds))}")
print()
print(f"The Out Sample R2 Score: {r2_score(y_test, out_preds)}")
print(f"The Out Sample RMSE: {np.sqrt(mean_squared_error(y_test, out_preds))}")

# make Plot
# Scatter of held-out predictions against actuals; the plotted y = x line marks
# a perfect prediction.
plt.figure(figsize=(10,6))
plt.scatter(y_test, out_preds)
plt.plot(y_test, y_test)
plt.xlabel("Actual")
plt.ylabel("Predictions")
plt.title("Actual vs Predicted", fontsize=18, fontweight="bold")
plt.show()

In [None]:
from sklearn import neighbors
from sklearn.metrics import mean_squared_error 
from math import sqrt

In [None]:
# Held-out RMSE of a k-nearest-neighbours regressor for each k from 1 to 20
rmse_val = []
for K in range(1, 21):
    # train a regressor that averages over the K nearest training rows
    model = neighbors.KNeighborsRegressor(n_neighbors=K)
    model.fit(X_train, y_train)

    # score it on the held-out rows
    pred = model.predict(X_test)
    error = sqrt(mean_squared_error(y_test, pred))

    # record and report the error for this K
    rmse_val.append(error)
    print('RMSE value for k= ' , K , 'is:', error)

# KNN Model

In [None]:
# initialize
# US_Rigs is treated as a regression target by every other model in this
# notebook, so the KNN model is a regressor too. The original cell used
# KNeighborsClassifier / classification_report / confusion_matrix, none of
# which are imported until the final cell, and which do not fit a regression
# target. `neighbors` is the sklearn module imported for the k sweep above.
knn = neighbors.KNeighborsRegressor(n_neighbors=20)

# fit
knn = knn.fit(X_train, y_train)

# predictions on the training rows (in sample) and the held-out rows (out sample)
in_preds = knn.predict(X_train)
out_preds = knn.predict(X_test)

# evaluation: R2 and RMSE on both splits
print("Model Evaluation Report")
print(f"The In Sample R2 Score: {r2_score(y_train, in_preds)}")
print(f"The In Sample RMSE: {np.sqrt(mean_squared_error(y_train, in_preds))}")

print()
print(f"The Out Sample R2 Score: {r2_score(y_test, out_preds)}")
print(f"The Out Sample RMSE: {np.sqrt(mean_squared_error(y_test, out_preds))}")

# SVM Model

In [None]:
# initialize
# US_Rigs is treated as a regression target by every other model in this
# notebook, so the support-vector model is a regressor (SVR) rather than the
# classifier SVC. The original cell used SVC / classification_report /
# confusion_matrix before any of them were imported.
from sklearn.svm import SVR

svm = SVR()

# fit
svm = svm.fit(X_train, y_train)

# predictions on the training rows (in sample) and the held-out rows (out sample)
in_preds = svm.predict(X_train)
out_preds = svm.predict(X_test)

# evaluation: R2 and RMSE on both splits
print("Model Evaluation Report")
print(f"The In Sample R2 Score: {r2_score(y_train, in_preds)}")
print(f"The In Sample RMSE: {np.sqrt(mean_squared_error(y_train, in_preds))}")

print()
print(f"The Out Sample R2 Score: {r2_score(y_test, out_preds)}")
print(f"The Out Sample RMSE: {np.sqrt(mean_squared_error(y_test, out_preds))}")

In [None]:
#Imports
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# from sklearn.metrics import plot_roc_curve