In [None]:
# %matplotlib widget
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay, precision_recall_curve, PrecisionRecallDisplay, classification_report
from sklearn.svm import SVC, SVR
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
import seaborn as sns

from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, precision_score, classification_report

### Reading the solar-potential dataset


In [None]:
# Load the raw solar-potential table, keep only the rows whose Generation does
# not exceed 4000, and discard the OBJECTID column.
# NOTE(review): the 4000 cut-off looks like an outlier filter — confirm intent.
read_full = (
    pd.read_csv("solar-potential.csv")
    .loc[lambda frame: frame["Generation"] <= 4000]
    .drop(columns="OBJECTID")
)

# Sanity check: the first rows, followed by the non-null count of each column.
print(read_full.head(), read_full.count(), sep="\n")


### Histogram of Relevant Attributes

In [None]:
# One 200-bin histogram per attribute, drawn by pandas on a shared figure.
hist_columns = ["Insolation", "Area", "Generation"]
read_full.hist(column=hist_columns, bins=200)
plt.show()

### Splitting data for different models 

In [None]:
# One seed shared by every split in this cell, so that "Restart & Run All"
# reproduces the same train/test partitions (and therefore the same scores).
RANDOM_STATE = 42

# --- Classification task: predict the "Suitability" label from Insolation and Area.
X = read_full[["Insolation", "Area"]].copy()
y = read_full["Suitability"].copy()
train_x, test_x, train_y, test_y = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# --- Regression task 1: predict Generation from Insolation and Area.
# The features are left unscaled here; the model pipeline fitted later on
# X_gen_train contains its own StandardScaler.
X_gen = read_full[["Insolation", "Area"]].copy()
y_gen = read_full["Generation"].copy()
X_gen_train, X_gen_test, y_gen_train, y_gen_test = train_test_split(
    X_gen, y_gen, test_size=0.2, random_state=RANDOM_STATE
)

# --- Regression task 2: predict Generation per unit Area from location and insolation.
buff = read_full["Insolation"].rename("Insolation/m2")
print(buff.head())

# Resulting feature columns, in order: "Lat", "Longitude", "Insolation/m2".
X_corr = pd.concat([read_full["Latitude"], read_full["Longitude"], buff], axis=1)
X_corr = X_corr.rename(columns={"Latitude": "Lat"})
print(X_corr.head())

y_corr = read_full["Generation"] / read_full["Area"]

# scaleX_corr is kept as a named object because anything trained on X_corr
# only ever sees standardised values; new data must be passed through this
# same scaler before it is given to such a model.
# NOTE(review): the scaler is fitted on all rows before the split, so the test
# rows contribute to the scaling statistics.
scaleX_corr = StandardScaler().fit(X_corr)
X_corr = scaleX_corr.transform(X_corr)
X_corr_train, X_corr_test, y_corr_train, y_corr_test = train_test_split(
    X_corr, y_corr, test_size=0.2, random_state=RANDOM_STATE
)



### Box Plot for Relevant Attributes

In [None]:
# One small figure (and one axes) per attribute so each box plot gets its own
# scale.
fig1 = plt.figure(figsize=(5, 3))
fig2 = plt.figure(figsize=(5, 3))
fig3 = plt.figure(figsize=(5, 3))

ax1 = fig1.add_subplot()
ax2 = fig2.add_subplot()
ax3 = fig3.add_subplot()

# Filled, notched, horizontal boxes. Real booleans are used for `notch` and
# `vert`: the string 'True' only worked because any non-empty string is truthy
# (the string 'False' would have enabled notches as well).
box_style = dict(patch_artist=True, notch=True, vert=False)
bp1 = ax1.boxplot(read_full["Area"], **box_style)
bp2 = ax2.boxplot(read_full["Insolation"], **box_style)
bp3 = ax3.boxplot(read_full["Generation"], **box_style)

ax1.set_title('Area')
ax2.set_title('Insolation')
ax3.set_title('Generation')

plt.show()

### Scatter Plot for Insolation vs Area

In [None]:
# Large canvas: one point per row, Insolation on x and Area on y.
fig, ax = plt.subplots(figsize=(18, 10))

insolation_values = read_full["Insolation"]
area_values = read_full["Area"]
ax.scatter(insolation_values, area_values)

ax.set_xlabel("Insolation")
ax.set_ylabel("Area")

plt.show()

### Code for Random Forest Classifier

In [None]:
# Random forest of 50 trees. max_samples=0.7 limits the bootstrap sample drawn
# for each tree to 70% of the training rows; max_features="sqrt" limits the
# number of features considered at each split. random_state pins the
# bootstrap/feature sampling so the reported metrics are reproducible.
rfc = RandomForestClassifier(
    n_estimators=50,
    max_features="sqrt",
    max_samples=0.7,
    random_state=42,
)

# Fit on the training split and predict the held-out split.
rfc.fit(train_x, train_y)
pred2 = rfc.predict(test_x)

# Performance report (arguments given in the documented y_true, y_pred order).
print(f"Accuracy Score of Random Forest Classifier: {accuracy_score(test_y, pred2)*100}%")

cm = confusion_matrix(test_y, pred2, labels=rfc.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=rfc.classes_)
disp.plot()

# `labels` is passed explicitly so the report always has one row per class the
# model knows about; without it, a class missing from this test split makes
# the three target_names mismatch the inferred labels and raises ValueError.
# NOTE(review): target_names are assigned positionally to rfc.classes_ (sorted
# label order) — confirm that order really is suitable / well / excellent.
print(
    classification_report(
        test_y,
        pred2,
        labels=rfc.classes_,
        target_names=["suitable", "well suitable", "excellent suitable"],
    )
)

### Visualisation of an Individual Tree in RFC

In [None]:
from sklearn import tree

# Single-panel, high-resolution canvas so the node text stays legible.
fig, axes = plt.subplots(figsize=(6, 6), dpi=800)

# Draw the first fitted tree of the forest, colouring nodes by majority class.
# NOTE(review): class_names are matched positionally to the classifier's
# classes in ascending order — confirm the order is correct for this dataset.
tree.plot_tree(
    rfc.estimators_[0],
    feature_names=["Insolation", "Area"],
    class_names=["suitable", "well suitable", "excellent suitable"],
    filled=True,
)

### Code for First Linear Regression Model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Ordinary least-squares regression of Generation on (Insolation, Area).
# The variable keeps its existing name `logmodel`; it is a linear, not a
# logistic, model.
logmodel = LinearRegression()

# Standardise the features inside the pipeline so the scaler is fitted on the
# training split only and is re-applied automatically at predict time.
pipe = make_pipeline(StandardScaler(), logmodel)
pipe.fit(X_gen_train, y_gen_train)

# For a regressor, .score() returns the coefficient of determination (R^2),
# not an accuracy, so it is labelled accordingly.
print(f"R^2 score: {pipe.score(X_gen_test, y_gen_test)}")

### Code to Display the Regression Plane

In [None]:
# Kept from the original cell; older Matplotlib releases need this import for
# projection='3d' to be available.
from mpl_toolkits.mplot3d import Axes3D

# 10 x 10 grid spanning the observed Insolation and Area ranges.
insolation_axis = np.linspace(read_full.Insolation.min(), read_full.Insolation.max(), 10)
area_axis = np.linspace(read_full.Area.min(), read_full.Area.max(), 10)
x_surf, y_surf = np.meshgrid(insolation_axis, area_axis)

# Evaluate the fitted pipeline on every grid node.
onlyX = pd.DataFrame({'Insolation': x_surf.ravel(), 'Area': y_surf.ravel()})
fittedY = np.array(pipe.predict(onlyX))

# Report the observed range of each plotted variable.
for column_name in ("Insolation", "Area", "Generation"):
    column_values = read_full[column_name]
    print(f"{column_name} max:{column_values.max()}/min: {column_values.min()}")

# Held-out points (red) against the model's predicted surface (blue).
fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(
    X_gen_test['Insolation'],
    X_gen_test['Area'],
    y_gen_test,
    c='red',
    marker='o',
    alpha=0.5,
)
ax.plot_surface(x_surf, y_surf, fittedY.reshape(x_surf.shape), color='b', alpha=0.3)
ax.set(xlabel='Insolation', ylabel='Area', zlabel='Generation')
plt.show()

### Code for Second Linear Regression Model

In [None]:
# Linear regression of Generation per unit Area on (latitude, longitude,
# insolation).
logmodel = LinearRegression()

# X_corr_train has already been standardised by scaleX_corr, so the scaler in
# this pipeline is close to a no-op.
# NOTE(review): because of that, raw (unscaled) inputs must go through
# scaleX_corr before being handed to pipe2.predict.
pipe2 = make_pipeline(StandardScaler(), logmodel)
pipe2.fit(X_corr_train, y_corr_train)

# For a regressor, .score() returns the coefficient of determination (R^2),
# not an accuracy, so it is labelled accordingly.
print(f"R^2 score: {pipe2.score(X_corr_test, y_corr_test)}")

### Predict generation per unit area for UK locations from latitude, longitude and insolation, and export the predictions with their coordinates to CSV

In [None]:
# Load the UK locations and keep the three model inputs in the same order as
# the training features (latitude, longitude, insolation).
# NOTE(review): assumes the CSV's Insolation uses the same units as the
# training data's Insolation column — confirm.
pred_pd = pd.read_csv("Uk_Insolation_data.csv")
pred_pd = pred_pd[["Lat","Long","Insolation"]]
print(pred_pd.head())

# pipe2 was trained on features that had already been standardised by
# scaleX_corr, so new data must go through that same scaler first; passing
# the raw values straight to pipe2 would evaluate the model on a different
# scale from the one it was trained on. The columns are renamed to the names
# scaleX_corr was fitted with ("Lat", "Longitude", "Insolation/m2") so that
# scikit-learn's feature-name check passes and confirms the column order.
uk_features = pred_pd.rename(columns={"Long": "Longitude", "Insolation": "Insolation/m2"})
pred = pipe2.predict(scaleX_corr.transform(uk_features))
print(pred)

# The prediction is Generation per unit Area (the target pipe2 was fitted on).
# It is aligned on the input index so the concat below pairs every prediction
# with its own coordinates.
output = pd.DataFrame(pred, index=pred_pd.index)
pred_pd = pred_pd.drop('Insolation', axis=1)
output = pd.concat([pred_pd, output], axis=1)
output.to_csv("Uk_generation_output.csv")