# Tehtävä 4: koneoppimisen ja syväoppimisen toteutus 

Tässä dokumentissa sovelletaan koneoppimisen ja syväoppimisen malleja aikaisemmin käsiteltyyn dataan. Tarkoituksena on ennustaa järjestelmien hintaa käyttäen regressiomenetelmiä.

In [69]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# import plotly.express as px
# from pandas_profiling import ProfileReport
# from sklearn.impute import KNNImputer
import seaborn as sns
# Load the previously prepared dataset from the project's data directory.
df = pd.read_csv("../data/base.csv", low_memory=False)

# Notebook display settings: show every column and render floats through str().
pd.set_option("display.max_columns", None)
pd.set_option("display.float_format", str)
# Alternative float rendering (fixed three decimals), currently disabled:
# pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [70]:
# Reduce the installation date to its year only, because the scalers used
# later do not work with datetime values.
# The dtype guard makes this cell safe to re-run: once the column already
# holds year numbers, pd.to_datetime would read them as nanosecond offsets
# from the Unix epoch and every row would silently turn into 1970.
if not pd.api.types.is_numeric_dtype(df["installation_date"]):
    df["installation_date"] = pd.to_datetime(df["installation_date"]).dt.year

### Luetaan aikaisemmin käsitelty dataframe ja muodostetaan siitä erilaiset versiot

In [71]:
# Boolean row masks for the two segmentation flags.
is_residential = df['residential'] == 1
is_non_residential = df['residential'] == 0
has_battery = df['battery'] == 1
no_battery = df['battery'] == 0

# The flag columns are constant inside each subset, so they are dropped.
# Subsets without a battery additionally lose the battery capacity column.
flag_columns = ['battery', 'residential']
no_battery_columns = ['battery', 'battery_rated_capacity_kW', 'residential']

# Residential systems: without battery / with battery.
res = df[is_residential & no_battery].drop(columns=no_battery_columns)
res_bat = df[is_residential & has_battery].drop(columns=flag_columns)

# Non-residential systems: without battery / with battery.
non_res = df[is_non_residential & no_battery].drop(columns=no_battery_columns)
non_res_bat = df[is_non_residential & has_battery].drop(columns=flag_columns)

### Tehtävään valitut algoritmit

- LinearRegression. 

In [72]:
from math import sqrt
from pandas import MultiIndex, Int16Dtype
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,RobustScaler,MinMaxScaler
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.svm import LinearSVR
from sklearn.linear_model import SGDRegressor
from sklearn import svm
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor

### Algoritmien hyvyyden määrittämiseen käytettiin seuraavia metriikoita: 

RMSE = Root Mean Square Error. Keskineliövirheen neliöjuuri antaa normalisoidun etäisyyden ennustettujen arvojen vektorin ja pohjatotuutena olevan vektorin välillä 

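R2 = R squared (selitysaste). R2-luku kertoo, kuinka hyvin regressiomalli sovittuu aineistoon, eli kuinka suuren osuuden pohjatotuuden vaihtelusta mallin muuttujat pystyvät selittämään. Arvo 1 tarkoittaa täydellistä sovitusta, ja negatiivinen arvo tarkoittaa, että malli ennustaa huonommin kuin pelkkä pohjatotuuden keskiarvo.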

MAE = Mean absolute error.  Keskimääräinen absoluuttinen virhe antaa numeerisen arvon, joka kertoo kuinka paljon algoritmin tekemä arvio keskimäärin erosi pohjatotuudesta 

In [73]:
def evaluate_models(df, scaler):
    """Fit a fixed set of regressors on ``df`` and tabulate train/test metrics.

    The ``total_installed_price`` column is the target and every other column
    is used as a feature. The rows are split 67 % / 33 % into train and test
    sets with a fixed seed, optionally scaled, and each model is fitted on the
    training split and scored on both splits.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing a ``total_installed_price`` column plus the feature
        columns (presumably all numeric; not checked here).
    scaler : object or None
        A transformer exposing ``fit_transform`` and ``transform`` (for
        example ``StandardScaler()``), or ``None`` to use the raw features.
        The scaler is fitted on the training split only and is modified in
        place by that fit.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``Model``, ``Scaler``,
        ``RMSE train``, ``RMSE test``, ``R2 train``, ``R2 test``,
        ``MAE train`` and ``MAE test``. ``Scaler`` holds the object that was
        passed in.
    """
    random_state = 42
    columns = ["Model", "Scaler", "RMSE train", "RMSE test",
               "R2 train", "R2 test", "MAE train", "MAE test"]

    y = df["total_installed_price"]
    X = df.drop(columns="total_installed_price")

    features = X.columns

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=random_state)

    if scaler is not None:
        # Scalers return plain arrays; rebuild the frames with the original
        # column names AND row labels so X and y stay aligned by index.
        X_train = pd.DataFrame(scaler.fit_transform(X_train),
                               columns=features, index=X_train.index)
        X_test = pd.DataFrame(scaler.transform(X_test),
                              columns=features, index=X_test.index)

    models = {"LinearRegression": LinearRegression(),
              "Lasso": linear_model.Lasso(alpha=0.1),
              "Linear Support Vector Regression": LinearSVR(random_state=random_state),
              "Support Vector Regression": svm.SVR(),
              # Seeded like the other stochastic models so reruns are reproducible.
              "Stochastic Gradient Descent": SGDRegressor(random_state=random_state),
              "Nearest Neighbors Regression": KNeighborsRegressor(),
              "Decision tree regressor": DecisionTreeRegressor(random_state=random_state),
              "RandomForest": RandomForestRegressor(max_depth=10, random_state=random_state),
              "Gradient Tree Boosting": GradientBoostingRegressor(random_state=random_state),
              "AdaBoost regressor": AdaBoostRegressor(random_state=random_state),
              "XGBoost Regressor": XGBRegressor(seed=random_state)
              }

    rows = []
    for name, model in models.items():
        model.fit(X_train, y_train)

        # Predict once per split and reuse the result for all three metrics.
        pred_train = model.predict(X_train)
        pred_test = model.predict(X_test)

        rows.append([
            name,
            scaler,
            sqrt(mean_squared_error(y_train, pred_train)),
            sqrt(mean_squared_error(y_test, pred_test)),
            r2_score(y_train, pred_train),
            r2_score(y_test, pred_test),
            mean_absolute_error(y_train, pred_train),
            mean_absolute_error(y_test, pred_test),
        ])

    # Build the result frame in one go instead of appending row by row.
    return pd.DataFrame(rows, columns=columns)

In [74]:
# Baseline: score every model on the residential + battery subset without feature scaling.
evaluate_models(res_bat,None)



Unnamed: 0,Model,Scaler,RMSE train,RMSE test,R2 train,R2 test,MAE train,MAE test
0,LinearRegression,,12664.630903032848,41260.077543042345,0.5446183792204671,-1.54714078172923,6318.779740412509,7287.274278712932
1,Lasso,,12664.644033062044,41247.23689096143,0.544617434988121,-1.545555624302441,6318.376430042887,7286.120686144663
2,Linear Support Vector Regression,,17523.00220024056,21533.28174686895,0.1282188888609969,0.3062335157291781,10565.85232968856,10804.031656752344
3,Support Vector Regression,,18973.589589408257,26018.103825262333,-0.0220906808505256,-0.0128477191485167,11275.44252321072,11548.81012509722
4,Stochastic Gradient Descent,,9.363221736853105e+17,9.369756108365206e+17,-2.4890925251958533e+27,-1.3135585711623132e+27,9.352868985648714e+17,9.36254327683929e+17
5,Nearest Neighbors Regression,,14969.267097836811,25953.500361704075,0.363803172394369,-0.0078241221288744,8808.198283337411,11425.739320417288
6,Decision tree regressor,,170.08319975026623,22727.77497140709,0.9999178678372423,0.2271295350539083,3.941277220455131,8135.162349726776
7,RandomForest,,7158.134375920529,19211.786489539183,0.8545244131617945,0.4477593487584801,4127.554895578291,6276.000604278814
8,Gradient Tree Boosting,,8603.423088287669,18784.06897320804,0.7898482624292056,0.4720750127189025,5103.0473536810105,6364.658719550396
9,AdaBoost regressor,,15214.364311921996,21829.496163504347,0.3427992556772288,0.2870151631137034,12007.627785691811,12211.656193616953


In [75]:
# Same evaluation on the residential + battery subset, with features passed through StandardScaler.
evaluate_models(res_bat,StandardScaler())

Unnamed: 0,Model,Scaler,RMSE train,RMSE test,R2 train,R2 test,MAE train,MAE test
0,LinearRegression,StandardScaler(),12664.630903032845,41260.07754304045,0.5446183792204673,-1.5471407817289946,6318.7797404124685,7287.27427871284
1,Lasso,StandardScaler(),12664.6310194253,41255.145817059965,0.5446183708502298,-1.5465319098908863,6318.660136950461,7287.014643991295
2,Linear Support Vector Regression,StandardScaler(),38600.178344870576,42621.80090449615,-3.230279341632452,-1.7180438679011942,33747.11276013513,33883.72950668921
3,Support Vector Regression,StandardScaler(),18933.729172767,25990.203239400504,-0.0178007002818889,-0.0106766237366293,11228.988367294252,11503.482105427674
4,Stochastic Gradient Descent,StandardScaler(),12710.755009536038,36650.26764433404,0.5412953738281604,-1.009773754966118,6416.426676185438,7277.8878452895615
5,Nearest Neighbors Regression,StandardScaler(),11367.994826371549,20283.79327751259,0.6330911742967542,0.3844104749906846,5436.1645172498165,7152.758594138103
6,Decision tree regressor,StandardScaler(),170.08319975026623,22709.940311893257,0.9999178678372423,0.2283420139216451,3.941277220455131,8121.393785394933
7,RandomForest,StandardScaler(),7142.481673653221,19258.013621501043,0.8551599408775072,0.4450985639704384,4124.155905879158,6276.552552787107
8,Gradient Tree Boosting,StandardScaler(),8603.423088287669,19015.405117004702,0.7898482624292056,0.4589915675391076,5103.0473536810105,6367.849783875502
9,AdaBoost regressor,StandardScaler(),14472.171752484745,21505.5524382579,0.4053549063249484,0.3080191495771895,11043.65674111451,11396.233809183666


In [76]:
# Same evaluation on the residential + battery subset, with features passed through RobustScaler.
evaluate_models(res_bat,RobustScaler())



Unnamed: 0,Model,Scaler,RMSE train,RMSE test,R2 train,R2 test,MAE train,MAE test
0,LinearRegression,RobustScaler(),12664.630903032848,41260.07754303712,0.5446183792204672,-1.5471407817285838,6318.779740412598,7287.274278712971
1,Lasso,RobustScaler(),12664.63138709237,41256.9847162601,0.5446183444098491,-1.5467589322302795,6318.671508415877,7286.91414978881
2,Linear Support Vector Regression,RobustScaler(),30464.324380265734,41061.485377344325,-1.6349569618060729,-1.5226800982051802,26121.081081964698,26787.97111330994
3,Support Vector Regression,RobustScaler(),18972.9269381197,26017.666391938543,-0.0220192892062247,-0.0128136621224703,11274.853826364428,11548.34955153574
4,Stochastic Gradient Descent,RobustScaler(),435856264777597.6,342041397697959.1,-5.393578566558718e+20,-1.750452033209689e+20,71633529910627.05,59625729018295.75
5,Nearest Neighbors Regression,RobustScaler(),11896.816928183718,21226.26178438575,0.5981610955498442,0.325875824976498,5573.742353315391,7317.617392945852
6,Decision tree regressor,RobustScaler(),170.08319975026623,22722.670499080134,0.9999178678372423,0.2274766568366141,3.941277220455131,8129.0580228514655
7,RandomForest,RobustScaler(),7157.615391061493,19225.06609750444,0.854545507174842,0.4469956432169325,4127.20260011967,6273.831194273174
8,Gradient Tree Boosting,RobustScaler(),8603.423088287669,18782.112968775968,0.7898482624292056,0.4721849537388503,5103.0473536810105,6361.209866611232
9,AdaBoost regressor,RobustScaler(),14202.62332435454,21235.355541197085,0.4272995003323353,0.3252980845767077,10789.87816713308,11092.341215720406


In [77]:
# Same evaluation on the residential + battery subset, with features passed through MinMaxScaler.
evaluate_models(res_bat,MinMaxScaler())

Unnamed: 0,Model,Scaler,RMSE train,RMSE test,R2 train,R2 test,MAE train,MAE test
0,LinearRegression,MinMaxScaler(),12664.63090303285,41260.07754304043,0.5446183792204671,-1.5471407817289935,6318.7797404124685,7287.274278712841
1,Lasso,MinMaxScaler(),12664.6597111398,41173.01602058356,0.5446163075142396,-1.5364028478469014,6317.270274167462,7283.639237179088
2,Linear Support Vector Regression,MinMaxScaler(),25058.631674328557,30641.836583993107,-0.7828102785875015,-0.404825732940149,17440.02236738605,17686.969622178854
3,Support Vector Regression,MinMaxScaler(),18957.732819094672,26007.268403550803,-0.020383014018221,-0.0120042797488602,11256.181074449298,11530.070197317473
4,Stochastic Gradient Descent,MinMaxScaler(),12908.671422039728,34220.31587595389,0.526899380501215,-0.7521081013759763,6388.485244273707,7212.806879251486
5,Nearest Neighbors Regression,MinMaxScaler(),12864.589374324169,21615.30016283116,0.5301250625150353,0.3009384570884607,6809.43771079031,8858.19181917536
6,Decision tree regressor,MinMaxScaler(),170.08319975026623,22787.848960241343,0.9999178678372423,0.2230384374622541,3.941277220455136,8187.434078489817
7,RandomForest,MinMaxScaler(),7158.061627771404,19209.506060671243,0.8545273700847444,0.4478904423225276,4127.537643701079,6273.426685330006
8,Gradient Tree Boosting,MinMaxScaler(),8603.423088287669,19008.266249511325,0.7898482624292056,0.4593977079822471,5103.0473536810105,6364.757212015874
9,AdaBoost regressor,MinMaxScaler(),15214.364311921996,21829.496163504347,0.3427992556772288,0.2870151631137034,12007.627785691811,12211.656193616953


In [78]:
def drop_iqr_outliers(frame, columns, whisker=1.5):
    """Return a copy of ``frame`` without rows outside the IQR fences of ``columns``.

    Columns are handled in the given order, and each column's quartiles are
    computed on the rows that survived the filters of the previous columns.
    For every column the 25th and 75th percentiles and their difference (IQR)
    are printed, and only rows strictly between ``Q1 - whisker * IQR`` and
    ``Q3 + whisker * IQR`` are kept.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input data; it is not modified.
    columns : iterable of str
        Columns to filter on, in processing order.
    whisker : float, default 1.5
        Multiplier applied to the IQR when building the fences.

    Returns
    -------
    pandas.DataFrame
        The filtered rows, keeping the original row labels.
    """
    kept = frame.copy()

    for col in columns:
        q1, q3 = np.percentile(a=kept[col], q=[25, 75])
        iqr = q3 - q1
        # Output text is unchanged: "Alaraja"/"Yläraja" here are the 25th and
        # 75th percentiles, not the final fences.
        print(f"{col} Alaraja = {q1} Yläraja = {q3} IQR = {iqr}")

        lower_fence = q1 - whisker * iqr
        upper_fence = q3 + whisker * iqr

        # Strict comparisons: values lying exactly on a fence are removed too.
        kept = kept[(kept[col] > lower_fence) & (kept[col] < upper_fence)]

    return kept


# Columns used for outlier removal. An earlier version of this list also
# contained 'inverter_quantity'; it is currently left out.
features = ['system_size_DC', 'total_installed_price','inverter_loading_ratio', 'module_quantity','module_efficiency', 'inverter_total_capacity']

res_bat_clean = drop_iqr_outliers(res_bat, features)

res_bat_clean.info()

system_size_DC Alaraja = 6.12 Yläraja = 9.28 IQR = 3.1599999999999993
total_installed_price Alaraja = 27749.8 Yläraja = 42249.0 IQR = 14499.2
inverter_loading_ratio Alaraja = 0.909926470588235 Yläraja = 1.16279069767442 IQR = 0.25286422708618506
module_quantity Alaraja = 19.0 Yläraja = 28.0 IQR = 9.0
module_efficiency Alaraja = 0.180379746835443 Yläraja = 0.200157214393851 IQR = 0.019777467558408007
inverter_total_capacity Alaraja = 6.239999999999982 Yläraja = 7.616000000000001 IQR = 1.3760000000000199
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4386 entries, 67 to 593151
Data columns (total 28 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   installation_date              4386 non-null   int64  
 1   system_size_DC                 4386 non-null   float64
 2   total_installed_price          4386 non-null   float64
 3   rebate_or_grant                4386 non-null   float64
 4   expansion_system 

In [79]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# Target is the total installed price; every other column of `res` is a feature.
# NOTE(review): this cell models `res` (residential, no battery), not the
# outlier-filtered `res_bat_clean` built earlier - confirm that is intended.
y = res["total_installed_price"]
X = res.drop(columns="total_installed_price")

features = X.columns

# 80 % / 20 % train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# The scaler is fitted on the training split only and then applied to both splits.
scaler = StandardScaler()

# The scaler returns plain arrays; restore the column names AND the original
# row labels so X_train/X_test stay index-aligned with y_train/y_test.
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=features, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=features, index=X_test.index)

In [80]:
from sklearn import linear_model
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

# Ordinary least-squares baseline; fit() returns the fitted estimator itself.
regr = linear_model.LinearRegression().fit(X_train, y_train)

# Predictions for the held-out split.
ypred = regr.predict(X_test)

# "Error" is the root of the mean squared error (RMSE); MAE is the mean absolute error.
error_regr = sqrt(mean_squared_error(y_true=y_test, y_pred=ypred))
mae_regr = mean_absolute_error(y_test, ypred)
print(f"Error={error_regr}", f"MAE={mae_regr}", sep="\n")


Error=12414.937426890789
MAE=6346.131281870302


In [81]:
# from sklearn.ensemble import RandomForestRegressor

# rfr = RandomForestRegressor(max_depth=10, random_state=42)
# rfr.fit(X_train, y_train)

# ypred = rfr.predict(X_test)

# error_rfr = mean_squared_error(y_test, ypred,squared=False)

# mae_rfr = mean_absolute_error(y_true=y_test,y_pred=ypred)

# print(f"Error={error_rfr}")
# print(f"MAE={mae_rfr}")



In [82]:
# from sklearn import svm

# svm = svm.SVR()

# svm.fit(X_train,y_train)

# ypred = svm.predict(X_test)


# error_svm = mean_squared_error(y_test, ypred,squared=False)

# mae_svm = mean_absolute_error(y_true=y_test,y_pred=ypred)

# print(f"Error={error_svm}")
# print(f"MAE={mae_svm}")