In [1]:
import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import warnings
from sklearn import preprocessing

In [2]:
# Mounting my google drive
from google.colab import drive

In [3]:
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [4]:
#Create DataFrame
ss = pd.read_csv('/content/gdrive/MyDrive/Biogaz_Flow/Real_Plant_Data.csv')

In [5]:
del ss['Date'] # Deleting the 'Date' column
ss.head()

Unnamed: 0,TMF,TKM_percent,TUKM_percent,Alcantine,Fatty_Accid,Total_Biogaz_Production
0,240,46.26,33.15,787.0,244.36,2113
1,120,44.77,22.02,983.0,413.54,1754
2,200,44.45,22.61,740.0,378.6,1884
3,160,44.31,26.64,761.0,353.66,1768
4,200,50.58,23.23,748.0,343.96,1715


In [6]:
d = preprocessing.StandardScaler().fit(ss) # Applying normalization on the data

In [7]:
d

StandardScaler(copy=True, with_mean=True, with_std=True)

In [9]:
scaled_df

array([[-1.45461464e+00, -1.23416448e-01,  1.13778768e+00,
        -2.84290906e-01, -6.22474256e-01, -1.23313873e+00],
       [-1.73122386e+00, -2.02531594e-01, -9.47259351e-02,
         2.25673406e-01,  1.49532706e-01, -1.32404502e+00],
       [-1.54681771e+00, -2.19522767e-01, -2.93905321e-02,
        -4.06578267e-01, -9.90649066e-03, -1.29112631e+00],
       ...,
       [-1.40720845e-01,  1.21941714e+00,  1.26956587e+00,
        -1.21315447e+00, -1.44809915e+00, -2.81787913e-01],
       [ 5.27751435e-01,  2.72048103e+00,  2.77228014e+00,
        -1.26519165e+00, -1.51403780e+00, -4.49673351e-01],
       [ 5.50802204e-01,  1.98773672e+00,  2.03919477e+00,
         4.99649131e-01, -2.46842854e-03,  1.43876371e-01]])

In [10]:
df = pd.DataFrame(scaled_df, columns=names)

In [11]:
df.head()

Unnamed: 0,TMF,TKM_percent,TUKM_percent,Alcantine,Fatty_Accid,Total_Biogaz_Production
0,-1.454615,-0.123416,1.137788,-0.284291,-0.622474,-1.233139
1,-1.731224,-0.202532,-0.094726,0.225673,0.149533,-1.324045
2,-1.546818,-0.219523,-0.029391,-0.406578,-0.009906,-1.291126
3,-1.639021,-0.226956,0.416883,-0.351939,-0.123713,-1.3205
4,-1.546818,0.105964,0.039267,-0.385763,-0.167977,-1.333921


In [12]:
#Function to test model performance changes with feature elimination MSE
def Mean_Square_Error(model, x_test, y_test):
	prediction = model.predict(x_test)
	print ("Mean Square error of model:", mean_squared_error(y_test, prediction))

In [13]:
#Setting a parameter for SVM model
C = 1.0

In [14]:
# Identifying the target feature by splitting the dataset
samples = df.filter(['TMF', 'TKM_percent', 'TUKM_percent', 'Alcantine', 'Fatty_Accid'])
scores = df.filter(['Total_Biogaz_Production'])

In [15]:
# Defining the number of features to investigate
nFeatures = len(df.columns) - 1

In [16]:
samples

Unnamed: 0,TMF,TKM_percent,TUKM_percent,Alcantine,Fatty_Accid
0,-1.454615,-0.123416,1.137788,-0.284291,-0.622474
1,-1.731224,-0.202532,-0.094726,0.225673,0.149533
2,-1.546818,-0.219523,-0.029391,-0.406578,-0.009906
3,-1.639021,-0.226956,0.416883,-0.351939,-0.123713
4,-1.546818,0.105964,0.039267,-0.385763,-0.167977
...,...,...,...,...,...
389,0.089787,-0.090496,-0.041572,-1.529723,-0.996476
390,0.158939,-0.551913,-0.503349,-1.610797,-1.239012
391,-0.140721,1.219417,1.269566,-1.213154,-1.448099
392,0.527751,2.720481,2.772280,-1.265192,-1.514038


In [17]:
scores

Unnamed: 0,Total_Biogaz_Production
0,-1.233139
1,-1.324045
2,-1.291126
3,-1.320500
4,-1.333921
...,...
389,-0.215444
390,-0.359780
391,-0.281788
392,-0.449673


In [18]:
rfeIndex = nFeatures

In [19]:
#Recursively eliminate features based on the lowest weight
while True:
	#Split into training and testing
	x_train, x_test, y_train, y_test = train_test_split(samples, scores, test_size = 0.50, train_size=0.50)
	
	#Create SVM model using a linear kernel
	model = svm.SVR(kernel='linear')
	coef = model.coef_

	#Print co-efficients of features
	for i in range(0, nFeatures):
		print(samples.columns[i-1],":", coef[0][i-1])
	
	#Find the minimum weight among features and eliminate the feature with the smallest weight
	min = coef[0][0]
	index = 0
	for i in range(0, rfeIndex):
		if min > coef[0][i-1]:
			index = index + 1
			min = coef[0][i-1]
	if len(samples.columns) == 1:
		print("After recursive elimination we have the", samples.columns[index], "feature with a score of:", min)
		Mean_Square_Error(model, x_test, y_test)
		break
	else:
		print ("Lowest feature weight is for", samples.columns[index], "with a value of:", min)
		print ("Dropping feature", samples.columns[index])  

		#Drop the feature in the 'samples' dataframe based on the lowest feature index
		samples.drop(samples.columns[index], axis = 1, inplace = True)
		Mean_Square_Error(model, x_test, y_test)
		print ("\n")
		rfeIndex = rfeIndex - 1
		nFeatures = nFeatures - 1

Fatty_Accid : 0.07744107976161818
TMF : 0.6710680668231053
TKM_percent : 0.12254539993615038
TUKM_percent : 0.03062924608462167
Alcantine : -0.029367434110912694
Lowest feature weight is for Alcantine with a value of: -0.029367434110912694
Dropping feature Alcantine
Mean Square error of model: 0.5023223193110067


Fatty_Accid : 0.031709249215428836
TMF : 0.6467676986384152
TKM_percent : 0.02031222704452118
TUKM_percent : 0.1219839492584329
Lowest feature weight is for TUKM_percent with a value of: 0.02031222704452118
Dropping feature TUKM_percent
Mean Square error of model: 0.5128328631164885


Fatty_Accid : 0.06914420935544324
TMF : 0.6254379318136416
TKM_percent : 0.11532973486578202
Lowest feature weight is for TKM_percent with a value of: 0.06914420935544324
Dropping feature TKM_percent
Mean Square error of model: 0.6207426767464528


Fatty_Accid : 0.14413648963999193
TMF : 0.6537133034718534
Lowest feature weight is for Fatty_Accid with a value of: 0.14413648963999193
Dropping fea

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
