# Artificial Neural Network

### Importing the libraries

In [67]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [68]:
# Show the installed TensorFlow version as the cell output.
tf.__version__

'2.18.0'

## Part 1 - Data Preprocessing

### Importing the dataset

In [69]:
# Load the dataset from the CSV file in the working directory.
dataset = pd.read_csv('bunkerweb.csv')
# Features: every column except the first and the last one.
X = dataset.iloc[:, 1:-1].values # Column 0 (version) is left out: it has no impact on the result.
# Target: the last column.
y = dataset.iloc[:, -1].values

In [70]:
# Preview the feature matrix (NumPy abbreviates large arrays when printing).
print(X)

[['docs/json2md.py' 'God Object']
 ['docs/json2md.py' 'Hub-like Dependency']
 ['docs/json2md.py' 'Orphan Module']
 ...
 ['tests/ui/wizard.py' 'Potential Redundant Abstractions']
 ['tests/ui/wizard.py' 'Scattered Functionality']
 ['tests/ui/wizard.py' 'Unstable Dependency']]


In [71]:
# Preview the target vector (NumPy abbreviates large arrays when printing).
print(y)

[0 0 0 ... 0 0 1]


### Encoding categorical data

In [72]:
# Wrap the raw feature array in a DataFrame so its columns can be addressed by name.
df = pd.DataFrame(X, columns=['file', 'smell'])

One-hot encoding the categorical columns: "*file*" and "*smell*"

In [73]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Columns to one-hot encode.
cat_cols = ['file', 'smell']
# handle_unknown='ignore': a category not seen during fit is encoded as all
# zeros at transform time instead of raising an error.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'), cat_cols)],
    # NOTE(review): df is built with only the 'file' and 'smell' columns, so
    # there is currently no 'version_count' column to pass through - confirm.
    remainder='passthrough'  # to keep 'version_count'
)

# Fit the encoder on the full frame and encode it in one step.
X_encoded = ct.fit_transform(df)

In [74]:
# Preview the encoded feature matrix.
print(X_encoded)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 281456 stored elements and shape (140728, 366)>
  Coords	Values
  (0, 0)	1.0
  (0, 359)	1.0
  (1, 0)	1.0
  (1, 360)	1.0
  (2, 0)	1.0
  (2, 361)	1.0
  (3, 0)	1.0
  (3, 362)	1.0
  (4, 0)	1.0
  (4, 363)	1.0
  (5, 0)	1.0
  (5, 364)	1.0
  (6, 0)	1.0
  (6, 365)	1.0
  (7, 1)	1.0
  (7, 359)	1.0
  (8, 1)	1.0
  (8, 360)	1.0
  (9, 1)	1.0
  (9, 361)	1.0
  (10, 1)	1.0
  (10, 362)	1.0
  (11, 1)	1.0
  (11, 363)	1.0
  (12, 1)	1.0
  :	:
  (140715, 360)	1.0
  (140716, 357)	1.0
  (140716, 361)	1.0
  (140717, 357)	1.0
  (140717, 362)	1.0
  (140718, 357)	1.0
  (140718, 363)	1.0
  (140719, 357)	1.0
  (140719, 364)	1.0
  (140720, 357)	1.0
  (140720, 365)	1.0
  (140721, 358)	1.0
  (140721, 359)	1.0
  (140722, 358)	1.0
  (140722, 360)	1.0
  (140723, 358)	1.0
  (140723, 361)	1.0
  (140724, 358)	1.0
  (140724, 362)	1.0
  (140725, 358)	1.0
  (140725, 363)	1.0
  (140726, 358)	1.0
  (140726, 364)	1.0
  (140727, 358)	1.0
  (140727, 365)	1.0


### Splitting the dataset into the Training set and Test set

In [75]:
from sklearn.model_selection import train_test_split

# Hold out the trailing 20% of the rows for testing and train on the leading
# 80%. Rows keep their file order (no shuffling), which gives a chronological
# split - assuming the CSV is ordered by version (TODO confirm).
n_rows = X_encoded.shape[0]
split_index = int(n_rows * 0.8)

# Slice features and targets at the same boundary so they stay aligned.
X_train, X_test = X_encoded[:split_index], X_encoded[split_index:]
y_train, y_test = y[:split_index], y[split_index:]


### Feature Scaling

In [76]:
from sklearn.preprocessing import StandardScaler
# with_mean=False: scale each feature by its standard deviation without centering it.
# NOTE(review): presumably chosen because the encoded features are a sparse
# matrix, which StandardScaler cannot center - confirm.
sc = StandardScaler(with_mean=False)
# In a neural network it is really important to apply feature scaling to all features.
# Fit the scaler on the training set only, then reuse the same scaling for the
# test set so that no test-set statistics are used during fitting.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [77]:
# Preview the scaled training features.
print(X_train)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 225164 stored elements and shape (112582, 366)>
  Coords	Values
  (0, 0)	18.9316132544039
  (0, 359)	2.857674577002177
  (1, 0)	18.9316132544039
  (1, 360)	2.8577486098306815
  (2, 0)	18.9316132544039
  (2, 361)	2.8577486098306815
  (3, 0)	18.9316132544039
  (3, 362)	2.8577486098306815
  (4, 0)	18.9316132544039
  (4, 363)	2.8577486098306815
  (5, 0)	18.9316132544039
  (5, 364)	2.8577486098306815
  (6, 0)	18.9316132544039
  (6, 365)	2.8577486098306815
  (7, 1)	18.9316132544039
  (7, 359)	2.857674577002177
  (8, 1)	18.9316132544039
  (8, 360)	2.8577486098306815
  (9, 1)	18.9316132544039
  (9, 361)	2.8577486098306815
  (10, 1)	18.9316132544039
  (10, 362)	2.8577486098306815
  (11, 1)	18.9316132544039
  (11, 363)	2.8577486098306815
  (12, 1)	18.9316132544039
  :	:
  (112569, 361)	2.8577486098306815
  (112570, 285)	18.9316132544039
  (112570, 362)	2.8577486098306815
  (112571, 285)	18.9316132544039
  (112571, 363)	2.8577486098306

## Part 2 - Building the ANN

### Initializing the ANN

In [78]:
# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialisation and batch shuffling are repeatable across "Restart & Run All".
tf.keras.utils.set_random_seed(42)

# Empty sequential model; the layers are appended in the cells below.
ann = tf.keras.models.Sequential()

### Adding the input layer and the first hidden layer

In [79]:
# units => number of neurons in the layer. There is no rule for choosing this
# number; it has to be determined by experimenting to find what works well
# for the project.
first_hidden_layer = tf.keras.layers.Dense(units=6, activation='relu')
ann.add(first_hidden_layer)

### Adding the second hidden layer

In [80]:
# Second hidden layer: 6 neurons with ReLU activation.
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))

### Adding the output layer

In [81]:
# "Units = 1" because a single neuron is enough to output the target (0 or 1).
# No activation is passed, so the layer output is linear (the Keras default for Dense).
ann.add(tf.keras.layers.Dense(units=1))

## Part 3 - Training the ANN

### Compiling the ANN

In [82]:
# Alternative classification setup, kept for reference:
#ann.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics=['accuracy'])
# Regression setup: minimise the mean squared error and report the mean absolute error.
ann.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])  # or 'mse'

### Training the ANN on the Training set

In [None]:
# Train on the training set for 100 epochs with mini-batches of 32 samples.
ann.fit(X_train, y_train, batch_size = 32, epochs = 100)

Epoch 1/100
[1m3519/3519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 0.1719 - mae: 0.1898
Epoch 2/100
[1m3519/3519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - loss: 0.0335 - mae: 0.0751
Epoch 3/100
[1m3519/3519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 0.0301 - mae: 0.0679
Epoch 4/100
[1m3519/3519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 0.0290 - mae: 0.0655
Epoch 5/100
[1m3519/3519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 0.0285 - mae: 0.0625
Epoch 6/100
[1m3519/3519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - loss: 0.0285 - mae: 0.0597
Epoch 7/100
[1m3519/3519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - loss: 0.0271 - mae: 0.0580
Epoch 8/100
[1m3519/3519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 0.0271 - mae: 0.0569
Epoch 9/100
[1m3519/3519[0m [32m━━━━━━━━━━━━━━━━━━━

## Part 4 - Making the predictions and evaluating the model

### Predicting the Test set results

In [None]:
# Raw model outputs for the held-out rows; they are kept as floats
# (the 0.5 threshold `y_pred = (y_pred > 0.5)` is intentionally not applied).
y_pred = ann.predict(X_test)

# Print predictions and ground truth side by side:
# column 0 is the predicted value, column 1 the actual target.
predicted_column = y_pred.reshape(-1, 1)
actual_column = y_test.reshape(-1, 1)
print(np.hstack((predicted_column, actual_column)))

### Computing the regression metrics (MSE, MAE, R²)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# y_pred is an array of floats, so no threshold must be applied here.
# Each metric is computed on the test targets against the raw predictions.
for label, metric in (
    ("MSE :", mean_squared_error),
    ("MAE :", mean_absolute_error),
    ("R²  :", r2_score),
):
    print(label, metric(y_test, y_pred))

### Predict the presence of smells on the latest version