In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from rich import print # This makes print way prettier

In [2]:
# Read the raw apple-quality table; the path is resolved relative to the
# current working directory.
DATA_PATH = "apple_quality.csv"
input_data = pd.read_csv(DATA_PATH)
input_data

Unnamed: 0,A_id,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality
0,0.0,-3.970049,-2.512336,5.346330,-1.012009,1.844900,0.329840,-0.491590483,good
1,1.0,-1.195217,-2.839257,3.664059,1.588232,0.853286,0.867530,-0.722809367,good
2,2.0,-0.292024,-1.351282,-1.738429,-0.342616,2.838636,-0.038033,2.621636473,bad
3,3.0,-0.657196,-2.271627,1.324874,-0.097875,3.637970,-3.413761,0.790723217,good
4,4.0,1.364217,-1.296612,-0.384658,-0.553006,3.030874,-1.303849,0.501984036,good
...,...,...,...,...,...,...,...,...,...
3996,3996.0,-0.293118,1.949253,-0.204020,-0.640196,0.024523,-1.087900,1.854235285,good
3997,3997.0,-2.634515,-2.138247,-2.440461,0.657223,2.199709,4.763859,-1.334611391,bad
3998,3998.0,-4.008004,-1.779337,2.366397,-0.200329,2.161435,0.214488,-2.229719806,good
3999,3999.0,0.278540,-1.715505,0.121217,-1.154075,1.266677,-0.776571,1.599796456,good


In [3]:
# Per-column count of missing values (isna is the canonical spelling of isnull).
input_data.isna().sum()


A_id           1
Size           1
Weight         1
Sweetness      1
Crunchiness    1
Juiciness      1
Ripeness       1
Acidity        0
Quality        1
dtype: int64

In [4]:
# Remove, in place, every row that has at least one missing value
# (axis and how are written out but are the dropna defaults).
input_data.dropna(axis=0, how="any", inplace=True)

In [5]:
# (rows, columns) of the table after the incomplete rows were dropped.
input_data.shape


(4000, 9)

In [6]:
# Print each column's dtype and non-null count to spot columns that still
# need cleaning.
input_data.info()


<class 'pandas.core.frame.DataFrame'>
Index: 4000 entries, 0 to 3999
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   A_id         4000 non-null   float64
 1   Size         4000 non-null   float64
 2   Weight       4000 non-null   float64
 3   Sweetness    4000 non-null   float64
 4   Crunchiness  4000 non-null   float64
 5   Juiciness    4000 non-null   float64
 6   Ripeness     4000 non-null   float64
 7   Acidity      4000 non-null   object 
 8   Quality      4000 non-null   object 
dtypes: float64(7), object(2)
memory usage: 312.5+ KB


In [7]:
# Acidity was parsed as object (strings); cast that single column to float64
# and leave every other column untouched.
input_data = input_data.astype({'Acidity': 'float64'})

In [8]:
# Re-inspect the dtypes to confirm that Acidity is now float64.
input_data.info()


<class 'pandas.core.frame.DataFrame'>
Index: 4000 entries, 0 to 3999
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   A_id         4000 non-null   float64
 1   Size         4000 non-null   float64
 2   Weight       4000 non-null   float64
 3   Sweetness    4000 non-null   float64
 4   Crunchiness  4000 non-null   float64
 5   Juiciness    4000 non-null   float64
 6   Ripeness     4000 non-null   float64
 7   Acidity      4000 non-null   float64
 8   Quality      4000 non-null   object 
dtypes: float64(8), object(1)
memory usage: 312.5+ KB


In [9]:
# Pull the target column out of the frame as a NumPy array of class names.
quality_column = input_data.loc[:, 'Quality']
labels_array = quality_column.to_numpy()
labels_array

array(['good', 'good', 'bad', ..., 'bad', 'good', 'good'], dtype=object)

In [10]:
# Columns fed to the models: every measurement except the A_id identifier
# and the Quality target (these are the positions 1..7 the row loop used to
# slice out).
feature_columns = ["Size", "Weight", "Sweetness", "Crunchiness",
                   "Juiciness", "Ripeness", "Acidity"]

# Derived from the column list so the network input size can never drift
# away from the actual feature matrix width.
features_no = len(feature_columns)

# One vectorised conversion instead of a Python loop over rows: the loop
# built an object-dtype Series per row, which is slow and easy to get wrong.
data_array = input_data[feature_columns].to_numpy(dtype=np.float64)

data_array

array([[-3.97004852, -2.51233638,  5.34632961, ...,  1.84490036,
         0.3298398 , -0.49159048],
       [-1.19521719, -2.83925653,  3.66405876, ...,  0.8532858 ,
         0.86753008, -0.72280937],
       [-0.29202386, -1.35128199, -1.73842916, ...,  2.83863551,
        -0.03803333,  2.62163647],
       ...,
       [-2.6345153 , -2.13824672, -2.44046129, ...,  2.19970859,
         4.76385918, -1.33461139],
       [-4.00800374, -1.77933711,  2.36639697, ...,  2.16143512,
         0.21448838, -2.22971981],
       [ 0.27853965, -1.71550503,  0.12121725, ...,  1.2666774 ,
        -0.77657147,  1.59979646]])

In [11]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# fit_transform both fits and encodes, so a separate fit() call is redundant.
# Encoding straight from the DataFrame column (rather than overwriting
# labels_array from itself) keeps this cell idempotent: re-running it always
# starts from the string labels, so le.classes_ keeps the class names.
# The encoder sorts its classes, hence 'bad' -> 0 and 'good' -> 1.
labels_array = le.fit_transform(input_data['Quality'].to_numpy())

labels_array

array([1, 1, 0, ..., 0, 1, 1])

In [12]:
from sklearn.preprocessing import normalize

# Rescale every sample (row) to unit L2 norm. norm and axis are written out
# for the reader; they are the values normalize() uses when omitted.
# NOTE(review): this is per-sample scaling, not per-feature standardisation —
# confirm that is the intended preprocessing.
data_array = normalize(data_array, norm="l2", axis=1)
data_array


array([[-0.53321852, -0.33743272,  0.71806728, ...,  0.24778917,
         0.04430089, -0.06602568],
       [-0.22816433, -0.54200781,  0.69946074, ...,  0.16289038,
         0.16560958, -0.13798271],
       [-0.06532623, -0.30228405, -0.38888952, ...,  0.63500752,
        -0.00850812,  0.58646447],
       ...,
       [-0.38340833, -0.31118498, -0.35516711, ...,  0.3201297 ,
         0.69329766, -0.1942297 ],
       [-0.68177964, -0.30267332,  0.40253487, ...,  0.36766993,
         0.03648545, -0.37928546],
       [ 0.09216294, -0.56762473,  0.04010825, ...,  0.41911706,
        -0.25695126,  0.52933919]])

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed random_state makes
# the partition identical on every run.
splits = train_test_split(data_array, labels_array, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

In [14]:
from sklearn.metrics import classification_report # Our metric module


In [15]:
from sklearn.ensemble import RandomForestClassifier

# Display names for classification_report, listed in encoded-label order.
# LabelEncoder sorts its classes, so 0 == 'bad' and 1 == 'good'; the previous
# ["good", "bad"] order swapped the class names in every report.
quality_types = ["bad", "good"]

# Fixed seed so the forest (and therefore the reported metrics) is
# reproducible across Restart & Run All.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)

y_pred_rfc = rfc.predict(X_test)
print(classification_report(y_test, y_pred_rfc, target_names=quality_types))

In [16]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees draws random split thresholds; pin the seed so the reported
# metrics do not change from one run of the notebook to the next.
erfc = ExtraTreesClassifier(random_state=42)
erfc.fit(X_train, y_train)

y_pred_erfc = erfc.predict(X_test)
print(classification_report(y_test, y_pred_erfc, target_names=quality_types))

In [17]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours baseline with the library's default settings;
# fit() returns the estimator itself, so construction and training chain.
knn = KNeighborsClassifier().fit(X_train, y_train)

y_pred_knn = knn.predict(X_test)
knn_report = classification_report(y_test, y_pred_knn, target_names=quality_types)
print(knn_report)


In [18]:
from sklearn.svm import SVC

# Support-vector classifier with a linear kernel.
svc_lin = SVC(kernel='linear').fit(X_train, y_train)

y_pred_svc_lin = svc_lin.predict(X_test)
svc_lin_report = classification_report(y_test, y_pred_svc_lin, target_names=quality_types)
print(svc_lin_report)

In [19]:
# Support-vector classifier with a polynomial kernel.
svc_poly = SVC(kernel='poly').fit(X_train, y_train)

y_pred_svc_poly = svc_poly.predict(X_test)
svc_poly_report = classification_report(y_test, y_pred_svc_poly, target_names=quality_types)
print(svc_poly_report)

In [20]:
# Support-vector classifier with a radial-basis-function kernel.
svc_rbf = SVC(kernel='rbf').fit(X_train, y_train)

y_pred_svc_rbf = svc_rbf.predict(X_test)
svc_rbf_report = classification_report(y_test, y_pred_svc_rbf, target_names=quality_types)
print(svc_rbf_report)

In [21]:
import torch

# Convert the feature splits to float32 tensors for the network.
# torch.as_tensor accepts NumPy arrays *and* existing tensors, so re-running
# this cell is harmless; torch.from_numpy raised TypeError on the second run
# because X_train/X_test had already been replaced by tensors.
X_train = torch.as_tensor(X_train, dtype=torch.float32)
X_test = torch.as_tensor(X_test, dtype=torch.float32)

X_train,X_test

(tensor([[ 0.3276, -0.5705, -0.0678,  ...,  0.2335,  0.5660, -0.2717],
         [-0.0610,  0.1417, -0.4052,  ...,  0.1196,  0.3613, -0.7980],
         [-0.3752, -0.4252, -0.1269,  ..., -0.0143,  0.7643, -0.2700],
         ...,
         [ 0.4325, -0.2937, -0.0474,  ..., -0.2400,  0.7884,  0.2124],
         [-0.5705, -0.3987,  0.5463,  ...,  0.1216,  0.1681, -0.2552],
         [-0.4607,  0.3505,  0.0743,  ...,  0.4484, -0.3466, -0.3898]]),
 tensor([[ 0.5633, -0.3074, -0.6687,  ...,  0.0978,  0.3017, -0.0990],
         [ 0.3915, -0.0609, -0.0167,  ...,  0.2408, -0.5960,  0.5310],
         [ 0.3910, -0.1649,  0.5626,  ..., -0.1802, -0.4316,  0.3270],
         ...,
         [-0.1331, -0.6713,  0.3996,  ...,  0.4894,  0.2141, -0.1561],
         [-0.3394, -0.0592,  0.1751,  ..., -0.0158,  0.5250, -0.7572],
         [ 0.0905, -0.3160, -0.0925,  ...,  0.5884,  0.0608, -0.3494]]))

In [22]:
# Convert the label splits to int64 tensors, the target dtype expected by
# CrossEntropyLoss for class indices. torch.as_tensor accepts arrays and
# tensors alike, so this cell can be re-run without the TypeError that
# torch.from_numpy raises on an input that is already a tensor.
y_train = torch.as_tensor(y_train, dtype=torch.long)
y_test = torch.as_tensor(y_test, dtype=torch.long)

y_train, y_test

(tensor([1, 1, 0,  ..., 1, 1, 1]),
 tensor([1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
         1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1,
         0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0,
         1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1,
         0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
         0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0,
         1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
         0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
         0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
         1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,
         0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0,
         1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0,

In [23]:
# Prefer the first CUDA device when one is present, otherwise stay on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Move all four splits to the selected device in one pass; on a CPU-only
# machine this leaves the data where it already is.
X_train, X_test, y_train, y_test = (
    split.to(device) for split in (X_train, X_test, y_train, y_test)
)

In [24]:
class NeuralClassifier(torch.nn.Module):
    """Fully connected classifier whose layer widths double at every step.

    Architecture (h = ``hidden_size``)::

        input_size -> h -> 2h -> 4h -> 8h -> 16h -> output_size

    Every layer except the last is followed by dropout and then LeakyReLU.
    The last layer returns raw, unnormalised scores (logits).

    Args:
        input_size: Number of features per sample.
        hidden_size: Width ``h`` of the first layer.
        output_size: Number of scores produced per sample.
        dropout_rate: Probability passed to ``torch.nn.Dropout``.
    """

    def __init__(self, input_size, hidden_size, output_size, dropout_rate):
        super().__init__()

        # Output widths of the five non-final layers: h, 2h, 4h, 8h, 16h.
        widths = [factor * hidden_size for factor in (1, 2, 4, 8, 16)]

        # Layers are created in network order; attribute names are kept
        # stable because they are the keys of the module's state_dict.
        self.input_layer = torch.nn.Linear(input_size, widths[0])
        self.hidden_layer_1 = torch.nn.Linear(widths[0], widths[1])
        self.hidden_layer_2 = torch.nn.Linear(widths[1], widths[2])
        self.hidden_layer_3 = torch.nn.Linear(widths[2], widths[3])
        self.hidden_layer_4 = torch.nn.Linear(widths[3], widths[4])
        self.output_layer = torch.nn.Linear(widths[4], output_size)

        # A single activation module and a single dropout module are shared
        # by all layers (neither holds learnable parameters).
        self.relu = torch.nn.LeakyReLU()
        self.dropout = torch.nn.Dropout(dropout_rate)

    def forward(self, x):
        """Return the output-layer scores for the batch ``x``."""
        trunk = (
            self.input_layer,
            self.hidden_layer_1,
            self.hidden_layer_2,
            self.hidden_layer_3,
            self.hidden_layer_4,
        )
        # Same recipe for every non-final layer: linear -> dropout -> LeakyReLU.
        for layer in trunk:
            x = self.relu(self.dropout(layer(x)))

        # The final layer has no dropout and no activation.
        return self.output_layer(x)

In [25]:
# Hyperparameters of the neural classifier and of its training run.
input_size = features_no
hidden_size = 100
output_size = len(quality_types)
dropout_rate = 0.38
learning_rate = 0.005
epochs = 400

# Seed PyTorch's random number generator before any weights are created, so
# that weight initialisation and dropout masks (and therefore the reported
# metrics) are reproducible across Restart & Run All.
torch.manual_seed(42)

# Initialize the model

model = NeuralClassifier(input_size, hidden_size, output_size, dropout_rate)

# If a gpu is available, we'll move the model to it
model.to(device)

# Define the loss function and the optimizer.
# CrossEntropyLoss consumes the raw scores returned by the model together
# with integer class indices.

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [26]:
# Here, we define our training loop

# Make sure dropout is active. Without this, re-running the training cell
# after the evaluation cell (which calls model.eval()) would silently train
# with dropout disabled.
model.train()

# Full-batch training: every epoch is one optimisation step over X_train.
for epoch in range(epochs):
    # Forward pass
    output = model(X_train)
    loss = criterion(output, y_train)

    # Backward pass and optimization: clear stale gradients, backpropagate
    # the current loss, then update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Print loss for this epoch

    print(f"Epoch: {epoch+1}. Loss: {loss}")

In [27]:
def count_parameters(model):
    """Return the total number of trainable scalar values in ``model``.

    Only parameters whose ``requires_grad`` flag is set are counted; frozen
    parameters are skipped.
    """
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

# Out of curiosity, report how many trainable parameters the network has
# (the sum of element counts over all parameters with requires_grad set).
count_parameters(model)

1707002

In [28]:
# Switch to evaluation mode so dropout is disabled during prediction.
model.eval()

# Inference needs no gradients: no_grad() skips building the autograd graph,
# which saves memory and keeps the predictions detached from the model.
with torch.no_grad():
    output = model(X_test)
    # The predicted class is the index of the largest score in each row.
    y_pred_nn = torch.argmax(output, dim=1)

y_pred_nn


tensor([1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
        1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1,
        0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0,
        1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1,
        0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0,
        1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
        1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
        0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
        1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,
        0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0,
        0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0,
        0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,

In [29]:
# Bring labels and predictions back to the CPU before handing them to
# scikit-learn, then print the per-class metrics for the neural network.
nn_report = classification_report(y_test.cpu(), y_pred_nn.cpu(), target_names=quality_types)
print(nn_report)
