In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Data Loading and Visualization

In [2]:
# Read salary data
file_path = Path("./appleresources/apple_quality.csv")
df_apple = pd.read_csv(file_path)

# Display sample data
df_apple.head()

Unnamed: 0,A_id,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality
0,0.0,-3.970049,-2.512336,5.34633,-1.012009,1.8449,0.32984,-0.491590483,good
1,1.0,-1.195217,-2.839257,3.664059,1.588232,0.853286,0.86753,-0.722809367,good
2,2.0,-0.292024,-1.351282,-1.738429,-0.342616,2.838636,-0.038033,2.621636473,bad
3,3.0,-0.657196,-2.271627,1.324874,-0.097875,3.63797,-3.413761,0.790723217,good
4,4.0,1.364217,-1.296612,-0.384658,-0.553006,3.030874,-1.303849,0.501984036,good


In [3]:
print(df_apple.shape)

(4001, 9)


In [None]:
# sns.scatterplot(df_apple, x='Ripeness', y='Size', hue="Quality")
# plt.grid(True)
# plt.show()

In [None]:
# sns.scatterplot(data, x='Ripeness', y='Weight', hue="Quality")
# plt.grid(True)
# plt.show()

In [None]:
# sns.scatterplot(data, x='Ripeness', y='Sweetness', hue="Quality")
# plt.grid(True)
# plt.show()

In [None]:
# sns.scatterplot(data, x='Ripeness', y='Crunchiness', hue="Quality")
# plt.grid(True)
# plt.show()

In [None]:
# sns.scatterplot(data, x='Ripeness', y='Juiciness', hue="Quality")
# plt.grid(True)
# plt.show()

In [None]:
# sns.scatterplot(data, x='Ripeness', y='Acidity', hue="Quality")
# plt.grid(True)
# plt.show()

# LOGISTIC REGRESSION

STEP 1 - PREPROCESS

In [4]:
# Find null values
print(df_apple.isnull().sum())

A_id           1
Size           1
Weight         1
Sweetness      1
Crunchiness    1
Juiciness      1
Ripeness       1
Acidity        0
Quality        1
dtype: int64


In [5]:
# Remove null values
df_apple_cleaned = df_apple.dropna()

In [6]:
# Show cleaned results
print(df_apple_cleaned.isnull().sum())

A_id           0
Size           0
Weight         0
Sweetness      0
Crunchiness    0
Juiciness      0
Ripeness       0
Acidity        0
Quality        0
dtype: int64


In [7]:
# Identify Dtype
df_apple_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4000 entries, 0 to 3999
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   A_id         4000 non-null   float64
 1   Size         4000 non-null   float64
 2   Weight       4000 non-null   float64
 3   Sweetness    4000 non-null   float64
 4   Crunchiness  4000 non-null   float64
 5   Juiciness    4000 non-null   float64
 6   Ripeness     4000 non-null   float64
 7   Acidity      4000 non-null   object 
 8   Quality      4000 non-null   object 
dtypes: float64(7), object(2)
memory usage: 312.5+ KB


In [8]:
# Convert "Acidity" dtype to numerical value
df_apple_cleaned['Acidity'] = df_apple_cleaned['Acidity'].astype(float)
print(df_apple_cleaned.dtypes)

A_id           float64
Size           float64
Weight         float64
Sweetness      float64
Crunchiness    float64
Juiciness      float64
Ripeness       float64
Acidity        float64
Quality         object
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_apple_cleaned['Acidity'] = df_apple_cleaned['Acidity'].astype(float)


In [9]:
# Transform "Quality" colulmn with encoding function
def encode_Quality(Quality):
    if Quality == "good":
        return 1
    else:
        return 0


In [10]:
# Call the encode_Quality function on the Quality column
df_apple_cleaned['Quality'] = df_apple_cleaned['Quality'].apply(encode_Quality)

df_apple_cleaned['Quality'] = df_apple_cleaned['Quality'].astype(float)

# Review the DataFrame
df_apple_cleaned

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_apple_cleaned['Quality'] = df_apple_cleaned['Quality'].apply(encode_Quality)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_apple_cleaned['Quality'] = df_apple_cleaned['Quality'].astype(float)


Unnamed: 0,A_id,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality
0,0.0,-3.970049,-2.512336,5.346330,-1.012009,1.844900,0.329840,-0.491590,1.0
1,1.0,-1.195217,-2.839257,3.664059,1.588232,0.853286,0.867530,-0.722809,1.0
2,2.0,-0.292024,-1.351282,-1.738429,-0.342616,2.838636,-0.038033,2.621636,0.0
3,3.0,-0.657196,-2.271627,1.324874,-0.097875,3.637970,-3.413761,0.790723,1.0
4,4.0,1.364217,-1.296612,-0.384658,-0.553006,3.030874,-1.303849,0.501984,1.0
...,...,...,...,...,...,...,...,...,...
3995,3995.0,0.059386,-1.067408,-3.714549,0.473052,1.697986,2.244055,0.137784,0.0
3996,3996.0,-0.293118,1.949253,-0.204020,-0.640196,0.024523,-1.087900,1.854235,1.0
3997,3997.0,-2.634515,-2.138247,-2.440461,0.657223,2.199709,4.763859,-1.334611,0.0
3998,3998.0,-4.008004,-1.779337,2.366397,-0.200329,2.161435,0.214488,-2.229720,1.0


In [11]:
# Remove "A_id" column
data = df_apple_cleaned.drop(columns=['A_id'])
data

Unnamed: 0,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality
0,-3.970049,-2.512336,5.346330,-1.012009,1.844900,0.329840,-0.491590,1.0
1,-1.195217,-2.839257,3.664059,1.588232,0.853286,0.867530,-0.722809,1.0
2,-0.292024,-1.351282,-1.738429,-0.342616,2.838636,-0.038033,2.621636,0.0
3,-0.657196,-2.271627,1.324874,-0.097875,3.637970,-3.413761,0.790723,1.0
4,1.364217,-1.296612,-0.384658,-0.553006,3.030874,-1.303849,0.501984,1.0
...,...,...,...,...,...,...,...,...
3995,0.059386,-1.067408,-3.714549,0.473052,1.697986,2.244055,0.137784,0.0
3996,-0.293118,1.949253,-0.204020,-0.640196,0.024523,-1.087900,1.854235,1.0
3997,-2.634515,-2.138247,-2.440461,0.657223,2.199709,4.763859,-1.334611,0.0
3998,-4.008004,-1.779337,2.366397,-0.200329,2.161435,0.214488,-2.229720,1.0


In [14]:
# Prepare the data using the StandardScaler model andfit_transform function to scall all columns with numerical values
data_scaled = StandardScaler().fit_transform(data[['Size','Weight','Sweetness','Crunchiness','Juiciness','Ripeness',
                                                   'Acidity']])
# Display the scaled data
data_scaled

array([[-1.79842417, -0.95037339,  2.99342063, ...,  0.69054495,
        -0.08987211, -0.26941526],
       [-0.35906018, -1.15440431,  2.12769769, ...,  0.17676683,
         0.1970196 , -0.37899737],
       [ 0.1094454 , -0.22575916, -0.65250727, ...,  1.20542179,
        -0.28615565,  1.20604367],
       ...,
       [-1.1056547 , -0.71690397, -1.01378401, ...,  0.87437918,
         2.27595716, -0.66895013],
       [-1.81811235, -0.49290842,  1.45990059, ...,  0.85454883,
        -0.15141937, -1.09317096],
       [ 0.40540882, -0.45307081,  0.30449592, ...,  0.39095445,
        -0.68021237,  0.72176064]])

In [15]:
# Create a DataFrame with the scaled data
data_scaled_df = pd.DataFrame(
    data_scaled,
    columns=['Size','Weight','Sweetness','Crunchiness','Juiciness','Ripeness','Acidity'])
data_scaled_df

Unnamed: 0,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity
0,-1.798424,-0.950373,2.993421,-1.424150,0.690545,-0.089872,-0.269415
1,-0.359060,-1.154404,2.127698,0.429746,0.176767,0.197020,-0.378997
2,0.109445,-0.225759,-0.652507,-0.946892,1.205422,-0.286156,1.206044
3,-0.079977,-0.800146,0.923916,-0.772399,1.619575,-2.087320,0.338315
4,0.968573,-0.191640,0.044164,-1.096894,1.305025,-0.961548,0.201472
...,...,...,...,...,...,...,...
3995,0.291729,-0.048594,-1.669449,-0.365345,0.614425,0.931482,0.028866
3996,0.108878,1.834105,0.137124,-1.159058,-0.252634,-0.846326,0.842347
3997,-1.105655,-0.716904,-1.013784,-0.234036,0.874379,2.275957,-0.668950
3998,-1.818112,-0.492908,1.459901,-0.845446,0.854549,-0.151419,-1.093171


In [22]:
# Concatenate the data_saled_df and encoded 'Quality'.
Quality = df_apple_cleaned['Quality']
data_scaled_df = pd.DataFrame(data_scaled, columns=['Size', 'Weight', 'Sweetness', 'Crunchiness','Juiciness', 'Ripeness', 'Acidity'])
data_scaled_df['Quality'] = Quality
data_scaled_df

Unnamed: 0,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality
0,-1.798424,-0.950373,2.993421,-1.424150,0.690545,-0.089872,-0.269415,1.0
1,-0.359060,-1.154404,2.127698,0.429746,0.176767,0.197020,-0.378997,1.0
2,0.109445,-0.225759,-0.652507,-0.946892,1.205422,-0.286156,1.206044,0.0
3,-0.079977,-0.800146,0.923916,-0.772399,1.619575,-2.087320,0.338315,1.0
4,0.968573,-0.191640,0.044164,-1.096894,1.305025,-0.961548,0.201472,1.0
...,...,...,...,...,...,...,...,...
3995,0.291729,-0.048594,-1.669449,-0.365345,0.614425,0.931482,0.028866,0.0
3996,0.108878,1.834105,0.137124,-1.159058,-0.252634,-0.846326,0.842347,1.0
3997,-1.105655,-0.716904,-1.013784,-0.234036,0.874379,2.275957,-0.668950,0.0
3998,-1.818112,-0.492908,1.459901,-0.845446,0.854549,-0.151419,-1.093171,1.0


In [23]:
# Review the data info
data_scaled_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Size         4000 non-null   float64
 1   Weight       4000 non-null   float64
 2   Sweetness    4000 non-null   float64
 3   Crunchiness  4000 non-null   float64
 4   Juiciness    4000 non-null   float64
 5   Ripeness     4000 non-null   float64
 6   Acidity      4000 non-null   float64
 7   Quality      4000 non-null   float64
dtypes: float64(8)
memory usage: 250.1 KB


CASES:
(1) Apple Classification - categorize aplle based on features.
(2) Quality Prediction - predict quality rating of aplle using attributes.

Model Seection:
(1) Logistic Regression
(2) Decision Trees
(3) Random Forests
(4) Support Vector Machines
(5) Gradient Boosting Machines

In [24]:
# Separate the features from the target.
y = data_scaled_df['Quality']
X = data_scaled_df.drop(columns="Quality")

STEP 2 - TRAINING

In [25]:
# Split the data into training and testing data.

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=1,
                                                    stratify=y)
X_train.shape

(3000, 7)

In [36]:
# Create a Logistic Regression classifier.
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)
classifier

In [37]:
# Train the model
classifier.fit(X_train, y_train)

STEP 3 - VALIDATE

In [38]:
# Get the training and testing data score.
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.7483333333333333
Testing Data Score: 0.754


STEP 4 - PREDICT

In [52]:
# Make predictions.
predictions = classifier.predict(X_test)
results = pd.DataFrame({'Prediction': predictions, 'Actual': y_test}).reset_index(drop=True)
results.head(10)

Unnamed: 0,Prediction,Actual
0,0.0,0.0
1,1.0,1.0
2,1.0,1.0
3,1.0,1.0
4,1.0,1.0
5,0.0,0.0
6,0.0,1.0
7,1.0,1.0
8,0.0,0.0
9,0.0,0.0
