In [2]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


The above command is used to import all the necessary libraries.

In [3]:
df = pd.read_csv('wines_SPA.csv')

This line is added to load your data into a pandas DataFrame named df.

In [5]:
target_column = 'type'


X = df.drop(columns=target_column)
y = df[target_column]



numerical_features = X.select_dtypes(include=['number']).columns


X_numerical = X[numerical_features]


X_train, X_test, y_train, y_test = train_test_split(X_numerical, y, test_size=0.2, random_state=42)


scaler = StandardScaler()


X_train_scaled = scaler.fit_transform(X_train)


X_test_scaled = scaler.transform(X_test)

The above commands perpares the data for machine learning model by:
1.Selecting numerical features: Focuses on numerical data suitable for scaling.
2.Scaling features: Standardizes numerical features to a similar range.
3.Splitting data: Prepares data for training and evaluating a machine learning model.

In [18]:
# Necessary Imports
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)


imputer_y = SimpleImputer(strategy='most_frequent')
y_train = imputer_y.fit_transform(y_train.reshape(-1, 1))
y_train = y_train.ravel()

# Use Logistic Regression for classification (increase max_iter to avoid convergence issues)
lr = LogisticRegression(max_iter=200)  # Increased the max_iter to 200
lr.fit(X_train_scaled, y_train)  # Fit model with imputed and scaled data
y_pred_lr = lr.predict(X_test_scaled)  # Make predictions on test set

# Check types of y_test and y_pred_lr
print("y_test type:", type(y_test))
print("y_pred_lr type:", type(y_pred_lr))

# Ensure y_test and y_pred_lr are both strings or both numeric
y_test = y_test.astype(str)  # Convert to strings if necessary
y_pred_lr = y_pred_lr.astype(str)  # Convert to strings if necessary


accuracy_lr = accuracy_score(y_test, y_pred_lr)
print(f"Logistic Regression Accuracy: {accuracy_lr}")


y_test type: <class 'pandas.core.series.Series'>
y_pred_lr type: <class 'numpy.ndarray'>
Logistic Regression Accuracy: 0.5513333333333333


The above code is used to perform logistic regression.

In [21]:
from sklearn.metrics import precision_score, accuracy_score
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)


print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
print("Precision (macro average):", precision_score(y_test, y_pred_dt, average='macro'))

Decision Tree Accuracy: 0.8
Precision (macro average): 0.4758459366645635


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


The above commands are used to make decision trees.

In [24]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)


print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Precision (macro average):", precision_score(y_test, y_pred_rf, average='macro'))

Random Forest Accuracy: 0.8146666666666667
Precision (macro average): 0.5484754512326666


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


The commands calculate precision for a multiclass classification dataset.

In [26]:
svm = SVC(random_state=42)
svm.fit(X_train_scaled, y_train)
y_pred_svm = svm.predict(X_test_scaled)

# Evaluation metrics
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Precision (macro average):", precision_score(y_test, y_pred_svm, average='macro'))

SVM Accuracy: 0.5466666666666666
Precision (macro average): 0.24666697778197919


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


The above code is used to form a Support Vector Machine.

In [28]:
knn = KNeighborsClassifier()
knn.fit(X_train_scaled, y_train)
y_pred_knn = knn.predict(X_test_scaled)

# Evaluation metrics
print("KNN Accuracy:", accuracy_score(y_test, y_pred_knn))
print("Precision (macro average):", precision_score(y_test, y_pred_knn, average='macro'))

KNN Accuracy: 0.808
Precision (macro average): 0.506148218539852


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


The above code performs KNN.
K-Nearest Neighbors (KNN) is a simple and powerful algorithm for classification and regression tasks. It is most effective when the data has a clear, consistent pattern and when you can compute distances easily. However, it can be computationally expensive for large datasets and sensitive to feature scaling, which makes preprocessing crucial.

In [30]:
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score

# Impute missing values using the 'mean' strategy (replace NaNs with the mean value of each column)
imputer = SimpleImputer(strategy='mean')  # For numerical data
X_train_imputed = imputer.fit_transform(X_train)  # Fit and transform on training data
X_test_imputed = imputer.transform(X_test)  # Only transform on test data

# Now train the Gradient Boosting model
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train_imputed, y_train)  # Fit model on imputed training data
y_pred_gb = gb.predict(X_test_imputed)  # Predict on imputed test data

# Evaluate the model
print("Gradient Boosting Accuracy:", accuracy_score(y_test, y_pred_gb))

# If multiclass, make sure to adjust precision scoring to handle multiple classes
print("Precision:", precision_score(y_test, y_pred_gb, average='macro'))  # Adjust 'average' if needed for your data


Gradient Boosting Accuracy: 0.8166666666666667
Precision: 0.5009857771647923


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


The above code performs Gradient Boosting that is a powerful ensemble learning technique that combines multiple weak learners (usually shallow decision trees) to create a strong, highly accurate model. It works by training a sequence of models, each focusing on correcting the errors of the previous model. Despite its high accuracy,

In [31]:
def evaluate_model(y_test, y_pred):
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

In the above code For all the models it  can calculate common metrics such as accuracy, precision, recall, F1-score, and confusion matrix.