In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, mean_squared_error
import seaborn as sns
import matplotlib as plt
from statsmodels.formula.api import ols

In [2]:
# Read the wine dataset from the CSV file (path is relative to the working directory).
wine = pd.read_csv(filepath_or_buffer='wine_data.csv')

In [3]:
# Bare expression: renders the loaded DataFrame as this cell's output.
wine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6,white
1,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6,white
2,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6,white
3,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,white
4,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,white
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5,red
6493,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6,red
6494,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,red
6495,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5,red


In [12]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Replace the string labels in wine['type'] with integer codes.
# The encoder is fitted first and kept in `le`, then used to overwrite
# the column in place on the `wine` DataFrame.
le = LabelEncoder().fit(wine['type'])
wine['type'] = le.transform(wine['type'])

# Bare expression: show the frame after encoding.
wine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6,1
1,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6,1
2,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6,1
3,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,1
4,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5,0
6493,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6,0
6494,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,0
6495,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5,0


In [21]:
# Use every column of the frame as input to the correlation matrix.
selected_features = list(wine.columns)

# Pairwise Spearman rank correlations between all selected columns.
correlation_report = wine.loc[:, selected_features].corr(method='spearman')

# Print only the coefficients of each column against 'quality'.
print("Correlation report for 'quality':", correlation_report['quality'], sep="\n")

Correlation report for 'quality':
fixed acidity          -0.098154
volatile acidity       -0.257806
citric acid             0.105711
residual sugar         -0.016891
chlorides              -0.295054
free sulfur dioxide     0.086865
total sulfur dioxide   -0.054777
density                -0.322806
pH                      0.032538
sulphates               0.029831
alcohol                 0.446925
quality                 1.000000
type                    0.123001
Name: quality, dtype: float64


**Positive Correlations:**

*Alcohol* (**0.446925**): This indicates a moderate positive Spearman rank correlation between alcohol content and wine quality, the strongest association with quality of any feature in the report. Wines with higher alcohol content tend to have better quality scores.


**Negative Correlations:**

*Volatile acidity* (**-0.257806**): This is a weak-to-moderate negative correlation, indicating that wines with lower volatile acidity tend to have higher quality scores.

*Chlorides* (**-0.295054**): This is a moderate negative correlation, suggesting that lower chloride content might be linked to better quality wines.

*Density* (**-0.322806**): This is a moderate negative correlation, indicating a tendency for wines with lower density to have higher quality scores.

# Random Forest

In [13]:
# Target: the quality score. Features: every remaining column of the frame.
y = wine['quality']
X = wine.drop(columns='quality')

In [14]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Fit a 100-tree random forest classifier on the training split (seeded).
model_rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict quality labels for the held-out rows.
y_pred_rf = model_rf.predict(X_test)

# Fraction of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print("Random Forest Classifier Accuracy:", accuracy)

Random Forest Classifier Accuracy: 0.6946153846153846


# Linear Regression

In [29]:
# Predict quality from the four features with the largest absolute Spearman
# correlation to it in the correlation report above.
selected_features = ['alcohol', 'volatile acidity', 'chlorides', 'density']

X = wine[selected_features]
y = wine['quality']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and fit the model here: the notebook previously used `model_lr`
# without ever defining it, so this cell only worked with leftover kernel
# state and raised NameError on a fresh "Restart & Run All".
model_lr = LinearRegression()
model_lr.fit(X_train, y_train)

# Make predictions on the testing set
y_pred_lr = model_lr.predict(X_test)

# Evaluate model performance (mean squared error)
mse = mean_squared_error(y_test, y_pred_lr)
print("Linear Regression Mean Squared Error:", mse)


Linear Regression Mean Squared Error: 0.48961114487477203


# SVM

In [30]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# Target is the quality score; all other columns serve as features.
y = wine['quality']
X = wine.drop(columns='quality')

# 80/20 train/test partition with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise the features: the scaler is fitted on the training rows only
# and then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Linear-kernel support vector classifier trained on the scaled training data.
model_svm = SVC(kernel='linear', random_state=42).fit(X_train_scaled, y_train)

# Predicted quality labels for the scaled test rows.
y_pred_svm = model_svm.predict(X_test_scaled)

# Mean squared error between the true labels and the predicted class labels.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred_svm)
print("SVM Mean Squared Error:", mse)


SVM Mean Squared Error: 0.6153846153846154
