In [35]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn 
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
%matplotlib inline
import pickle


In [36]:
wine = pd.read_csv('TrainingDataset.csv', delimiter = ";")

In [37]:
wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,8.9,0.22,0.48,1.8,0.077,29.0,60.0,0.9968,3.39,0.53,9.4,6
1,7.6,0.39,0.31,2.3,0.082,23.0,71.0,0.9982,3.52,0.65,9.7,5
2,7.9,0.43,0.21,1.6,0.106,10.0,37.0,0.9966,3.17,0.91,9.5,5
3,8.5,0.49,0.11,2.3,0.084,9.0,67.0,0.9968,3.17,0.53,9.4,5
4,6.9,0.4,0.14,2.4,0.085,21.0,40.0,0.9968,3.43,0.63,9.7,6


In [38]:
wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1279 entries, 0 to 1278
Data columns (total 12 columns):
fixed acidity           1279 non-null float64
volatile acidity        1279 non-null float64
citric acid             1279 non-null float64
residual sugar          1279 non-null float64
chlorides               1279 non-null float64
free sulfur dioxide     1279 non-null float64
total sulfur dioxide    1279 non-null float64
density                 1279 non-null float64
pH                      1279 non-null float64
sulphates               1279 non-null float64
alcohol                 1279 non-null float64
quality                 1279 non-null int64
dtypes: float64(11), int64(1)
memory usage: 120.0 KB


In [39]:
print(wine['quality'].min(), wine['quality'].max())
#since the quality of wine is ranging from 3 to 8, I have reduced the label to 0 -> Good (0, 5.5), 1 -> Medium(5.5, 7.5), 2 -> Bad(7.5, 10)

(3, 8)


In [40]:
bins = [0, 5.5, 7.5, 10]
labels = [0, 1, 2]
wine['quality'] = pd.cut(wine['quality'], bins=bins, labels=labels)
wine.head()


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,8.9,0.22,0.48,1.8,0.077,29.0,60.0,0.9968,3.39,0.53,9.4,1
1,7.6,0.39,0.31,2.3,0.082,23.0,71.0,0.9982,3.52,0.65,9.7,0
2,7.9,0.43,0.21,1.6,0.106,10.0,37.0,0.9966,3.17,0.91,9.5,0
3,8.5,0.49,0.11,2.3,0.084,9.0,67.0,0.9968,3.17,0.53,9.4,0
4,6.9,0.4,0.14,2.4,0.085,21.0,40.0,0.9968,3.43,0.63,9.7,1


In [41]:
X_train = wine.drop('quality', axis = 1)
y_train = wine['quality']
 
test_data = pd.read_csv('ValidationDataset.csv', delimiter = ";")
X_test = test_data.drop('quality', axis = 1)
test_data['quality'] = pd.cut(test_data['quality'], bins=bins, labels=labels)
y_test = test_data['quality']


In [42]:
sc = StandardScaler()
# to make data scaler 

In [43]:
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)


In [44]:
rfc_model = RandomForestClassifier(n_estimators=200)
rfc_model.fit(X_train, y_train)
pred_rfc = rfc_model.predict(X_test)

In [45]:
cross_val = cross_val_score(estimator = rfc_model, X = X_train, y = y_train, cv = 10 )
print(classification_report(y_test, pred_rfc))
print("Model Accuracy is " +str(cross_val.mean() * 100))


              precision    recall  f1-score   support

           0       0.69      0.80      0.74        69
           1       0.77      0.70      0.73        87
           2       0.00      0.00      0.00         4

   micro avg       0.72      0.72      0.73       160
   macro avg       0.49      0.50      0.49       160
weighted avg       0.72      0.72      0.72       160

Model Accuracy is 72.36220472440944


In [46]:
pkl_filename = "winePredictionModel.sav"
with open(pkl_filename, 'wb') as file:
    pickle.dump(rfc_model, file)

In [47]:
#to test the model on hardcoded data 
d = {'fixed acidity': [8.9], 
     'volatile acidity': [0.22],  
     'citric acid':[0.48],  
     'residual sugar':[1.8], 
     'chlorides':[0.077], 
     'free sulfur dioxide': [29.0],
     'total sulfur dioxide': [60.0],
     'density': [0.9968],
     'pH':[3.39],
     'sulphates':[0.53],
     'alcohol':[9.4],
      }
df = pd.DataFrame(data=d)
model = pickle.load(open(pkl_filename, 'rb'))
quality = model.predict(df)
labels = ['Bad','Medium', 'Good']
print(labels[quality[0]])

Bad


array([0])