# Dependencies

In [1]:
import numpy as np
import pandas as pd

# Load

In [2]:
# Read the raw dataset from the working directory and preview five random rows.
DATASET_PATH = "dataset.csv"
df = pd.read_csv(DATASET_PATH)
df.sample(n=5)

Unnamed: 0,distributor,fixed acidity,country,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
1851,tico cat,6.1,south africa,0.34,0.46,4.7,0.029,21.0,94.0,0.991,3.29,0.62,12.3,6
2480,tico cat,6.8,south africa,0.32,0.3,1.0,0.049,22.0,113.0,0.99289,3.24,0.61,10.2,5
5381,lost watch,10.6,atlantida,0.36,0.6,2.2,0.152,7.0,18.0,0.9986,3.04,1.06,9.4,5
5511,lost watch,8.2,atlantida,0.39,0.38,1.5,0.058,10.0,29.0,0.9962,3.26,0.74,9.8,5
5065,lost watch,7.3,atlantida,0.55,0.03,1.6,0.072,17.0,42.0,0.9956,3.37,0.48,9.0,4


# Data manipulation

## 1. Groupby

In [3]:
# Mean fixed acidity and pH per distributor; reset_index() turns the group key
# back into an ordinary 'distributor' column so the frame can be merged later.
df_by_distributor = (
    df.groupby("distributor")[["fixed acidity", "pH"]]
    .mean()
    .reset_index()
)
df_by_distributor

Unnamed: 0,distributor,fixed acidity,pH
0,boutique chic,6.842359,3.201796
1,ice danone,6.917132,3.217246
2,last poet,7.327224,3.188113
3,lost watch,7.693617,3.268851
4,reine ltda,6.678153,3.14775
5,tico cat,7.01032,3.193895


## 2. Merge

In [4]:
# Attach each distributor's mean values to every row of that distributor.
# Columns coming from the left frame keep their names (suffix None); the
# overlapping columns from the aggregated frame get the " mean" suffix.
df_with_means = pd.merge(
    df,
    df_by_distributor,
    on='distributor',
    suffixes=(None, " mean"),
)
df_with_means.sample(n=5)

Unnamed: 0,distributor,fixed acidity,country,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,fixed acidity mean,pH mean
5877,lost watch,12.2,atlantida,0.45,0.49,1.4,0.075,3.0,6.0,0.9969,3.13,0.63,10.4,5,7.693617,3.268851
2486,tico cat,6.7,south africa,0.24,0.32,10.3,0.079,37.0,122.0,0.99662,3.02,0.45,8.8,5,7.01032,3.193895
41,boutique chic,6.7,portugal,0.24,0.39,2.9,0.173,63.0,157.0,0.9937,3.1,0.34,9.4,6,6.842359,3.201796
5486,lost watch,5.0,atlantida,0.42,0.24,2.0,0.06,19.0,50.0,0.9917,3.72,0.74,14.0,8,7.693617,3.268851
5407,lost watch,13.3,atlantida,0.29,0.75,2.8,0.084,23.0,43.0,0.9986,3.04,0.68,11.4,7,7.693617,3.268851


## 3. Pivot table

In [5]:
# Coerce 'alcohol' to numeric; values that cannot be parsed become NaN.
# NOTE: the coerced column is written back into `df` itself (not a copy), so
# every later cell that reads `df` sees the NaN values as well; the
# `df.dropna()` in the Machine Learning section is what removes those rows.
df['alcohol'] = pd.to_numeric(df['alcohol'], errors='coerce')

# Work on the rows that have a usable alcohol value.
df_alcohol = df.dropna(subset=['alcohol'])

# Mean alcohol for every (country, quality) pair, shown as a Series with a
# two-level index.
df_alcohol.groupby(['country', 'quality'])['alcohol'].mean()

country            quality
atlantida          3          10.004545
                   4          10.353125
                   5           9.910048
                   6          10.767227
                   7          11.568406
                   8          11.321875
canada             3           9.833333
                   4           9.866667
                   5           9.810476
                   6          10.600505
                   7          10.959375
                   8          11.626667
                   9          12.900000
greenland          3          10.925000
                   4           9.975758
                   5           9.756017
                   6          10.244984
                   7          10.916970
                   8          11.397619
                   9          12.000000
portugal           3          10.375000
                   4          10.100000
                   5           9.613514
                   6          10.037838
             

In [6]:
# Same aggregation as the groupby above, reshaped into a country x quality
# grid (pivot_table aggregates with the mean when no aggfunc is given).
# Combinations with no rows come out as NaN and are replaced by 0.0 here, so a
# 0.0 cell means "no wines for this country/quality", not "0% alcohol".
df_quality_country = (
    df_alcohol
    .pivot_table(index='country', columns='quality', values='alcohol')
    .fillna(0.)
)
df_quality_country

quality,3,4,5,6,7,8,9
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
atlantida,10.004545,10.353125,9.910048,10.767227,11.568406,11.321875,0.0
canada,9.833333,9.866667,9.810476,10.600505,10.959375,11.626667,12.9
greenland,10.925,9.975758,9.756017,10.244984,10.91697,11.397619,12.0
portugal,10.375,10.1,9.613514,10.037838,11.137113,11.95625,0.0
south africa,10.171429,10.1375,9.801429,10.583938,11.633945,11.722535,0.0
unit emirate arab,10.5,10.463636,9.934314,10.805846,11.356429,12.647059,0.0


# Machine Learning

In [7]:
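# Prerequisite: imbalanced-learn (provides the `imblearn` package imported below).
# If it is missing, install it once with:
# !conda install -c conda-forge imbalanced-learn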

In [58]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.over_sampling import RandomOverSampler

import matplotlib.pyplot as plt
import seaborn as sns
import pickle

In [9]:
# Preview five random rows of the frame before preprocessing.
df.sample(n=5)

Unnamed: 0,distributor,fixed acidity,country,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
3484,reine ltda,6.2,south africa,0.37,0.24,6.1,0.032,19.0,86.0,0.98934,3.04,0.26,13.4,8
2029,tico cat,7.6,south africa,0.34,0.39,7.6,0.04,45.0,215.0,0.9965,3.11,0.53,9.2,6
1617,tico cat,6.2,canada,0.2,0.49,1.6,0.065,17.0,143.0,0.9937,3.22,0.52,9.2,6
4353,lost watch,7.4,atlantida,0.16,0.3,1.4,0.064,34.0,166.0,0.99136,3.11,0.42,,6
1853,tico cat,8.3,south africa,0.27,0.39,2.4,0.058,16.0,107.0,0.9955,3.28,0.59,10.3,5


## 1. Preprocessing

### Drop NaN

In [57]:
# True if at least one cell anywhere in the frame is missing.
# NOTE(review): the stored output of this cell has a later execution count
# than the dropna cell below, so it appears to reflect the frame after the
# drop rather than before it -- re-run in order to confirm.
df.isna().to_numpy().any()

False

In [11]:
# Drop every row that has a missing value in any column.
#
# The 'alcohol' coercion is repeated here so this step does not depend on the
# pivot-table section having already converted the column in place; it is a
# no-op when the column is already numeric, and otherwise turns unparsable
# entries into NaN so they are dropped together with the other missing values.
#
# .copy() makes the result an independent frame, so the column assignments in
# the factorize cells that follow operate on their own data rather than on a
# slice of the pre-dropna frame (avoids SettingWithCopyWarning on pandas
# versions that track this).
df = (
    df.assign(alcohol=pd.to_numeric(df['alcohol'], errors='coerce'))
    .dropna()
    .copy()
)

### Factorize textual values

In [13]:
# Replace the textual 'distributor' values with integer codes.
# pd.factorize returns (codes, uniques): codes[i] is the position of row i's
# label inside `uniques`, so `distributor_definitions[code]` recovers the name.
factor = pd.factorize(df['distributor'])
distributor_codes, distributor_definitions = factor
df['distributor'] = distributor_codes
distributor_definitions

Index(['boutique chic', 'ice danone', 'last poet', 'tico cat', 'reine ltda',
       'lost watch'],
      dtype='object')

In [14]:
# Replace the textual 'country' values with integer codes, keeping the
# code -> name lookup in `country_definitions` (same scheme as 'distributor').
factor = pd.factorize(df['country'])
country_codes, country_definitions = factor
df['country'] = country_codes
country_definitions

Index(['portugal', 'greenland', 'canada', 'south africa', 'unit emirate arab',
       'atlantida'],
      dtype='object')

In [15]:
# Spot-check five random rows: 'distributor' and 'country' now hold integer codes.
df.sample(n=5)

Unnamed: 0,distributor,fixed acidity,country,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
26,0,6.9,0,0.24,0.35,1.0,0.052,35.0,146.0,0.993,3.45,0.44,10.0,6
156,0,7.4,0,0.24,0.42,14.0,0.066,48.0,198.0,0.9979,2.89,0.42,8.9,6
5404,5,10.4,5,0.24,0.46,1.8,0.075,6.0,21.0,0.9976,3.25,1.02,10.8,7
3619,4,6.8,4,0.45,0.28,26.05,0.031,27.0,122.0,100.295,3.06,0.42,10.6,6
1463,2,8.3,2,0.19,0.49,1.2,0.051,11.0,137.0,0.9918,3.06,0.46,11.0,6


## 2. Train/Test

In [16]:
# Positional split of the frame: the first 13 columns are the model inputs and
# column 13 is the target (presumably 'quality', the last column shown in the
# previews above -- confirm if the CSV layout changes).
feature_frame = df.iloc[:, 0:13]
target_series = df.iloc[:, 13]
X = feature_frame.values
y = target_series.values

# Row counts for target values 9 and 3.
for label in (9, 3):
    print(np.count_nonzero(y == label))
# data is highly unbalanced

5
30


### Oversampling

In [17]:
# Randomly duplicate rows of under-represented classes.
# With sampling_strategy='minority' each fit_resample call is expected to grow
# only the currently smallest class, so the sampler is applied twice; the
# counts printed below show both 9 and 3 at the same size afterwards.
#
# NOTE(review): this resampling runs on the full X, y *before* the train/test
# split in the next section, so copies of the same original row can end up in
# both the training and the test set. Scores for the oversampled classes are
# therefore likely optimistic; consider splitting first and resampling only
# the training portion.
over_sampler = RandomOverSampler(sampling_strategy='minority', random_state=42)
for resample_pass in range(2):
    X, y = over_sampler.fit_resample(X, y)

for label in (9, 3):
    print(np.count_nonzero(y == label))

2815
2815


### Split

In [18]:
# Hold out 30% of the (already resampled) rows for evaluation; the fixed seed
# keeps the partition repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

### Fit/Predict

In [19]:
# Standardise the features. The scaler is fitted on the training rows only and
# then applied unchanged to both partitions, so the test rows do not influence
# the fitted scaling parameters.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [50]:
# Random forest of 25 trees using the entropy split criterion; the seed makes
# training repeatable. fit() returns the estimator itself, which is shown as
# the cell output.
classifier = RandomForestClassifier(
    n_estimators=25,
    criterion='entropy',
    random_state=42,
).fit(X_train, y_train)
classifier

In [51]:
# Predicted label for every held-out (scaled) test row.
y_pred = classifier.predict(X=X_test)

### Evaluation

In [52]:
# Labels that occur in the test targets but were never predicted
# (an empty set means every true label appears at least once in y_pred).
set(y_test).difference(y_pred)

set()

In [53]:
# Confusion table: rows are the true labels, columns the predicted ones.
quality_crosstab = pd.crosstab(
    y_test,
    y_pred,
    rownames=['True Qualities'],
    colnames=['Predicted Qualities'],
)
print(quality_crosstab)

Predicted Qualities    3   4    5    6    7   8    9
True Qualities                                      
3                    836   0    0    0    0   0    0
4                      2  10   38   16    1   0    0
5                      1   6  444  153    6   1    0
6                      0   1  165  661   46   3    0
7                      1   0   13  170  155   3    0
8                      0   0    0   23   15  17    0
9                      0   0    0    0    0   0  829


In [54]:
# Per-class precision / recall / F1; zero_division=0 reports 0 (without a
# warning) for any metric whose denominator would be zero.
report_text = classification_report(y_test, y_pred, zero_division=0)
print(report_text)

              precision    recall  f1-score   support

           3       1.00      1.00      1.00       836
           4       0.59      0.15      0.24        67
           5       0.67      0.73      0.70       611
           6       0.65      0.75      0.70       876
           7       0.70      0.45      0.55       342
           8       0.71      0.31      0.43        55
           9       1.00      1.00      1.00       829

    accuracy                           0.82      3616
   macro avg       0.76      0.63      0.66      3616
weighted avg       0.82      0.82      0.81      3616



## 3. Serialize

In [59]:
# Persist the trained classifier to disk with pickle.
# The `with` block guarantees the file is flushed and closed even if
# pickle.dump raises (previously the handle returned by open() was never
# closed).
# NOTE(review): only `classifier` is written. It was trained on inputs
# transformed by `scaler` and on factorized 'distributor'/'country' codes, none
# of which are saved by this cell -- whoever loads the model needs them to
# prepare inputs the same way.
filename = 'vinegar.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)