# Dependencies

In [1]:
import numpy as np
import pandas as pd

# Load

In [2]:
df = pd.read_csv("dataset.csv")
df.sample(5)

Unnamed: 0,distributor,fixed acidity,country,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
6386,lost watch,5.6,atlantida,0.54,0.04,1.7,0.049,5.0,13.0,0.9942,3.72,0.58,11.4,5
5078,lost watch,8.8,atlantida,0.61,0.14,2.4,0.067,10.0,42.0,0.9969,3.19,0.59,9.5,5
3901,reine ltda,4.8,unit emirate arab,0.65,0.12,1.1,0.013,4.0,10.0,0.99246,3.32,0.36,13.5,4
3346,tico cat,6.7,south africa,0.18,0.24,10.3,0.057,64.0,185.0,0.99519,3.12,0.5,10.6,6
3992,reine ltda,6.7,unit emirate arab,0.19,0.32,3.7,0.041,26.0,76.0,0.99173,2.9,0.57,10.5,7


# Manipulação de dados

## 1. Groupby

In [3]:
df_by_distributor = df.groupby("distributor")[["fixed acidity", "pH"]].mean()
df_by_distributor = df_by_distributor.reset_index()
df_by_distributor

Unnamed: 0,distributor,fixed acidity,pH
0,boutique chic,6.842359,3.201796
1,ice danone,6.917132,3.217246
2,last poet,7.327224,3.188113
3,lost watch,7.693617,3.268851
4,reine ltda,6.678153,3.14775
5,tico cat,7.01032,3.193895


## 2. Merge

In [4]:
df_with_means = df.merge(df_by_distributor, left_on='distributor', right_on='distributor', suffixes=(None, " mean"))
df_with_means.sample(5)

Unnamed: 0,distributor,fixed acidity,country,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,fixed acidity mean,pH mean
543,ice danone,6.7,portugal,0.24,0.41,8.7,0.036,29.0,148.0,0.9952,3.22,0.62,9.9,6,6.917132,3.217246
1362,last poet,6.4,greenland,0.25,0.33,1.4,0.04,42.0,115.0,0.9906,3.19,0.48,11.3,7,7.327224,3.188113
2334,tico cat,7.5,south africa,0.23,0.35,17.8,0.058,128.0,212.0,100.241,3.44,0.43,8.9,5,7.01032,3.193895
4991,lost watch,7.7,atlantida,0.49,0.26,1.9,0.062,9.0,31.0,0.9966,3.39,0.64,9.6,5,7.693617,3.268851
1001,ice danone,6.4,greenland,0.28,0.41,6.8,0.045,61.0,216.0,0.9952,3.09,0.46,9.4,5,6.917132,3.217246


## 3. Pivot table

In [5]:
df_alcohol = df
df_alcohol['alcohol'] = pd.to_numeric(df_alcohol['alcohol'], errors='coerce')
df_alcohol = df_alcohol.dropna(subset=['alcohol'])
df_alcohol.groupby(['country', 'quality'])['alcohol'].mean()

country            quality
atlantida          3          10.004545
                   4          10.353125
                   5           9.910048
                   6          10.767227
                   7          11.568406
                   8          11.321875
canada             3           9.833333
                   4           9.866667
                   5           9.810476
                   6          10.600505
                   7          10.959375
                   8          11.626667
                   9          12.900000
greenland          3          10.925000
                   4           9.975758
                   5           9.756017
                   6          10.244984
                   7          10.916970
                   8          11.397619
                   9          12.000000
portugal           3          10.375000
                   4          10.100000
                   5           9.613514
                   6          10.037838
             

In [6]:
df_quality_country = df_alcohol.pivot_table(index='country', columns='quality', values='alcohol') # aggfunc = np.mean() by default
df_quality_country = df_quality_country.fillna(0.)
df_quality_country

quality,3,4,5,6,7,8,9
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
atlantida,10.004545,10.353125,9.910048,10.767227,11.568406,11.321875,0.0
canada,9.833333,9.866667,9.810476,10.600505,10.959375,11.626667,12.9
greenland,10.925,9.975758,9.756017,10.244984,10.91697,11.397619,12.0
portugal,10.375,10.1,9.613514,10.037838,11.137113,11.95625,0.0
south africa,10.171429,10.1375,9.801429,10.583938,11.633945,11.722535,0.0
unit emirate arab,10.5,10.463636,9.934314,10.805846,11.356429,12.647059,0.0


# Machine Learning

In [7]:
# !conda install -c conda-forge imbalanced-learn

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.over_sampling import RandomOverSampler

import matplotlib.pyplot as plt
import seaborn as sns
import pickle

In [9]:
df.sample(5)

Unnamed: 0,distributor,fixed acidity,country,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
548,ice danone,6.5,portugal,0.18,0.31,1.7,0.044,30.0,127.0,0.9928,3.49,0.5,10.2,7
5510,lost watch,7.5,atlantida,0.64,0.0,2.4,0.077,18.0,29.0,0.9965,3.32,0.6,10.0,6
4555,lost watch,6.8,atlantida,0.45,0.3,11.8,0.094,23.0,97.0,0.997,3.09,0.44,9.6,5
2520,tico cat,7.5,south africa,0.41,0.23,14.8,0.054,28.0,174.0,0.99898,3.18,0.49,9.7,5
4586,lost watch,5.4,atlantida,0.29,0.38,1.2,0.029,31.0,132.0,0.98895,3.28,0.36,12.4,6


## 1. Preprocessing

### Drop NaN

In [10]:
df.isnull().values.any()

True

In [11]:
df = df.dropna()

### Factorize textual values

In [12]:
factor = pd.factorize(df['distributor'])
df['distributor'] = factor[0]
distributor_definitions = factor[1]
distributor_definitions

Index(['boutique chic', 'ice danone', 'last poet', 'tico cat', 'reine ltda',
       'lost watch'],
      dtype='object')

In [13]:
factor = pd.factorize(df['country'])
df['country'] = factor[0]
country_definitions = factor[1]
country_definitions

Index(['portugal', 'greenland', 'canada', 'south africa', 'unit emirate arab',
       'atlantida'],
      dtype='object')

In [14]:
df.sample(5)

Unnamed: 0,distributor,fixed acidity,country,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
1356,2,7.3,1,0.22,0.41,15.4,0.05,55.0,191.0,1.0,3.32,0.59,8.9,6
338,0,5.9,0,0.26,0.4,1.3,0.047,12.0,139.0,0.9945,3.45,0.53,10.4,5
1095,1,5.3,1,0.24,0.33,1.3,0.033,25.0,97.0,0.9906,3.59,0.38,11.0,8
1014,1,6.1,1,0.16,0.27,12.6,0.064,63.0,162.0,0.9994,3.66,0.43,8.9,5
6059,5,8.8,5,0.45,0.43,1.4,0.076,12.0,21.0,0.99551,3.21,0.75,10.2,6


## 2. Train/Test

In [15]:
X = df.iloc[:,0:13].values
y = df.iloc[:,13].values
print(np.count_nonzero(y == 9))
print(np.count_nonzero(y == 3))
# data is highly unbalanced

5
30


### Oversampling

In [16]:
over_sampler = RandomOverSampler(sampling_strategy='minority', random_state=42)
X, y = over_sampler.fit_resample(X, y)
X, y = over_sampler.fit_resample(X, y)
print(np.count_nonzero(y == 9))
print(np.count_nonzero(y == 3))

2815
2815


### Split

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 42)

### Fit/Predict

In [18]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [19]:
classifier = RandomForestClassifier(n_estimators = 25, criterion = 'entropy', random_state = 42)
classifier.fit(X_train, y_train)

In [20]:
y_pred = classifier.predict(X_test)

### Evaluation

In [21]:
set(y_test) - set(y_pred) # is there any data present in test, but missing in pred?

set()

In [22]:
print(pd.crosstab(y_test, y_pred, rownames=['True Qualities'], colnames=['Predicted Qualities']))

Predicted Qualities    3   4    5    6    7   8    9
True Qualities                                      
3                    836   0    0    0    0   0    0
4                      2  10   38   16    1   0    0
5                      1   6  444  153    6   1    0
6                      0   1  165  661   46   3    0
7                      1   0   13  170  155   3    0
8                      0   0    0   23   15  17    0
9                      0   0    0    0    0   0  829


In [23]:
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           3       1.00      1.00      1.00       836
           4       0.59      0.15      0.24        67
           5       0.67      0.73      0.70       611
           6       0.65      0.75      0.70       876
           7       0.70      0.45      0.55       342
           8       0.71      0.31      0.43        55
           9       1.00      1.00      1.00       829

    accuracy                           0.82      3616
   macro avg       0.76      0.63      0.66      3616
weighted avg       0.82      0.82      0.81      3616



## 3. Serialize

In [24]:
filename = 'vinegar.pkl'
pickle.dump(classifier, open(filename, 'wb'))