## Importação dos pacotes

In [83]:
# importar pacotes necessários
import numpy as np
import pandas as pd

In [84]:
# definir parâmetros extras
pd.set_option('precision', 3)
pd.set_option('display.max_columns', 100)

## Carga dos dados

In [85]:
# carregar arquivo de dados de treino
data = pd.read_csv('wine-train.csv', index_col='wine')

# mostrar alguns exemplos de registros
data.head()

Unnamed: 0_level_0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
wine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
4783,6.7,0.3,0.5,12.1,0.045,38.0,127.0,0.997,3.04,0.53,8.9,bad
2382,6.9,0.15,0.28,4.4,0.029,14.0,107.0,0.993,3.24,0.46,10.4,good
2303,6.9,0.44,0.18,11.8,0.051,26.0,126.0,0.998,3.23,0.48,9.1,bad
4254,7.5,0.29,0.24,9.9,0.058,25.0,115.0,0.996,3.15,0.46,10.9,bad
1593,8.6,0.16,0.49,7.3,0.043,9.0,63.0,0.995,3.13,0.59,10.5,bad


In [86]:
# quantas linhas e colunas existem?
data.shape

(3265, 12)

## Análise dos dados

In [87]:
# quais são as colunas e respectivos tipos de dados?
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3265 entries, 4783 to 2658
Data columns (total 12 columns):
fixed_acidity           3265 non-null float64
volatile_acidity        3265 non-null float64
citric_acid             3265 non-null float64
residual_sugar          3265 non-null float64
chlorides               3265 non-null float64
free_sulfur_dioxide     3265 non-null float64
total_sulfur_dioxide    3265 non-null float64
density                 3265 non-null float64
ph                      3265 non-null float64
sulphates               3265 non-null float64
alcohol                 3265 non-null float64
quality                 3265 non-null object
dtypes: float64(11), object(1)
memory usage: 318.8+ KB


In [88]:
# existem colunas com dados nulos?
data[data.columns[data.isnull().any()]].isnull().sum()

Series([], dtype: float64)

In [89]:
data.isna().sum()

fixed_acidity           0
volatile_acidity        0
citric_acid             0
residual_sugar          0
chlorides               0
free_sulfur_dioxide     0
total_sulfur_dioxide    0
density                 0
ph                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [90]:
# sumário estatístico das características numéricas
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fixed_acidity,3265.0,6.857,0.857,3.9,6.3,6.8,7.3,11.8
volatile_acidity,3265.0,0.278,0.101,0.08,0.21,0.26,0.32,1.005
citric_acid,3265.0,0.334,0.123,0.0,0.26,0.32,0.39,1.23
residual_sugar,3265.0,6.442,5.093,0.6,1.7,5.3,9.9,65.8
chlorides,3265.0,0.045,0.021,0.009,0.036,0.043,0.05,0.29
free_sulfur_dioxide,3265.0,35.447,17.321,2.0,23.0,34.0,46.0,289.0
total_sulfur_dioxide,3265.0,138.221,42.324,9.0,108.0,134.0,167.0,440.0
density,3265.0,0.994,0.003,0.987,0.992,0.994,0.996,1.039
ph,3265.0,3.186,0.151,2.74,3.08,3.18,3.28,3.82
sulphates,3265.0,0.489,0.114,0.22,0.41,0.47,0.55,1.08


In [91]:
# quais as correlações entre as características numéricas?
data.corr()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol
fixed_acidity,1.0,-0.013,0.293,0.099,0.01,-0.04997,0.08,0.269,-0.4382,-0.016,-0.112
volatile_acidity,-0.013,1.0,-0.152,0.07,0.051,-0.09582,0.077,0.03,-0.01996,-0.036,0.09
citric_acid,0.293,-0.152,1.0,0.097,0.146,0.08185,0.116,0.158,-0.171,0.058,-0.085
residual_sugar,0.099,0.07,0.097,1.0,0.073,0.2729,0.379,0.836,-0.1979,-0.021,-0.425
chlorides,0.01,0.051,0.146,0.073,1.0,0.09208,0.192,0.252,-0.08321,0.012,-0.378
free_sulfur_dioxide,-0.05,-0.096,0.082,0.273,0.092,1.0,0.627,0.275,0.0006917,0.045,-0.247
total_sulfur_dioxide,0.08,0.077,0.116,0.379,0.192,0.627,1.0,0.513,0.02462,0.123,-0.442
density,0.269,0.03,0.158,0.836,0.252,0.2747,0.513,1.0,-0.09447,0.076,-0.76
ph,-0.438,-0.02,-0.171,-0.198,-0.083,0.0006917,0.025,-0.094,1.0,0.148,0.119
sulphates,-0.016,-0.036,0.058,-0.021,0.012,0.04539,0.123,0.076,0.1482,1.0,-0.013


In [92]:
# show variable correlation which is more than 0.6 (positive or negative)
corr = data.corr()
corr[corr != 1][abs(corr) > 0.6].dropna(how='all', axis=1).dropna(how='all', axis=0)

Unnamed: 0,residual_sugar,free_sulfur_dioxide,total_sulfur_dioxide,density,alcohol
residual_sugar,,,,0.836,
free_sulfur_dioxide,,,0.627,,
total_sulfur_dioxide,,0.627,,,
density,0.836,,,,-0.76
alcohol,,,,-0.76,


In [93]:
data.groupby('quality').mean()

Unnamed: 0_level_0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol
quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
bad,6.889,0.281,0.337,6.724,0.048,35.795,141.8,0.994,3.179,0.486,10.263
good,6.742,0.267,0.325,5.422,0.038,34.192,125.316,0.992,3.209,0.498,11.445


In [102]:
spread = 0.5

data['outlier'] = False

for quality in ['good', 'bad']:
#for quality in np.arange(3, 10):
#if (True):

    #quality = 5
    dtqu = data[data.quality == quality].drop(['quality'], axis=1)
    
    Q1 = dtqu.quantile(0.25)
    Q3 = dtqu.quantile(0.75)
    IQR = Q3 - Q1
    
    removed_ids = ~((dtqu < (Q1 - spread * IQR)) | (dtqu > (Q3 + spread * IQR))).any(axis=1)
    df_rids = pd.DataFrame(data=removed_ids, columns=['outlier'])
    
    data.update(df_rids)
    
    #print(quality, dtqu[dtqu.outlier == True].count())
    print(quality, data[data.outlier == False].shape[0])

good 3088
bad 2630


In [103]:
removed_ids.head()

wine
4783    False
2303    False
4254    False
1593    False
4708     True
dtype: bool

In [104]:
data.head()

Unnamed: 0_level_0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality,outlier
wine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
4783,6.7,0.3,0.5,12.1,0.045,38.0,127.0,0.997,3.04,0.53,8.9,bad,False
2382,6.9,0.15,0.28,4.4,0.029,14.0,107.0,0.993,3.24,0.46,10.4,good,False
2303,6.9,0.44,0.18,11.8,0.051,26.0,126.0,0.998,3.23,0.48,9.1,bad,False
4254,7.5,0.29,0.24,9.9,0.058,25.0,115.0,0.996,3.15,0.46,10.9,bad,False
1593,8.6,0.16,0.49,7.3,0.043,9.0,63.0,0.995,3.13,0.59,10.5,bad,False


In [105]:
data[['ph','outlier']].groupby('outlier').count()

Unnamed: 0_level_0,ph
outlier,Unnamed: 1_level_1
False,2630
True,635


In [77]:
data_without_outliers = data[data.outlier == False].drop(['outlier'], axis=1)
data_without_outliers.shape

(2630, 12)

In [78]:
data_without_outliers.head()

Unnamed: 0_level_0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
wine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
4783,6.7,0.3,0.5,12.1,0.045,38.0,127.0,0.997,3.04,0.53,8.9,bad
2382,6.9,0.15,0.28,4.4,0.029,14.0,107.0,0.993,3.24,0.46,10.4,good
2303,6.9,0.44,0.18,11.8,0.051,26.0,126.0,0.998,3.23,0.48,9.1,bad
4254,7.5,0.29,0.24,9.9,0.058,25.0,115.0,0.996,3.15,0.46,10.9,bad
1593,8.6,0.16,0.49,7.3,0.043,9.0,63.0,0.995,3.13,0.59,10.5,bad


In [79]:
data_without_outliers.to_csv('wine-train-without-outliers.csv')

In [80]:
data = data_without_outliers
data.quality = data.quality.map({'good': 1, 'bad': 0})
data.head()

Unnamed: 0_level_0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
wine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
4783,6.7,0.3,0.5,12.1,0.045,38.0,127.0,0.997,3.04,0.53,8.9,0
2382,6.9,0.15,0.28,4.4,0.029,14.0,107.0,0.993,3.24,0.46,10.4,1
2303,6.9,0.44,0.18,11.8,0.051,26.0,126.0,0.998,3.23,0.48,9.1,0
4254,7.5,0.29,0.24,9.9,0.058,25.0,115.0,0.996,3.15,0.46,10.9,0
1593,8.6,0.16,0.49,7.3,0.043,9.0,63.0,0.995,3.13,0.59,10.5,0


In [81]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
print(scaler.fit(data))

data_filtered_scaled = pd.DataFrame(scaler.transform(data), columns=data.columns, index=data.index)
data_filtered_scaled.head()

MinMaxScaler(copy=True, feature_range=(0, 1))


Unnamed: 0_level_0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
wine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
4783,0.354,0.238,0.407,0.176,0.128,0.125,0.274,0.198,0.278,0.36,0.149,0.0
2382,0.38,0.076,0.228,0.058,0.071,0.042,0.227,0.123,0.463,0.279,0.397,1.0
2303,0.38,0.389,0.146,0.172,0.149,0.084,0.271,0.2,0.454,0.302,0.182,0.0
4254,0.456,0.227,0.195,0.143,0.174,0.08,0.246,0.165,0.38,0.279,0.479,0.0
1593,0.595,0.086,0.398,0.103,0.121,0.024,0.125,0.158,0.361,0.43,0.413,0.0


In [82]:
data_filtered_scaled.to_csv('wine-train-filtered-scaled.csv')