## Importação dos pacotes

In [1]:
# importar pacotes necessários
import numpy as np
import pandas as pd

In [2]:
# definir parâmetros extras
pd.set_option('precision', 3)
pd.set_option('display.max_columns', 100)

## Carga dos dados

In [3]:
# carregar arquivo de dados de treino
data = pd.read_csv('wine-train.csv', index_col='wine')

# mostrar alguns exemplos de registros
data.head()

Unnamed: 0_level_0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
wine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2169,7.4,0.19,0.3,12.8,0.053,48.5,229.0,0.999,3.14,0.49,9.1,good
1382,6.6,0.56,0.16,3.1,0.045,28.0,92.0,0.994,3.12,0.35,9.1,bad
3346,6.7,0.18,0.24,10.3,0.057,64.0,185.0,0.995,3.12,0.5,10.6,bad
3308,6.4,0.35,0.28,12.6,0.039,19.0,124.0,0.995,3.2,0.43,10.6,bad
3167,5.6,0.28,0.4,6.1,0.034,36.0,118.0,0.991,3.21,0.43,12.1,good


In [4]:
# quantas linhas e colunas existem?
data.shape

(3265, 12)

## Análise dos dados

In [5]:
# quais são as colunas e respectivos tipos de dados?
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3265 entries, 2169 to 4278
Data columns (total 12 columns):
fixed_acidity           3265 non-null float64
volatile_acidity        3265 non-null float64
citric_acid             3265 non-null float64
residual_sugar          3265 non-null float64
chlorides               3265 non-null float64
free_sulfur_dioxide     3265 non-null float64
total_sulfur_dioxide    3265 non-null float64
density                 3265 non-null float64
ph                      3265 non-null float64
sulphates               3265 non-null float64
alcohol                 3265 non-null float64
quality                 3265 non-null object
dtypes: float64(11), object(1)
memory usage: 318.8+ KB


In [6]:
# existem colunas com dados nulos?
data[data.columns[data.isnull().any()]].isnull().sum()

Series([], dtype: float64)

In [7]:
data.isna().sum()

fixed_acidity           0
volatile_acidity        0
citric_acid             0
residual_sugar          0
chlorides               0
free_sulfur_dioxide     0
total_sulfur_dioxide    0
density                 0
ph                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [8]:
# sumário estatístico das características numéricas
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fixed_acidity,3265.0,6.859,0.856,3.9,6.3,6.8,7.3,14.2
volatile_acidity,3265.0,0.277,0.1,0.08,0.21,0.26,0.32,1.1
citric_acid,3265.0,0.334,0.119,0.0,0.27,0.32,0.38,1.66
residual_sugar,3265.0,6.432,4.997,0.6,1.7,5.3,9.85,31.6
chlorides,3265.0,0.046,0.022,0.009,0.036,0.043,0.05,0.346
free_sulfur_dioxide,3265.0,35.656,17.437,2.0,24.0,34.0,46.0,289.0
total_sulfur_dioxide,3265.0,139.235,42.261,10.0,109.0,135.0,168.0,440.0
density,3265.0,0.994,0.003,0.987,0.992,0.994,0.996,1.01
ph,3265.0,3.19,0.149,2.72,3.09,3.18,3.28,3.8
sulphates,3265.0,0.492,0.114,0.22,0.41,0.48,0.55,1.01


In [9]:
# quais as correlações entre as características numéricas?
data.corr()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol
fixed_acidity,1.0,-0.012,0.287,0.089,0.018,-0.036,0.112,0.281,-0.429,-0.026,-0.133
volatile_acidity,-0.012,1.0,-0.146,0.045,0.082,-0.097,0.089,0.008,-0.03,-0.033,0.062
citric_acid,0.287,-0.146,1.0,0.089,0.076,0.087,0.104,0.133,-0.162,0.078,-0.045
residual_sugar,0.089,0.045,0.089,1.0,0.109,0.29,0.399,0.833,-0.195,-0.039,-0.46
chlorides,0.018,0.082,0.076,0.109,1.0,0.106,0.204,0.272,-0.094,0.018,-0.363
free_sulfur_dioxide,-0.036,-0.097,0.087,0.29,0.106,1.0,0.609,0.283,-0.003,0.063,-0.221
total_sulfur_dioxide,0.112,0.089,0.104,0.399,0.204,0.609,1.0,0.537,-0.009,0.134,-0.445
density,0.281,0.008,0.133,0.833,0.272,0.283,0.537,1.0,-0.108,0.063,-0.803
ph,-0.429,-0.03,-0.162,-0.195,-0.094,-0.003,-0.009,-0.108,1.0,0.162,0.132
sulphates,-0.026,-0.033,0.078,-0.039,0.018,0.063,0.134,0.063,0.162,1.0,-0.02


In [10]:
# show variable correlation which is more than 0.6 (positive or negative)
corr = data.corr()
corr[corr != 1][abs(corr) > 0.7].dropna(how='all', axis=1).dropna(how='all', axis=0)

Unnamed: 0,residual_sugar,density,alcohol
residual_sugar,,0.833,
density,0.833,,-0.803
alcohol,,-0.803,


In [11]:
data.groupby('quality').mean()

Unnamed: 0_level_0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol
quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
bad,6.892,0.281,0.336,6.75,0.048,35.855,142.833,0.994,3.183,0.489,10.257
good,6.74,0.263,0.328,5.296,0.038,34.947,126.379,0.992,3.217,0.5,11.409


In [12]:
spread = 1.5

data['outlier'] = False

for quality in ['good', 'bad']:
#for quality in np.arange(3, 10):
#if (True):

    #quality = 5
    dtqu = data[data.quality == quality].drop(['quality'], axis=1)
    
    Q1 = dtqu.quantile(0.25)
    Q3 = dtqu.quantile(0.75)
    IQR = Q3 - Q1
    
    removed_ids = ~((dtqu < (Q1 - spread * IQR)) | (dtqu > (Q3 + spread * IQR))).any(axis=1)
    df_rids = pd.DataFrame(data=removed_ids, columns=['outlier'])
    
    data.update(df_rids)
    
    #print(quality, dtqu[dtqu.outlier == True].count())
    print(quality, data[data.outlier == False].shape[0])

good 2691
bad 620


In [13]:
removed_ids.head()

wine
1382    False
3346     True
3308     True
1777     True
4721     True
dtype: bool

In [14]:
data.head()

Unnamed: 0_level_0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality,outlier
wine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2169,7.4,0.19,0.3,12.8,0.053,48.5,229.0,0.999,3.14,0.49,9.1,good,False
1382,6.6,0.56,0.16,3.1,0.045,28.0,92.0,0.994,3.12,0.35,9.1,bad,False
3346,6.7,0.18,0.24,10.3,0.057,64.0,185.0,0.995,3.12,0.5,10.6,bad,True
3308,6.4,0.35,0.28,12.6,0.039,19.0,124.0,0.995,3.2,0.43,10.6,bad,True
3167,5.6,0.28,0.4,6.1,0.034,36.0,118.0,0.991,3.21,0.43,12.1,good,True


In [15]:
data[['ph','outlier']].groupby('outlier').count()

Unnamed: 0_level_0,ph
outlier,Unnamed: 1_level_1
False,620
True,2645


In [16]:
data_without_outliers = data[data.outlier == False].drop(['outlier'], axis=1)
data_without_outliers.shape

(620, 12)

In [17]:
data_without_outliers.head()

Unnamed: 0_level_0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
wine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2169,7.4,0.19,0.3,12.8,0.053,48.5,229.0,0.999,3.14,0.49,9.1,good
1382,6.6,0.56,0.16,3.1,0.045,28.0,92.0,0.994,3.12,0.35,9.1,bad
3755,7.8,0.19,0.32,7.4,0.015,47.0,124.0,0.993,2.99,0.39,11.0,bad
1645,7.5,0.24,0.49,9.4,0.048,50.0,149.0,0.996,3.17,0.59,10.5,good
3379,7.6,0.36,0.49,11.3,0.046,87.0,221.0,0.998,3.01,0.43,9.2,bad


In [18]:
data_without_outliers.to_csv('wine-train-without-outliers.csv')

In [19]:
data = data_without_outliers
data.quality = data.quality.map({'good': 1, 'bad': 0})
data.head()

Unnamed: 0_level_0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
wine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2169,7.4,0.19,0.3,12.8,0.053,48.5,229.0,0.999,3.14,0.49,9.1,1
1382,6.6,0.56,0.16,3.1,0.045,28.0,92.0,0.994,3.12,0.35,9.1,0
3755,7.8,0.19,0.32,7.4,0.015,47.0,124.0,0.993,2.99,0.39,11.0,0
1645,7.5,0.24,0.49,9.4,0.048,50.0,149.0,0.996,3.17,0.59,10.5,1
3379,7.6,0.36,0.49,11.3,0.046,87.0,221.0,0.998,3.01,0.43,9.2,0


In [20]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
print(scaler.fit(data))

data_filtered_scaled = pd.DataFrame(scaler.transform(data), columns=data.columns, index=data.index)
data_filtered_scaled.head()

MinMaxScaler(copy=True, feature_range=(0, 1))


Unnamed: 0_level_0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
wine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2169,0.34,0.09,0.181,0.39,0.131,0.159,0.509,0.489,0.389,0.316,0.108,1.0
1382,0.262,0.46,0.096,0.075,0.107,0.087,0.191,0.288,0.37,0.132,0.108,0.0
3755,0.379,0.09,0.193,0.214,0.018,0.154,0.265,0.235,0.25,0.184,0.45,0.0
1645,0.35,0.14,0.295,0.279,0.116,0.164,0.323,0.384,0.417,0.447,0.36,1.0
3379,0.359,0.26,0.295,0.341,0.11,0.294,0.491,0.48,0.269,0.237,0.126,0.0


In [21]:
data_filtered_scaled.to_csv('wine-train-filtered-scaled.csv')