In [187]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


# Function to detect outliers using IQR
def detect_outliers_iqr(data):
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1
    # Define bounds
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Return True for outliers
    return (data < lower_bound) | (data > upper_bound)

1. Carga y Exploración de Datos

In [188]:
df = pd.read_csv('../../../data/WineQT.csv')
df.head()
#df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,2
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,3
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,4


In [189]:
# Se exploran los tipos de datos de cada columna
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1143 entries, 0 to 1142
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1143 non-null   float64
 1   volatile acidity      1143 non-null   float64
 2   citric acid           1143 non-null   float64
 3   residual sugar        1143 non-null   float64
 4   chlorides             1143 non-null   float64
 5   free sulfur dioxide   1143 non-null   float64
 6   total sulfur dioxide  1143 non-null   float64
 7   density               1143 non-null   float64
 8   pH                    1143 non-null   float64
 9   sulphates             1143 non-null   float64
 10  alcohol               1143 non-null   float64
 11  quality               1143 non-null   int64  
 12  Id                    1143 non-null   int64  
dtypes: float64(11), int64(2)
memory usage: 116.2 KB


In [190]:
# se exploran las caracteristicas descriptivas de los datos
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fixed acidity,1143.0,8.311111,1.747595,4.6,7.1,7.9,9.1,15.9
volatile acidity,1143.0,0.531339,0.179633,0.12,0.3925,0.52,0.64,1.58
citric acid,1143.0,0.268364,0.196686,0.0,0.09,0.25,0.42,1.0
residual sugar,1143.0,2.532152,1.355917,0.9,1.9,2.2,2.6,15.5
chlorides,1143.0,0.086933,0.047267,0.012,0.07,0.079,0.09,0.611
free sulfur dioxide,1143.0,15.615486,10.250486,1.0,7.0,13.0,21.0,68.0
total sulfur dioxide,1143.0,45.914698,32.78213,6.0,21.0,37.0,61.0,289.0
density,1143.0,0.99673,0.001925,0.99007,0.99557,0.99668,0.997845,1.00369
pH,1143.0,3.311015,0.156664,2.74,3.205,3.31,3.4,4.01
sulphates,1143.0,0.657708,0.170399,0.33,0.55,0.62,0.73,2.0


In [191]:
#Tratamiento de valores nulos
qsna=df.shape[0]-df.isnull().sum(axis=0)
qna=df.isnull().sum(axis=0)
ppna=round(100*(df.isnull().sum(axis=0)/df.shape[0]),2)
aux= {'datos sin NAs en q': qsna, 'Na en q': qna ,'Na en %': ppna}
na=pd.DataFrame(data=aux)
na.sort_values(by='Na en %',ascending=False)


Unnamed: 0,datos sin NAs en q,Na en q,Na en %
fixed acidity,1143,0,0.0
volatile acidity,1143,0,0.0
citric acid,1143,0,0.0
residual sugar,1143,0,0.0
chlorides,1143,0,0.0
free sulfur dioxide,1143,0,0.0
total sulfur dioxide,1143,0,0.0
density,1143,0,0.0
pH,1143,0,0.0
sulphates,1143,0,0.0


* Deteccion de outliers

In [192]:
# Deteccion usando metodo Z-Score

numeric_cols = df.drop(['quality', 'Id'], axis=1).columns

print("Detección de Outliers usando Z-score:")
for column in numeric_cols:
    z_scores = np.abs(stats.zscore(df[column]))
    outliers = df[z_scores > 3][column]  # Consideramos outliers los valores con |z| > 3
    if len(outliers) > 0:
        print(f"\nOutliers en {column}:")
        print(f"Número de outliers: {len(outliers)}")
        print(f"Valores outlier: {outliers.values}")

Detección de Outliers usando Z-score:

Outliers en fixed acidity:
Número de outliers: 9
Valores outlier: [15.  15.  13.8 13.7 15.6 14.3 15.5 15.6 15.9]

Outliers en volatile acidity:
Número de outliers: 5
Valores outlier: [1.33 1.33 1.09 1.58 1.18]

Outliers en citric acid:
Número de outliers: 1
Valores outlier: [1.]

Outliers en residual sugar:
Número de outliers: 23
Valores outlier: [ 7.3  7.2 11.  11.   7.9  7.9  6.7  6.6 15.5  8.3  7.9  8.6  7.5  6.6
  9.   8.8  8.1  8.3  8.3 15.4 13.8 13.8  6.7]

Outliers en chlorides:
Número de outliers: 21
Valores outlier: [0.341 0.332 0.467 0.61  0.27  0.337 0.263 0.611 0.358 0.25  0.422 0.387
 0.415 0.241 0.414 0.403 0.414 0.415 0.415 0.235 0.23 ]

Outliers en free sulfur dioxide:
Número de outliers: 13
Valores outlier: [68. 68. 53. 52. 51. 48. 48. 51. 52. 55. 48. 48. 66.]

Outliers en total sulfur dioxide:
Número de outliers: 10
Valores outlier: [165. 151. 149. 147. 145. 148. 152. 278. 289. 147.]

Outliers en density:
Número de outliers: 12
V

* Tratamiento de outliers

In [193]:
# Se usa el metodo clipping (Recortar en vez de eliminar)

df_cleaned = df.copy()
numeric_cols = df.drop(['quality', 'Id'], axis=1).columns
for column in numeric_cols:
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    df_cleaned[column] = df_cleaned[column].clip(lower=lower_bound, upper=upper_bound)

print(f"Registros originales: {len(df)}")
print(f"Registros después del tratamiento: {len(df_cleaned)}")

Registros originales: 1143
Registros después del tratamiento: 1143


* Preprocesamiento y escalada

In [194]:
# Separacion de caracteristicas y variables objetivo

X = df.drop(['quality', 'Id'], axis=1)
y = df['quality']

#Division de datos
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Escalamiento de características
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)



* Entrenamiento

In [None]:
# KNN
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# Realizar predicciones
y_pred = knn.predict(X_test)

# Evaluar el modelo
accuracy = accuracy_score(y_test, y_pred)
print(f'Exactitud: {accuracy}')
print('Informe de Clasificación:')
print(classification_report(y_test, y_pred))

# Matriz de Confusión
conf_matrix = confusion_matrix(y_test, y_pred)
print('Matriz de Confusión:')
print(conf_matrix)


model2 = RandomForestClassifier(random_state=1)
model2.fit(X_train, Y_train)
y_pred2 = model2.predict(X_test)

from sklearn.metrics import accuracy_score
print("Accuracy Score:",accuracy_score(Y_test,y_pred2))

Exactitud: 0.5021834061135371
Informe de Clasificación:
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         6
           5       0.55      0.66      0.60        96
           6       0.51      0.41      0.46        99
           7       0.44      0.42      0.43        26
           8       0.00      0.00      0.00         2

    accuracy                           0.50       229
   macro avg       0.25      0.25      0.25       229
weighted avg       0.50      0.50      0.50       229

Matriz de Confusión:
[[ 0  0  0  0  0  0]
 [ 0  0  3  2  1  0]
 [ 0  3 63 29  1  0]
 [ 0  5 41 41 12  0]
 [ 1  0  6  7 11  1]
 [ 0  0  1  1  0  0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
