In [25]:
import os
import sys
import warnings

from dotenv import load_dotenv

# 1. Load environment variables and make the project package importable.

# Load environment variables (python-dotenv reads them from a .env file).
load_dotenv()

# WORKING_DIRECTORY is expected to point at the directory that contains
# utils/dataset.py, so that `utils.dataset` can be imported further down.
working_dir = os.environ.get("WORKING_DIRECTORY")

if working_dir is None:
    # Do not push None onto sys.path: it is not a valid path entry and only
    # hides the real problem (a missing variable) behind a later import error.
    warnings.warn(
        "WORKING_DIRECTORY is not set; sys.path was left unchanged, so "
        "'utils.dataset' must already be importable from the current directory."
    )
else:
    # Keep the cell idempotent: re-running it must not stack duplicate entries,
    # but the directory still has to end up first in the search order.
    if working_dir in sys.path:
        sys.path.remove(working_dir)
    sys.path.insert(0, working_dir)

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from utils.dataset import get_data 
# Fetch the dataset through the project helper, then preview the first ten rows
# as the cell's rich output.
df = get_data()
df.head(n=10)

Loading data from wines: 8000it [00:00, 9205.59it/s]


Unnamed: 0,wine type,fixed acidity,volatile acidity,citric acid,residual sugar,magnesium,flavanoids,minerals,calcium,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,Pinot noir,5.8,0.15,0.49,1.1,76.729301,894.94,186.639301,109.91,0.048,21.0,98.0,0.9929,3.19,0.48,9.2,5
1,Merlot,6.6,0.25,0.32,5.6,4.795712,1160.95,251.875712,247.08,0.039,15.0,68.0,0.99163,2.96,0.52,11.1,6
2,Chardonnay,6.7,0.21,0.34,1.5,85.19371,789.82,304.70371,219.51,0.035,45.0,123.0,0.98949,3.24,0.36,12.6,7
3,Merlot,8.3,0.28,0.27,17.5,11.976525,777.86,237.586525,225.61,0.045,48.0,253.0,1.00014,3.02,0.56,9.1,6
4,Merlot,7.5,0.42,0.19,6.9,5.599673,785.72,95.399673,89.8,0.041,62.0,150.0,0.99508,3.23,0.37,10.0,6
5,Merlot,7.3,0.34,0.3,1.3,22.403749,1044.95,289.523749,267.12,0.057,25.0,173.0,0.9948,3.26,0.51,9.1,6
6,Merlot,7.6,0.21,0.49,2.5,23.875866,888.61,133.545866,109.67,0.047,20.0,130.0,0.99178,3.15,0.48,11.1,5
7,Chardonnay,6.0,0.25,0.4,5.7,23.309699,1381.79,266.529699,243.22,0.052,56.0,152.0,0.99398,3.16,0.88,10.5,6
8,Cabernet Sauvignon,6.7,0.18,0.19,4.7,49.165745,1456.41,269.915745,220.75,0.046,57.0,161.0,0.9946,3.32,0.66,10.5,6
9,Gamay,7.7,0.28,0.39,8.9,54.450579,929.44,377.690579,323.24,0.036,8.0,117.0,0.9935,3.06,0.38,12.0,2


In [48]:
# Preprocessing pipeline for `df`:
#   * numeric columns     -> median imputation -> standard scaling
#   * categorical columns -> most-frequent imputation -> one-hot encoding
# The transformed result is returned as a pandas DataFrame.

# All scikit-learn imports for this cell, gathered in one place.
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler

# Split the columns of df by dtype.

# Numerical features: every column with a numeric dtype.
# NOTE(review): this also selects `quality` (it shows up as `num__quality` in
# the output). If `quality` is the prediction target it should probably be
# separated before scaling - confirm against the modelling cells.
numerical_features = df.select_dtypes(include=[np.number]).columns

# Categorical features: every column with object dtype.
categorical_features = df.select_dtypes(include=['object']).columns

# Numerical pipeline: fill missing values with the column median, then
# standardise each column.
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical pipeline: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit are ignored at transform
# time, and the encoder emits a dense array.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Tell each sub-pipeline which columns it is responsible for, and ask for
# pandas output so the transformed columns keep readable names.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features),
    ]
).set_output(transform="pandas")

# Build the pipeline with its step up front instead of creating an empty
# Pipeline and appending to `pipeline.steps` afterwards.
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# NOTE(review): the pipeline is fitted on the whole of `df`. If a train/test
# split happens later, fit on the training part only to avoid leakage - confirm.
end_data = pipeline.fit_transform(df)

end_data

Unnamed: 0,num__fixed acidity,num__volatile acidity,num__citric acid,num__residual sugar,num__magnesium,num__flavanoids,num__minerals,num__calcium,num__chlorides,num__free sulfur dioxide,...,num__density,num__pH,num__sulphates,num__alcohol,num__quality,cat__wine type_Cabernet Sauvignon,cat__wine type_Chardonnay,cat__wine type_Gamay,cat__wine type_Merlot,cat__wine type_Pinot noir
0,-1.255384,-1.281148,1.290237,-1.039875,0.942709,-0.146092,-1.288574,-1.774065,0.095295,-0.831301,...,-0.378969,0.018157,-0.092029,-1.065351,-0.178350,0.0,0.0,0.0,0.0,1.0
1,-0.304898,-0.284039,-0.124109,-0.157796,-1.531736,0.837905,-0.439564,0.150910,-0.307785,-1.182793,...,-0.798974,-1.511513,0.257892,0.482767,-0.064151,0.0,0.0,0.0,1.0,0.0
2,-0.186087,-0.682883,0.042285,-0.961468,1.233876,-0.534942,0.247959,-0.235994,-0.486932,0.574668,...,-1.506699,0.350693,-1.141794,1.704966,0.050048,0.0,1.0,0.0,0.0,0.0
3,1.714885,0.015094,-0.540093,2.174811,-1.284723,-0.579183,-0.625528,-0.150389,-0.039065,0.750414,...,2.015390,-1.112469,0.607814,-1.146831,-0.064151,0.0,0.0,0.0,1.0,0.0
4,0.764399,1.411047,-1.205668,0.097026,-1.504081,-0.550108,-2.475999,-2.056279,-0.218212,1.570562,...,0.341984,0.284186,-1.054314,-0.413512,-0.064151,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,-0.542520,-0.483461,0.291875,0.293044,1.492151,0.772690,0.431308,-0.143653,0.184868,1.453398,...,0.275842,-2.176587,-0.004549,-0.983872,-0.064151,1.0,0.0,0.0,0.0,0.0
7996,0.170345,-0.583172,-0.623290,0.547866,1.564597,-0.263058,0.474464,-0.126673,-0.845225,0.106011,...,-0.584011,-0.846440,-1.316755,1.623486,0.164246,1.0,0.0,0.0,0.0,0.0
7997,0.764399,-0.184328,-0.290503,-0.353814,0.027116,-0.380134,0.978959,1.044563,-0.845225,-0.362645,...,-1.056930,-0.247873,-0.966833,1.216087,0.050048,0.0,0.0,0.0,1.0,0.0
7998,-0.661330,1.510758,-0.124109,0.469459,1.220528,1.353448,0.563062,0.109230,-0.173425,-1.007047,...,-0.769210,0.616723,-1.404235,1.949406,0.050048,0.0,1.0,0.0,0.0,0.0
