# Standard data-science libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('dark_background')
import seaborn as sns

In [None]:
## dataset exploration
# Read the car-seat table from a CSV file located in the working directory.
csv_path = 'carseats.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first few rows of the loaded frame as a quick sanity check.
df.head( )

In [None]:
# Dimensions of the frame as a (rows, columns) tuple.
df.shape

In [None]:
# Summary statistics for the frame's columns.
df.describe()

In [None]:
# Print column names, dtypes and non-null counts.
df.info()

In [None]:
# Separate the columns by dtype: `object` columns are treated as categorical,
# every other dtype as numerical.
df_categorical = df.select_dtypes(include=["object"])
df_numerical = df.select_dtypes(exclude=["object"])
print(df_categorical.head())

# Draw one distribution figure (histogram with a KDE overlay) per numerical
# column; grid, legend and layout are applied to the figure just created.
for column_name in df_numerical.columns:
    series = df[column_name]
    sns.displot(series, color='b', kde=True, label=series.name)
    plt.grid()
    plt.legend(loc="upper right")
    plt.tight_layout()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer

# Report the number of missing values per column before any rows are dropped.
print(df.isnull().sum())

# Remove exact duplicate rows and every row that contains a missing value.
# NOTE: this cell mutates `df` in place (row drops, log transform, encoding),
# so it should be run once per fresh load of the data.
df.drop_duplicates(inplace=True)
df.dropna(inplace=True)

# NOTE(review): never filled in this cell; kept in case later cells use it.
outliers_array = []
for name in df_numerical.columns.to_list():
    # NOTE(review): the names say Q1/Q3/IQR but the cut points are the 5th and
    # 95th percentiles, which gives a much wider fence than the usual
    # 25th/75th quartiles - confirm this is intentional.
    Q1 = df[name].quantile(0.05)
    Q3 = df[name].quantile(0.95)
    IQR = Q3 - Q1
    print(f'{name} IQR = {IQR}')
    logical_index_outliers = (
        (df[name] < (Q1 - 1.5 * IQR)) | (df[name] > (Q3 + 1.5 * IQR))
    )
    # These are row *positions* within the current frame, not index labels;
    # the two differ once rows have been dropped above.
    positives = np.flatnonzero(logical_index_outliers.to_numpy())
    print(f'{name} Outlier indexes: {positives}')

# Columns the preprocessing step must drop from the feature matrix.
to_be_removed = ['Sales']

# log transform: log1p(x) computes log(1 + x) directly, so zero maps to zero
# and small values keep full precision.
df['Advertising'] = np.log1p(df['Advertising'])

# ordinal encoding: Bad < Medium < Good.
dictionary = {"ShelveLoc": {"Bad": 0, "Medium": 1, "Good": 2}}
# `map` is used on the single column (instead of a frame-wide `replace`) so
# that any label missing from the mapping becomes NaN rather than silently
# staying a string; the null count below then works as a real check.
df['ShelveLoc'] = df['ShelveLoc'].map(dictionary['ShelveLoc'])

# Expected to be 0: a non-zero count means an unmapped ShelveLoc label.
df['ShelveLoc'].isnull().sum()

In [None]:
# Column groups handled explicitly by the preprocessor.
categorical_features = ['Urban', 'US']
numeric_features = [
    'CompPrice', 'Income', 'Advertising', 'Population',
    'Price', 'Age', 'Education',
]

# Numeric columns: fill gaps with the median, then apply robust scaling.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('categorical', OneHotEncoder()),
])

# Columns in `to_be_removed` are dropped; any column not named in one of the
# three groups is passed through unchanged.
preprocessor = ColumnTransformer(
    [
        ('numeric', numeric_transformer, numeric_features),
        ('categorical', categorical_transformer, categorical_features),
        ('remove', 'drop', to_be_removed),
    ],
    remainder='passthrough',
)

# Target vector and transformed feature matrix.
y = df['Sales']
X = preprocessor.fit_transform(df)
# start training models.


