In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/data.csv")

In [None]:
# Preview the first three rows of the loaded frame.
df.head(3)

In [None]:
# Count the missing values in each column.
df.isna().sum() 

In [None]:
# Split the frame by position into a feature matrix and a target vector.
# Both selections are anchored on the last column so they cannot drift apart
# if the number of columns changes (previously the target used a fixed index).
X = df.iloc[:, :-1].values  # every column except the last: Country, Age, Salary
y = df.iloc[:, -1].values   # last column: Purchased
y

In [None]:
# Replace NaNs in the two numeric feature columns (positions 1 and 2 of X)
# with the mean of the respective column.
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [None]:
# One-hot encode the categorical column at position 0 (Country) and pass the
# remaining columns through unchanged.
# sparse_threshold=0 forces a dense array: with the default threshold (0.3) a
# Country column with many distinct values would make fit_transform return a
# SciPy sparse matrix, which the mean-centering StandardScaler applied later
# in this notebook cannot process.
column_transformer = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = column_transformer.fit_transform(X)


# Convert the 'Purchased' labels (yes/no) into integer codes (0/1).
labelencoder_Y = LabelEncoder()
y = labelencoder_Y.fit_transform(y)


In [11]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [18]:
# Standardize every feature column to mean 0 and standard deviation 1.
# The scaler's statistics are learned from the training split only and then
# applied to both splits, so the test split does not influence the scaling.
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

In [None]:
# Figure 1: purchase counts per country (left) and age spread per purchase outcome (right).
fig, (ax_country, ax_age) = plt.subplots(1, 2, figsize=(12, 5))
sns.countplot(data=df, x='Country', hue='Purchased', ax=ax_country)
ax_country.set_title('Compras por País')
sns.boxplot(data=df, x='Purchased', y='Age', ax=ax_age)
ax_age.set_title('Distribuição de Idade por Compra')
fig.tight_layout()
plt.show()

# Figure 2: Age and Salary histograms with a KDE overlay, coloured by purchase outcome.
hist_panels = (
    ('Age', 'Distribuição de Idade'),
    ('Salary', 'Distribuição de Salário'),
)
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (column, title) in zip(axes, hist_panels):
    sns.histplot(data=df, x=column, hue='Purchased', kde=True, ax=ax)
    ax.set_title(title)
fig.tight_layout()
plt.show()

In [None]:
# PCA is imported in the notebook's top import cell together with the other
# scikit-learn imports.

# Wrap the scaled training matrix in a DataFrame so pandas can compute the
# correlation matrix. The columns carry generic integer names because the
# preprocessing steps returned plain arrays.
X_train_df = pd.DataFrame(X_train)

# Correlation heatmap of the preprocessed training features
plt.figure(figsize=(8, 6))
sns.heatmap(X_train_df.corr(), annot=True, cmap='coolwarm')
plt.title('Matriz de Correlação após Pré-processamento')
plt.show()

# Project the training features onto two principal components for a 2D view
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_train)

# Scatter the two components, coloured by the encoded training labels
plt.figure(figsize=(8, 6))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=Y_train)
plt.title('Visualização PCA dos Dados Transformados')
plt.xlabel('Componente Principal 1')
plt.ylabel('Componente Principal 2')
plt.show()

In [None]:
# One box per column of the scaled training matrix, to check the spread of
# each feature after standard scaling.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=pd.DataFrame(X_train), ax=ax)
ax.set_title('Distribuição das Features após Standard Scaling')
ax.tick_params(axis='x', labelrotation=45)
plt.show()