# Lab 4: Data Quality Assessment & Preprocessing
## Task 1: Identify Data Quality Issues
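Correcting the data types of the `Date` and `Amount` columns: `Date` is parsed as a day-first datetime, and `Amount` is converted to float after stripping the `$` and `,` characters.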

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA

# Use seaborn's white-grid theme for every plot in this notebook.
sns.set(style='whitegrid')

# Load the raw sales data.
df = pd.read_csv('Chocolate_Sales.csv')

# Parse Date as a datetime, reading ambiguous values day-first.
parsed_dates = pd.to_datetime(df['Date'], dayfirst=True)
df['Date'] = parsed_dates

# Remove '$' and ',' from Amount, then cast the cleaned values to float.
amount_without_symbols = df['Amount'].replace(r'[\$,]', '', regex=True)
df['Amount'] = amount_without_symbols.astype(float)

# Show the resulting column dtypes to confirm the conversions.
print(df.dtypes)

## Task 2: Missing Value Strategy
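To demonstrate the strategy, the `Amount` values in rows 0–5 of a copy of the data are set to missing. They are then filled using **Median Imputation**, because the median is robust to outliers.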

In [None]:
# Work on a copy so the simulated gaps never touch df.
df_missing = df.copy()

# Blank out Amount for index labels 0 through 5 (.loc slices include the end label).
df_missing.loc[0:5, 'Amount'] = np.nan

# Fill the gaps with the median of the remaining Amount values.
amount_median = df_missing['Amount'].median()
df_missing['Amount'] = df_missing['Amount'].fillna(amount_median)
print('Missing values handled.')

## Task 3: Outlier Handling
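Detecting outliers in `Amount` visually with a box plot, then filtering them with the IQR method: only rows whose `Amount` lies within $[Q_1 - 1.5\,\mathrm{IQR},\; Q_3 + 1.5\,\mathrm{IQR}]$ are kept in `df_no_outliers`.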

In [None]:
# Box plot of Amount for a visual check of extreme values.
fig, ax = plt.subplots(figsize=(6,4))
sns.boxplot(x=df['Amount'], ax=ax)
ax.set_title('Outliers Detection')
plt.show()

# Quartiles and interquartile range of Amount.
Q1 = df['Amount'].quantile(0.25)
Q3 = df['Amount'].quantile(0.75)
IQR = Q3 - Q1

# Keep only rows whose Amount lies within 1.5 * IQR of the quartiles
# (both bounds inclusive); df itself is left unchanged.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
df_no_outliers = df[df['Amount'].between(lower_fence, upper_fence)]

## Task 4: Normalization & Standardization
Applying Min-Max and Z-score scaling.

In [None]:
# Both scalers are fitted on the same two numeric columns.
scale_cols = ['Amount', 'Boxes Shipped']

# Min-Max normalization: rescales each column to the [0, 1] range.
# The task calls for this step and MinMaxScaler is imported, but it was never applied.
minmax_scaler = MinMaxScaler()
df_minmax = minmax_scaler.fit_transform(df[scale_cols])

# Z-score standardization: centers each column at mean 0 with unit variance.
# df_std keeps its original name because the PCA step consumes it.
std_scaler = StandardScaler()
df_std = std_scaler.fit_transform(df[scale_cols])
print('Data Scaled Successfully.')

## Task 5: PCA Application
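Applying PCA to the standardized `Amount` and `Boxes Shipped` features. Note that with only two input features and `n_components=2`, PCA rotates the data onto its principal axes rather than actually reducing its dimensionality; the explained variance ratio shows how much of the variance each axis captures.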

In [None]:
# Fit PCA on the standardized features and project them onto two components.
# NOTE: df_std is built from two columns in the scaling step, so two components
# retain all of the variance; this is a rotation, not a reduction.
pca = PCA(n_components=2)
pcs = pca.fit_transform(df_std)

# Share of total variance captured by each principal component.
variance_ratio = pca.explained_variance_ratio_
print('Explained Variance Ratio:', variance_ratio)

# Scatter the observations in principal-component space.
fig, ax = plt.subplots(figsize=(6,4))
first_pc, second_pc = pcs[:, 0], pcs[:, 1]
ax.scatter(first_pc, second_pc, alpha=0.5)
ax.set_title('PCA Projection')
plt.show()