# Wine Quality Analysis

This Jupyter notebook replicates the analysis from `Wine_Quality_Analysis.R`. It fetches, cleans, and visualizes the UCI wine quality datasets (red and white), then runs a principal component analysis (PCA) and fits a linear regression model of quality.

## 1. Install & load packages

In [None]:
# Packages the notebook depends on; any that are missing are installed first.
pkgs <- c("tidyverse", "GGally", "corrplot", "factoextra")

# install.packages() stops with "trying to use CRAN without setting a mirror"
# when no mirror is configured and the session is non-interactive (common for
# notebook kernels). Keep a mirror the user already configured; only fill in
# the CRAN entry when it is missing or still the "@CRAN@" placeholder.
repos <- getOption("repos")
cran_mirror <- unname(repos["CRAN"])
if (length(cran_mirror) != 1L || is.na(cran_mirror) ||
    cran_mirror == "@CRAN@") {
  repos["CRAN"] <- "https://cloud.r-project.org"
}

for (pkg in pkgs) {
  # requireNamespace() only checks availability without attaching anything;
  # library() then attaches the package and errors if the install failed.
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, repos = repos)
  }
  library(pkg, character.only = TRUE)
}

## 2. Read & prepare data

In [None]:
# Locations of the two semicolon-delimited CSV files (one per wine colour).
url_red <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
url_white <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"

# Explicit column types shared by both files: every measurement is parsed as a
# double, and only the quality score is parsed as an integer.
wine_spec <- cols(
  `fixed acidity` = col_double(),
  `volatile acidity` = col_double(),
  `citric acid` = col_double(),
  `residual sugar` = col_double(),
  chlorides = col_double(),
  `free sulfur dioxide` = col_double(),
  `total sulfur dioxide` = col_double(),
  density = col_double(),
  pH = col_double(),
  sulphates = col_double(),
  alcohol = col_double(),
  quality = col_integer()
)

# Parse each file with the same settings, then tag its rows with the colour so
# the source is still known after the two tables are stacked.
red <- mutate(
  read_delim(
    url_red,
    delim = ";",
    col_types = wine_spec,
    locale = locale(decimal_mark = ".")
  ),
  type = "red"
)
white <- mutate(
  read_delim(
    url_white,
    delim = ";",
    col_types = wine_spec,
    locale = locale(decimal_mark = ".")
  ),
  type = "white"
)

# Stack red on top of white; `type` becomes a factor and `quality` an ordered
# factor so that plots treat the score as discrete, ranked categories.
wine <- bind_rows(red, white)
wine <- mutate(
  wine,
  type = factor(type),
  quality = factor(quality, ordered = TRUE)
)

# Quick sanity check on the size of the combined table and the colour split.
cat("Total observations:", nrow(wine), "\n")
cat("Red / White counts:\n")
print(table(wine$type))

## 3. Exploratory Plots

In [None]:
# 3.1 Quality counts by type
# One bar per quality level and wine type, drawn side by side ("dodge") rather
# than stacked.
p1 <- ggplot(data = wine, mapping = aes(x = quality, fill = type)) +
  geom_bar(position = "dodge") +
  theme_minimal() +
  labs(title = "Wine Quality Counts by Type", x = "Quality", y = "Count")
print(p1)

# 3.2 Alcohol vs. Quality
# Quality is discrete, so points are jittered vertically only (width = 0):
# overlapping wines become visible while alcohol values stay exact.
p2 <- ggplot(
  data = wine,
  mapping = aes(x = alcohol, y = quality, colour = type)
) +
  geom_jitter(alpha = 0.5, width = 0, height = 0.2) +
  theme_minimal() +
  labs(
    title = "Alcohol Content vs. Quality",
    x = "Alcohol (%)",
    y = "Quality"
  )
print(p2)

# 3.3 Boxplots of selected chemical attributes by quality
attrs <- c("pH", "residual sugar", "citric acid", "sulphates")

# Reshape to long form (one row per wine and attribute) so a single ggplot call
# can draw one facet per attribute.
wine_long <- pivot_longer(
  wine,
  cols = all_of(attrs),
  names_to = "attribute",
  values_to = "value"
)

# The attributes are on different scales, hence an independent y axis per
# facet. The fill legend would only repeat the x axis, so it is hidden.
p3 <- ggplot(wine_long, aes(x = quality, y = value, fill = quality)) +
  geom_boxplot() +
  facet_wrap(~attribute, scales = "free_y") +
  labs(title = "Chemical Attributes by Wine Quality") +
  theme_minimal() +
  theme(legend.position = "none")
print(p3)

# 3.4 Correlation heatmap
# `nums` holds the numeric measurement columns and is reused by the PCA cell.
# `num_quality` and `fitted` are added to `wine` by the later modelling cells;
# they are excluded explicitly so that re-running this cell after those cells
# does not leak derived columns into the heatmap or the PCA. any_of() ignores
# the names when the columns do not exist yet.
nums <- wine %>%
  select(where(is.numeric), -any_of(c("num_quality", "fitted")))

# Pairwise-complete correlations: each pair of columns uses every row where
# both values are present.
corr <- cor(nums, use = "pairwise.complete.obs")
corrplot(
  corr,
  method = "color",
  type = "upper",
  tl.col = "black",
  tl.cex = 0.8
)

## 4. PCA

In [None]:
# Standardize every column of `nums` (mean 0, sd 1) so that variables measured
# on large scales do not dominate the principal components.
scaled <- scale(nums)

# The input is already standardized, so prcomp() is told not to rescale it;
# centering again only removes floating-point residue.
res.pca <- prcomp(scaled, center = TRUE, scale. = FALSE)

# Plots are printed explicitly, like the other figures in this notebook, so
# they also render when the code is run through source().
print(
  fviz_screeplot(res.pca, addlabels = TRUE, title = "PCA: Variance Explained")
)

# label = "var" keeps text labels for the variable arrows only. The default
# ("all") also draws and repels one text label per row of `nums`, which is
# unreadable and very slow for a data set with this many observations.
print(
  fviz_pca_biplot(
    res.pca,
    label = "var",
    repel = TRUE,
    title = "PCA Biplot: Wines"
  )
)

## 5. Regression Modeling

In [None]:
# Recover the numeric score from the ordered factor. The conversion goes via
# the level labels because as.numeric() applied directly to a factor returns
# the internal level codes, not the scores.
wine$num_quality <- as.numeric(levels(wine$quality))[wine$quality]

# Linear model of the numeric quality score on three chemical measurements
# plus the wine type.
mod <- lm(num_quality ~ alcohol + sulphates + pH + type, data = wine)

cat("\n=== Regression Summary ===\n")
mod_summary <- summary(mod)
print(mod_summary)

# Diagnostic plot 1: residuals against fitted values.
plot(mod, which = 1)

## 6. Fitted vs Actual

In [None]:
# Store the model's in-sample predictions next to the observed scores.
wine[["fitted"]] <- fitted(mod)

# Observed scores are integers, so a small vertical jitter separates the
# overlapping points; the dashed 45-degree line marks perfect prediction.
p4 <- ggplot(
  data = wine,
  mapping = aes(x = fitted, y = num_quality, colour = type)
) +
  geom_jitter(height = 0.1, alpha = 0.4) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
  theme_minimal() +
  labs(
    title = "Fitted vs. Actual Quality",
    x = "Fitted quality",
    y = "Actual quality"
  )
print(p4)