# Importing

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 1.<b>Loading dataset<b/>

In [5]:
import pandas as pd

df = pd.read_csv("AirQualityUCI.csv", sep=";", decimal=",", na_values=-200)

# Clean up column names
df.columns = df.columns.str.strip()

print(df.columns.tolist())


['Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,,']


# 2. <b> Cleaning Data <b/>

In [6]:
# Drop completely empty columns
df = df.dropna(axis=1, how="all")

# Fill numeric missing values with mean
num_cols = df.select_dtypes(include="number").columns
df[num_cols] = df[num_cols].fillna(df[num_cols].mean())

# Create a DateTime column
df['DateTime'] = pd.to_datetime(df['Date'] + ' ' + df['Time'], errors='coerce')

print(df.info())


KeyError: 'Date'

In [None]:
print(df.columns.tolist())


In [None]:
print(df.columns.tolist())

# 3.<b>Exploratory Data Analysis(EDA)<b/>

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


<b>Distribution<b/>

In [None]:
sns.histplot(df["CO(GT)"], bins=30, kde=True)
plt.title("Distribution of CO Levels")
plt.show()

<b>Correlation<b/>

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df.corr(numeric_only=True), cmap="coolwarm", annot=False)
plt.title("Correlation Heatmap")
plt.show()

# 5.<b>Simple Prediction Model<b/>

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

X = df[["T", "RH", "AH"]] 
y = df["CO(GT)"]            
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Model R² Score:", model.score(X_test, y_test))
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))

## Day2

<b>Update of Visualization of Prediction<b/>

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(8,6))
plt.scatter(y_test, y_pred, alpha=0.5)
plt.xlabel("Actual CO(GT)")
plt.ylabel("Predicted CO(GT)")
plt.title("Actual vs Predicted CO(GT) - Linear Regression")
plt.show()
