In [None]:
# =========================
# Step 1: Import Libraries
# =========================
import pandas as pd
import matplotlib.pyplot as plt

# Optional: show plots inside notebook
%matplotlib inline

# =========================
# Step 2: Load Dataset
# =========================
# Read the Iris CSV; the path is relative to the working directory, so edit
# it here if the data lives somewhere else.
df = pd.read_csv('../data/iris.csv')

# Preview the leading rows. display() is provided by the IPython/Jupyter
# session rather than imported in this file.
print("First 5 rows of dataset:")
display(df.head(5))

# info() prints its own report, so it is not wrapped in print().
print("\nDataset info:")
df.info()

# Count the missing cells in every column.
print("\nMissing values in each column:")
print(df.isna().sum())

# =========================
# Step 3: Clean the Data
# =========================
# Keep only complete rows: a row is removed as soon as any of its cells is
# missing (these are dropna's defaults, spelled out for readability).
df = df.dropna(axis=0, how='any')

# Imputation is an alternative to dropping rows. For a numeric column, fill
# with the column mean:
# df['sepal_length'] = df['sepal_length'].fillna(df['sepal_length'].mean())
# For a categorical column, fill with the most frequent value:
# df['species'] = df['species'].fillna(df['species'].mode()[0])

# Re-run the missing-value count to confirm the cleaning step worked.
print("\nAfter cleaning, missing values per column:")
print(df.isna().sum())

# =========================
# Step 4: Basic Data Analysis
# =========================
# describe() summarises the numeric columns (count, mean, std, quartiles, ...).
print("\nSummary statistics of numerical columns:")
display(df.describe())

# Group by species and compute the mean of the numerical columns.
# numeric_only=True restricts the aggregation to numeric columns, so an extra
# text column in the CSV cannot make mean() raise a TypeError (pandas >= 2.0
# no longer drops such columns silently).
grouped = df.groupby('species').mean(numeric_only=True)
print("\nMean values per species:")
display(grouped)

# Observations (can also write as text in notebook).
# NOTE: these lines are fixed text, not derived from `df`; re-check them
# against the tables above if the dataset changes.
print("\nObservations:")
print("- Virginica has the longest petals on average.")
print("- Setosa has the shortest petals and smallest sepals.")
print("- Sepal length and petal length appear positively correlated.")

# =========================
# Step 5: Data Visualization
# =========================
# Every chart is drawn on its own explicitly created figure/axes. Relying on
# pyplot's implicit "current figure" only works when plt.show() also closes
# the figure; on backends where it does not, later charts would be drawn on
# top of earlier ones.

# 1️⃣ Line Chart: Sepal Length trend over index
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(df.index, df['sepal_length'], marker='o')
ax.set_title('Sepal Length Trend')
ax.set_xlabel('Index')
ax.set_ylabel('Sepal Length')
plt.show()

# 2️⃣ Bar Chart: Average Petal Length per Species
avg_petal_length = df.groupby('species')['petal_length'].mean()
fig, ax = plt.subplots()
# Pass ax explicitly so pandas draws on this new figure instead of reusing
# whatever axes happen to be current.
avg_petal_length.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Average Petal Length per Species')
ax.set_xlabel('Species')
ax.set_ylabel('Average Petal Length')
plt.show()

# 3️⃣ Histogram: Distribution of Sepal Width
fig, ax = plt.subplots()
ax.hist(df['sepal_width'], bins=10, color='green', edgecolor='black')
ax.set_title('Distribution of Sepal Width')
ax.set_xlabel('Sepal Width')
ax.set_ylabel('Frequency')
plt.show()

# 4️⃣ Scatter Plot: Sepal Length vs Petal Length
fig, ax = plt.subplots()
ax.scatter(df['sepal_length'], df['petal_length'], c='red')
ax.set_title('Sepal Length vs Petal Length')
ax.set_xlabel('Sepal Length')
ax.set_ylabel('Petal Length')
plt.show()

