# Simple ML Pipeline for Docker Learning

This notebook demonstrates a basic machine learning workflow including:
1. Dataset downloading
2. Data transformation with Pandas
3. Model fitting and evaluation with Scikit-Learn

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# 1. Download Dataset
# The classic Iris CSV, hosted on GitHub, stands in for a real
# "fetch the data over the network" step.
url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv"
print(f"Downloading data from {url}...")
df = pd.read_csv(filepath_or_buffer=url)

# Report the (rows, columns) tuple, then leave a five-row preview as the
# cell's final expression so the notebook renders it.
print("Dataset shape:", df.shape)
df.head(5)

In [None]:
# 2. Pandas Transformations

# Count missing values per column.
# sep="" keeps print() from inserting its default space right after the
# "\n", which would otherwise shift the first row of the table by one column.
print("Missing values:\n", df.isnull().sum(), sep="")

# Feature Engineering: Create a new feature 'petal_area' as the
# element-wise product of petal length and petal width.
df['petal_area'] = df['petal_length'] * df['petal_width']

# Simple stats (computed after adding 'petal_area', so the new column is
# part of the summary).
print("\nBasic Statistics:")
print(df.describe())

In [None]:
# 3. Simple Model Fitting

# Prepare features and target: the four raw measurements plus the
# 'petal_area' column, predicting the 'species' label.
X = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'petal_area']]
y = df['species']

# Split data: hold out 20% for evaluation; the fixed seed makes the split
# reproducible between runs.
# NOTE(review): the split is not stratified, so class proportions in the test
# set may differ from the full dataset - consider stratify=y if that matters.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train model (small forest of 10 trees, seeded for
# reproducibility).
model = RandomForestClassifier(n_estimators=10, random_state=42)
model.fit(X_train, y_train)

# Evaluate on the held-out rows only.
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

print(f"Model Accuracy: {accuracy:.2f}")
# sep="" keeps print() from inserting its default space right after the
# "\n", which would otherwise push the report's header row one column out
# of alignment with the rows beneath it.
print("\nClassification Report:\n", classification_report(y_test, predictions), sep="")