# 08 — Intro to Machine Learning (Scikit-learn)

## Learning goals
- Understand train/test split
- Train a basic regression model
- Evaluate with MAE / R²

In [None]:
%pip install pandas scikit-learn numpy

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np

# Toy dataset: ten observations pairing hours studied with the resulting score
study_hours = list(range(1, 11))
exam_scores = [50, 55, 60, 63, 68, 72, 78, 84, 88, 93]
df = pd.DataFrame({"hours": study_hours, "score": exam_scores})

# Feature matrix and target vector.
# The double brackets keep X as a one-column DataFrame (2D), which is the
# shape scikit-learn estimators expect; y stays a 1D Series.
X, y = df[["hours"]], df["score"]

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the linear model on the training rows (fit() returns the estimator itself)
model = LinearRegression().fit(X_train, y_train)

# Predict scores for the held-out rows
pred = model.predict(X_test)

# Report hold-out metrics; RMSE is computed by hand from the squared residuals
squared_errors = (y_test - pred) ** 2
metrics = [
    ("MAE:", mean_absolute_error(y_test, pred)),
    ("R²:", r2_score(y_test, pred)),
    ("RMSE:", np.sqrt(squared_errors.mean())),
]
for label, value in metrics:
    print(label, value)

# Fitted line: score = slope * hours + intercept
slope = model.coef_[0]
print("Slope (Coefficient):", slope)
print("Intercept:", model.intercept_)


MAE: 0.7508250825082546
R²: 0.9945611117397111
RMSE: 0.9937104103818908
Slope (Coefficient): 4.806930693069308
Intercept: 44.762376237623755


In [2]:
# Make a prediction for a single new observation.
# The model was fitted on a DataFrame with a named "hours" column, so the input
# is wrapped in a one-row DataFrame with the same column name. Passing the bare
# nested list makes scikit-learn emit an "X does not have valid feature names"
# warning on predict().
hours = [[7.5]]
hours_frame = pd.DataFrame(hours, columns=["hours"])
predicted_score = model.predict(hours_frame)[0]
print(f"Predicted score for 7.5 study hours: {predicted_score:.2f}")

Predicted score for 7.5 study hours: 80.81




In [3]:
# Practice task:
# Try a different test_size and compare MAE/R².
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np
import pandas as pd

# Sweep several test-set fractions and record the hold-out MAE and R² for each.
# Loop-local split names are used so the X_train/X_test/y_train/y_test produced
# by the main training cell are not silently overwritten.
results = []
for ts in [0.1, 0.2, 0.3, 0.4, 0.5]:
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=ts, random_state=42)
    m = LinearRegression().fit(X_tr, y_tr)
    p = m.predict(X_te)
    # R² is undefined for fewer than two test samples (test_size=0.1 on ten rows
    # leaves a single test row). Record NaN explicitly instead of letting
    # r2_score emit an undefined-metric warning.
    r2 = r2_score(y_te, p) if len(y_te) >= 2 else float("nan")
    results.append({"test_size": ts, "MAE": mean_absolute_error(y_te, p), "R2": r2})

print(pd.DataFrame(results).to_string(index=False))

 test_size      MAE       R2
       0.1 0.274194      NaN
       0.2 0.500000 0.998802
       0.3 0.750825 0.994561
       0.4 0.862440 0.995594
       0.5 0.836364 0.996302


