# 08 — Intro to Machine Learning (Scikit-learn)

## Learning goals
- Understand train/test split
- Train a basic regression model
- Evaluate with MAE / R²

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

# Toy dataset: hours studied (1 through 10) paired with the exam score obtained.
df = pd.DataFrame(
    {
        "hours": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        "score": [50, 55, 60, 63, 68, 72, 78, 84, 88, 93],
    }
)

# The feature matrix is kept two-dimensional (a one-column DataFrame);
# the target is a plain Series.
X, y = df[["hours"]], df["score"]

# Hold out 30% of the rows for evaluation. The fixed random_state makes the
# split repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit on the training rows only; fit() hands back the estimator, so the
# construction and the fit can be chained.
model = LinearRegression().fit(X_train, y_train)

# Score the fitted model on the held-out rows.
pred = model.predict(X_test)
print("MAE:", mean_absolute_error(y_test, pred))
print("R²:", r2_score(y_test, pred))

MAE: 0.7508250825082546
R²: 0.9945611117397111


In [3]:
# Make a prediction for a student who studies 7.5 hours.
# The model was fitted on a DataFrame whose only column is "hours", so the
# query is passed as a one-row DataFrame with that same column name. Passing a
# bare nested list ([[7.5]]) makes scikit-learn warn that X has no valid
# feature names.
hours_query = pd.DataFrame({"hours": [7.5]})
predicted_score = model.predict(hours_query)
print(f"Predicted score for 7.5 hours of study: {predicted_score[0]:.2f}")

Predicted score for 7.5 hours of study: 80.81




In [4]:
# Practice task:
# Try a different test_size and compare MAE/R².
# Compare multiple test_size values and show MAE / R²


def evaluate_test_size(features, target, test_size, random_state=42):
    """Fit a fresh LinearRegression on one train/test split and score it.

    Everything created here (split pieces, fitted estimator, predictions) is
    local to the function, so the notebook-level ``model``, ``pred`` and
    ``X_train``/``X_test``/``y_train``/``y_test`` from the earlier cell are not
    overwritten and earlier cells give the same output when re-run.

    Parameters
    ----------
    features : feature matrix passed to ``train_test_split`` and ``fit``.
    target : target values aligned with ``features``.
    test_size : value forwarded to ``train_test_split(test_size=...)``.
    random_state : seed forwarded to ``train_test_split``; defaults to 42.

    Returns
    -------
    dict with keys ``'test_size'``, ``'MAE'`` and ``'R2'``.
    """
    feat_train, feat_test, targ_train, targ_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )
    fitted = LinearRegression().fit(feat_train, targ_train)
    held_out_pred = fitted.predict(feat_test)
    return {
        'test_size': test_size,
        'MAE': mean_absolute_error(targ_test, held_out_pred),
        'R2': r2_score(targ_test, held_out_pred),
    }


test_sizes = [0.2, 0.3, 0.4, 0.5]
results = [evaluate_test_size(X, y, ts) for ts in test_sizes]

# One row per test_size; left as the last expression so the notebook renders it.
pd.DataFrame(results)

Unnamed: 0,test_size,MAE,R2
0,0.2,0.5,0.998802
1,0.3,0.750825,0.994561
2,0.4,0.86244,0.995594
3,0.5,0.836364,0.996302
