# 08 — Intro to Machine Learning (Scikit-learn)

## Learning goals
- Understand train/test split
- Train a basic regression model
- Evaluate with MAE / R²

In [3]:
pip install scikit-learn


Collecting scikit-learn
  Downloading scikit_learn-1.8.0-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.10.0 (from scikit-learn)
  Downloading scipy-1.17.0-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.3.0 (from scikit-learn)
  Downloading joblib-1.5.3-py3-none-any.whl.metadata (5.5 kB)
Collecting threadpoolctl>=3.2.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.8.0-cp313-cp313-win_amd64.whl (8.0 MB)
   ---------------------------------------- 0.0/8.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/8.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/8.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/8.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/8.0 MB ? eta -:--:--
   - -------------------------------------- 0.3/8.0 MB ? eta -:--:--
   --- ------------------------------------ 0.8/8.0 MB 1.6 MB/s eta 0:00:05
  


[notice] A new release of pip is available: 25.3 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

# Synthetic study-hours dataset: ten rows pairing hours studied with a score
df = pd.DataFrame(
    {
        "hours": list(range(1, 11)),
        "score": [50, 55, 60, 63, 68, 72, 78, 84, 88, 93],
    }
)

# Features stay 2-D (a one-column DataFrame); the target is a Series
X, y = df[["hours"]], df["score"]

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit the linear model on the training rows only
model = LinearRegression().fit(X_train, y_train)

# Evaluate on the held-out rows
pred = model.predict(X_test)
print("MAE:", mean_absolute_error(y_test, pred))
print("R²:", r2_score(y_test, pred))

MAE: 0.7508250825082546
R²: 0.9945611117397111


In [5]:
# Make a prediction
# The model was fitted on a DataFrame with an "hours" column, so the query is
# wrapped in a one-row DataFrame with the same column name. A bare nested list
# carries no feature names and makes scikit-learn warn about the mismatch.
hours = pd.DataFrame({"hours": [7.5]})
predicted_score = model.predict(hours)[0]
print(f"Predicted score for 7.5 study hours: {predicted_score:.2f}")

Predicted score for 7.5 study hours: 80.81




In [6]:
# Practice task:
# Try a different test_size and compare MAE/R².

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

# Ensure X, y exist; fall back to the synthetic dataset if not, so this cell
# can also run on its own in a fresh kernel.
try:
    X
    y
except NameError:
    import pandas as pd
    df = pd.DataFrame({
        "hours": [1,2,3,4,5,6,7,8,9,10],
        "score": [50,55,60,63,68,72,78,84,88,93],
    })
    X = df[["hours"]]
    y = df["score"]

# Sweep several hold-out fractions and tabulate MAE / R² for each one.
test_sizes = [0.1, 0.2, 0.3, 0.4, 0.5]
print("test_size\tMAE\tR^2")
for ts in test_sizes:
    # Loop-local names keep this experiment from overwriting the `model`,
    # `pred` and train/test splits created by the earlier cells, so the
    # prediction cell still gives the same answer if it is re-run afterwards.
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=ts, random_state=42)
    trial_model = LinearRegression().fit(X_tr, y_tr)
    trial_pred = trial_model.predict(X_te)
    mae = mean_absolute_error(y_te, trial_pred)
    # R² compares residuals against the variance of the test targets, so it is
    # undefined with fewer than two test samples (test_size=0.1 on 10 rows
    # leaves exactly one). Report NaN explicitly instead of relying on
    # r2_score's warning-and-NaN behaviour.
    r2 = r2_score(y_te, trial_pred) if len(y_te) >= 2 else float("nan")
    print(f"{ts:.1f}\t\t{mae:.3f}\t{r2:.3f}")

test_size	MAE	R^2
0.1		0.274	nan
0.2		0.500	0.999
0.3		0.751	0.995
0.4		0.862	0.996
0.5		0.836	0.996


