<a href="https://colab.research.google.com/github/kankkw/229352-StatisticalLearning/blob/main/Lab03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Statistical Learning for Data Science 2 (229352)
#### Instructor: Donlapark Ponnoprat

#### [Course website](https://donlapark.pages.dev/229352/)

## Lab #4

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

from scipy.stats import uniform

In [None]:
# Fetch the predefined train and test splits of the 20 Newsgroups corpus.
train = fetch_20newsgroups(subset='train')
test = fetch_20newsgroups(subset='test')

# Work with a leading slice of each split only: the first 3000 training
# documents and the first 500 test documents (presumably to keep the
# cross-validated searches below fast -- adjust the bounds to use more data).
Xtrain, ytrain = train.data[:3000], train.target[:3000]
Xtest, ytest = test.data[:500], test.target[:500]

# Sanity check: documents and labels of the test subset must have equal length.
print("X:", len(Xtest))
print("y:", len(ytest))

X: 500
y: 500


### Naive Bayes [(Documentation)](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, f1_score

# Baseline model: TF-IDF features (English stop words removed) fed into a
# multinomial Naive Bayes classifier with its default hyperparameters.
# The step names ("tfidf", "nb") are what the hyperparameter searches use
# to address parameters, e.g. "nb__alpha".
nb_pipeline = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer(stop_words="english")),
        ("nb", MultinomialNB()),
    ]
)

# Fit on the training subset, then predict labels for the test subset.
# Pipeline.fit returns the fitted pipeline, so the calls can be chained.
y_pred = nb_pipeline.fit(Xtrain, ytrain).predict(Xtest)

# Report the macro-averaged F1 score followed by the per-class breakdown.
print("Baseline Naive Bayes")
print("f1_macro:", f1_score(y_true=ytest, y_pred=y_pred, average="macro"))
print(classification_report(ytest, y_pred))

### Random Search Cross-Validation [(Documentation)](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)

### Uniform distribution in `Scipy` [(Documentation)](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.uniform.html)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# Sampling distribution for the `alpha` parameter of the pipeline step
# named "nb". scipy's uniform takes (loc, scale), so candidates are drawn
# from the interval [0.0001, 0.0001 + 10].
param_dist = {"nb__alpha": uniform(loc=0.0001, scale=10)}

# Randomized search: 10 sampled candidates, each scored by macro-averaged F1
# under 5-fold cross-validation. The fixed random_state makes the sampled
# candidates reproducible. The search is only configured here, not fitted.
random_search = RandomizedSearchCV(
    estimator=nb_pipeline,
    param_distributions=param_dist,
    scoring="f1_macro",
    cv=5,
    n_iter=10,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

#### Exercise

1. For the Naive Bayes model, use grid search with 5-fold cross-validation across different values of `alpha` to find the best model.

2. For the best value of `alpha`, compute the `f1_macro` score on the test set.
* What value of `alpha` did you obtain?
* What is the model's `f1_macro` score?

3. Repeat Exercises 1 and 2 using **random search** with 5-fold cross-validation across different values of `alpha`. Compute the `f1_macro` score on the test set.
* What value of `alpha` did you obtain?
* Did you get a better `f1_macro` score compared to grid search in Exercise 2?

In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

def _fit_and_report(search, heading):
    """Fit a hyperparameter search and report its CV and test performance.

    Fits `search` on (Xtrain, ytrain), prints `heading`, the selected
    `nb__alpha` and the best cross-validated score, then scores the refitted
    best estimator on (Xtest, ytest) with macro-averaged F1 and prints it.

    Returns:
        (best_alpha, best_cv_score, test_predictions, test_f1_macro)
    """
    search.fit(Xtrain, ytrain)

    alpha = search.best_params_["nb__alpha"]
    cv_score = search.best_score_

    print(heading)
    print("Best alpha:", alpha)
    print("Best CV f1_macro:", cv_score)

    predictions = search.best_estimator_.predict(Xtest)
    test_score = f1_score(ytest, predictions, average="macro")
    print("Test f1_macro:", test_score)

    return alpha, cv_score, predictions, test_score


# Candidate alphas: 9 log-spaced values from 1e-3 to 1e1 (half-decade steps).
alpha_grid = np.logspace(start=-3, stop=1, num=9)

# Exhaustive search over the grid, scored by macro F1 with 5-fold CV.
grid_search = GridSearchCV(
    estimator=nb_pipeline,
    param_grid={"nb__alpha": alpha_grid},
    scoring="f1_macro",
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Exercises 1-2: grid search.
best_alpha_grid, best_cv_f1_grid, y_pred_grid, test_f1_grid = _fit_and_report(
    grid_search, "Grid Search"
)

# Exercise 3: random search (configured in an earlier cell).
best_alpha_rand, best_cv_f1_rand, y_pred_rand, test_f1_rand = _fit_and_report(
    random_search, "\nRandom Search"
)

# Compare the two strategies on the held-out test subset.
print("\nDid Random Search outperform Grid Search?")
print(test_f1_rand > test_f1_grid)