https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html#extracting-features-from-text-files
<br>
<br>
https://scikit-learn.org/stable/auto_examples/text/plot_document_classification_20newsgroups.html#sphx-glr-auto-examples-text-plot-document-classification-20newsgroups-py
<br>
<br>
https://scikit-learn.org/stable/modules/cross_validation.html

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.neighbors import NearestCentroid

# Fetch the iris dataset as a (features, labels) pair and unpack it into the
# notebook-level names X and y used by the following cells.
iris_arrays = load_iris(return_X_y=True)
X, y = iris_arrays

# Quick sanity check: show the shape of both arrays.
print(X.shape, " ", y.shape)

(150, 4)   (150,)


# Holdout

In [2]:
# Hold-out split: 30% of the samples are reserved for testing; the fixed
# random_state makes the shuffle reproducible between runs.
holdout_split = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = holdout_split

# Report the size of the training and test partitions.
print(X_train.shape, " ", y_train.shape)
print(X_test.shape, " ", y_test.shape)

(105, 4)   (105,)
(45, 4)   (45,)


In [3]:
# Build the nearest-centroid classifier and train it on the training partition.
classificador_holdout = NearestCentroid()
classificador_holdout.fit(X_train, y_train)

# Last expression of the cell: the score on the held-out partition is shown
# as the cell output.
classificador_holdout.score(X_test, y_test)

0.9555555555555556

# k-Fold

In [4]:
from sklearn.model_selection import cross_val_score, cross_validate

# cross_validate clones the estimator and refits the clone on every training
# fold, so the estimator is passed unfitted: fitting it on the full (X, y)
# beforehand would have no effect on the scores and only suggests otherwise.
classificador_crossvalidation = NearestCentroid()

# 10-fold cross-validation over the whole dataset, scoring both micro- and
# macro-averaged F1 on each held-out fold. The result is a dict holding one
# array of per-fold scores under 'test_<metric>' for each requested metric.
scores = cross_validate(classificador_crossvalidation, X, y, cv=10, scoring=['f1_micro', 'f1_macro'])

# Average of the per-fold test scores for each metric.
print(np.mean(scores['test_f1_micro']))
print(np.mean(scores['test_f1_macro']))

0.9333333333333333
0.9322390572390573


In [5]:
# Per-fold test scores gathered by cross_validate for each F1 variant.
micro_por_fold = scores['test_f1_micro']
macro_por_fold = scores['test_f1_macro']

# Summarise each metric as "mean (+/- two standard deviations)" across folds.
print("f1_micro: %0.2f (+/- %0.2f)" % (micro_por_fold.mean(), 2 * micro_por_fold.std()))
print("f1_macro: %0.2f (+/- %0.2f)" % (macro_por_fold.mean(), 2 * macro_por_fold.std()))

f1_micro: 0.93 (+/- 0.12)
f1_macro: 0.93 (+/- 0.12)


## Predizendo com k-Fold Cross-Validation

In [6]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

# cross_val_predict clones the estimator and refits the clone on the training
# part of every fold, so a model fitted beforehand on (X_train, y_train) would
# never be used for these predictions. The estimator is therefore passed
# unfitted.
classificador_crossvalidation = NearestCentroid()

# NOTE: the 10-fold cross-validation runs entirely inside the hold-out split
# (X_test, y_test); the training split plays no part here. Each entry of
# `predicted` comes from a model that did not see that sample during fitting.
predicted = cross_val_predict(classificador_crossvalidation, X_test, y_test, cv=10)

# Per-class precision / recall / F1 of the cross-validated predictions.
print(classification_report(y_test, predicted))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        19
          1       0.81      1.00      0.90        13
          2       1.00      0.77      0.87        13

avg / total       0.95      0.93      0.93        45



# Cross-Validation on 20 Newsgroups

In [7]:
from sklearn.model_selection import cross_val_predict
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

%matplotlib inline

categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)
twenty_test  = fetch_20newsgroups(subset='test',  categories=categories, shuffle=True, random_state=42)

tfidf_vect = TfidfVectorizer()
X_train_tfidf = tfidf_vect.fit_transform(twenty_train.data)

rocchio = NearestCentroid().fit(X_train_tfidf, twenty_train.target)

X_test_tfidf = tfidf_vect.fit_transform(twenty_test.data)

# cross_val_predict returns an array of the same size as `y` where each entry
# is a prediction obtained by cross validation:
predicted = cross_val_predict(rocchio, X_test_tfidf, twenty_test.target, cv=10)

y = twenty_test.target

print(classification_report(y, predicted,target_names=twenty_test.target_names))

# fig, ax = plt.subplots()
# ax.scatter(y, predicted, edgecolors=(0, 0, 0))
# ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
# ax.set_xlabel('Measured')
# ax.set_ylabel('Predicted')
# plt.show()

                        precision    recall  f1-score   support

           alt.atheism       0.91      0.74      0.82       319
         comp.graphics       0.75      0.94      0.84       389
               sci.med       0.83      0.80      0.81       396
soc.religion.christian       0.83      0.79      0.81       398

           avg / total       0.83      0.82      0.82      1502

