In [1]:
import pandas as pd
import numpy as np

## Carregando dados

Aqui serão carregados os dados de um dataset do Scikit-Learn sobre notícias que contém 20 classes.

In [2]:
from sklearn import datasets

In [3]:
data = datasets.fetch_20newsgroups(subset='all')

In [4]:
data.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [5]:
data['target_names']

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

## Limpeza dos textos

A limpeza dos textos é necessária para haver uma "normalização" dos textos.

- Coloca o texto em minúsculo;
- Remove caracteres não alfabéticos;
- Remove stopwords (palavras que se repetem muitas vezes e não agregam valor);
- Remove palavras muito curtas;
- Remove acentos.

Para este processo não existe um PADRÃO fixo a ser seguido, a limpeza deve se adaptar à natureza do problema.

In [6]:
X = data['data']
y = data['target']

In [7]:
import re
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from unidecode import unidecode

stemmer = SnowballStemmer('english')
STOPWORDS = stopwords.words('english')

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'<.*>', ' ', text)
    text = re.sub(r'[^a-zA-Z ]', ' ', text)
    text = re.sub(r'\s{2,}', ' ', text)
    text = ' '.join(stemmer.stem(token) for token in text.split() if 
                    token not in STOPWORDS and 
                    len(token) > 2)
    text = unidecode(text)
    return text.strip()
    

In [8]:
from tqdm import tqdm_notebook

X_clean = [None] * len(X)
for i, text in enumerate(tqdm_notebook(X)):
    X_clean[i] = clean_text(text)
    

HBox(children=(IntProgress(value=0, max=18846), HTML(value='')))




## Extração de features

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
vect = TfidfVectorizer(ngram_range=(1, 2), min_df=50, max_df=0.8, max_features=3000)
X_vect = vect.fit_transform(X_clean)

## Split de treino e teste

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.3, random_state=42)

## Classificação

In [13]:
from sklearn.ensemble import RandomForestClassifier

In [14]:
clf = RandomForestClassifier(n_estimators=500, n_jobs=2)

In [15]:
clf.fit(X_train.todense(), y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=2,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [16]:
y_pred = clf.predict(X_test.todense())

## Avaliação

In [17]:
from sklearn.metrics import classification_report, confusion_matrix

In [18]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.75      0.79       236
           1       0.68      0.72      0.70       287
           2       0.74      0.82      0.78       290
           3       0.60      0.67      0.63       285
           4       0.83      0.71      0.77       312
           5       0.87      0.75      0.81       308
           6       0.75      0.79      0.77       276
           7       0.81      0.83      0.82       304
           8       0.92      0.92      0.92       279
           9       0.90      0.92      0.91       308
          10       0.90      0.93      0.91       309
          11       0.91      0.90      0.91       290
          12       0.66      0.65      0.66       304
          13       0.81      0.86      0.83       300
          14       0.85      0.91      0.88       297
          15       0.76      0.95      0.84       292
          16       0.81      0.89      0.85       270
          17       0.95    

In [19]:
print(confusion_matrix(y_test, y_pred))

[[177   1   0   1   1   0   1   0   0   0   2   1   1   1   2  24   1   5
    5  13]
 [  2 207  21  15   6   6   7   2   0   1   0   0  11   2   7   0   0   0
    0   0]
 [  0  11 239  18   3   4   0   2   2   0   0   1   6   2   2   0   0   0
    0   0]
 [  0  11  19 190   8  12  15   4   0   0   2   2  11   3   3   1   2   0
    1   1]
 [  0   5   6  39 223   1   9   2   0   3   1   1  14   3   3   0   0   0
    1   1]
 [  0  27  22   5   1 232   0   3   1   0   0   1   7   3   3   1   0   0
    1   1]
 [  0   4   3  18  11   0 217   5   3   1   2   1   5   1   1   0   1   0
    2   1]
 [  1   5   3   3   2   0   8 252  10   0   0   0  10   5   1   0   2   0
    1   1]
 [  1   1   0   1   0   0   7   6 256   0   1   1   2   0   0   1   2   0
    0   0]
 [  0   2   2   0   0   0   1   0   0 284  16   1   1   1   0   0   0   0
    0   0]
 [  1   0   0   0   1   1   2   2   0   9 286   0   1   4   1   1   0   0
    0   0]
 [  0   2   0   1   1   3   1   3   0   1   0 261   5   1   0   3