# Preprocessing articles
* Lemmatization (nltk)
* Stop Words (nltk)
* Stemming (nltk)
http://scikit-learn.org/stable/datasets/twenty_newsgroups.html#newsgroups

In [0]:
# Estos dos comandos evitan que haya que hacer reload cada vez que se modifica un paquete
%load_ext autoreload
%autoreload 2

In [0]:
#Librerias generales
import numpy as np
import time
import os
import pickle

#Paquetes para manejo de datos
import pandas         as pd
import dask.dataframe as dd

#Paquetes de nltk para preprocesamiento
import nltk
from   nltk.tokenize import TreebankWordTokenizer
from   nltk.stem     import PorterStemmer, WordNetLemmatizer
from   nltk.corpus   import stopwords

#Paquetes de sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection         import train_test_split
from sklearn.model_selection         import cross_val_score
from sklearn.naive_bayes             import MultinomialNB

#dataset a utilizar
from sklearn.datasets import fetch_20newsgroups

### Obtain the 20 Newsgroups dataset as a CSV file

In [3]:
def twenty_newsgroup_to_csv(csv_path='20_newsgroup.csv', subset='train'):
    """Download a 20 Newsgroups subset, save it as CSV and return it as a DataFrame.

    Parameters
    ----------
    csv_path : str
        Where the CSV file is written. Defaults to the file name the rest of
        the notebook reads ('20_newsgroup.csv').
    subset : str
        Subset forwarded to ``fetch_20newsgroups`` (defaults to 'train').

    Returns
    -------
    pandas.DataFrame
        Two columns: 'text' (raw article) and 'target' (integer class id).
    """
    newsgroups = fetch_20newsgroups(subset=subset, shuffle=True)

    # Building the frame column-by-column keeps 'target' as an integer column.
    # Stacking both lists and transposing would turn every column into object dtype.
    df = pd.DataFrame({
        'text':   newsgroups.data,
        'target': newsgroups.target,
    })

    # head(-10) selects every row except the last 10.
    print(df.head(-10))

    # The DataFrame index is written as the first (unnamed) CSV column.
    df.to_csv(csv_path)
    return df
    
dataset = twenty_newsgroup_to_csv() # las columnas del df son text y target

                                                    text target
0      From: lerxst@wam.umd.edu (where's my thing)\nS...      7
1      From: guykuo@carson.u.washington.edu (Guy Kuo)...      4
2      From: twillis@ec.ecn.purdue.edu (Thomas E Will...      4
3      From: jgreen@amber (Joe Green)\nSubject: Re: W...      1
4      From: jcm@head-cfa.harvard.edu (Jonathan McDow...     14
...                                                  ...    ...
11299  From: 2120788@hydra.maths.unsw.EDU.AU ()\nSubj...     17
11300  From: aa888@freenet.carleton.ca (Mark Baker)\n...     15
11301  From: zmed16@trc.amoco.com (Michael)\nSubject:...      6
11302  From: rdippold@qualcomm.com (Ron "Asbestos" Di...     11
11303  From: bchuang@css.itd.umich.edu (Ben Chuang)\n...      4

[11304 rows x 2 columns]


In [4]:
# Download the NLTK resources used by this notebook (no-op when already up to date).
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

# Shared NLTK helpers; the preprocessing closure reads these module-level names.
tokenizer  = TreebankWordTokenizer()
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Experiment constants read by later cells:
random_seed = 0    # seed for the NumPy shuffle of the vectorized dataset
test_size   = 0.3  # fraction passed to train_test_split as test_size
cross_sets  = 5    # value passed to cross_val_score as cv

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Caching

In [0]:
# When True, the preprocessing steps read their input from / write their output to
# pickle files on disk instead of using the `dataset` argument they receive.
caching      = True
# CSV file read by run_nltk_preprocessor when caching is enabled.
dataset_path = '20_newsgroup.csv'

def get_nltk_cache_path(hp):
    """Return the cache file name for the NLTK preprocessing stage.

    Only hp['isalpha'] is part of the key, so every other hyperparameter
    shares the same cached result.
    """
    return 'cache-{}'.format(hp['isalpha'])

def get_sklearn_cache_path(hp):
    """Return the cache file name for the sklearn vectorization stage.

    The key is built from 'isalpha', 'tf_idf', 'min_df' and 'max_df', in that
    order; any other entry of `hp` is ignored.
    """
    key_fields = ('isalpha', 'tf_idf', 'min_df', 'max_df')
    return 'cache-{}-{}-{}-{}'.format(*(hp[field] for field in key_fields))

## Hyperparameters

In [0]:
# Search grid: every combination of the values below is evaluated.
hyperparameters_specs = {
    # Whether the NLTK stage keeps only purely alphabetic tokens.
    'isalpha': [True, False],
    # True selects TfidfVectorizer, otherwise CountVectorizer is used.
    'tf_idf':  [True, False],
    # min_df / max_df are forwarded to the vectorizer.
    'min_df':  list(np.linspace(0.01, 0.1, num=6)),  # previous grid: [0.1, 0.05, 0.1, 0.2]
    'max_df':  list(np.linspace(0.7, 0.9, num=6)),  # previous grid: [0.5, 0.6, 0.7, 0.8]
    # Smoothing parameter passed to MultinomialNB.
    'alpha':   [0.01, 0.1, 1.0, 10.0],
}

### Hyperparameters to dataframe

In [7]:
#Every hyperparameter combination is stored as one row of a pandas DataFrame.
#The records are collected first and the frame is built once: growing a frame
#row-by-row copies it on every step, and DataFrame.append no longer exists in
#pandas >= 2.0.
#The nesting order (isalpha > tf_idf > min_df > max_df > alpha) fixes the row order.
hyperparameters = pd.DataFrame(
    [
        {
            'isalpha': isalpha,
            'alpha':   alpha,
            'min_df':  min_df,
            'max_df':  max_df,
            'tf_idf':  tf_idf,
        }
        for isalpha in hyperparameters_specs['isalpha']
        for tf_idf in hyperparameters_specs['tf_idf']
        for min_df in hyperparameters_specs['min_df']
        for max_df in hyperparameters_specs['max_df']
        for alpha in hyperparameters_specs['alpha']
    ],
    # Explicit column list keeps the layout stable even if the grid is empty.
    columns=['isalpha', 'alpha', 'min_df', 'max_df', 'tf_idf'],
)

#Show the result (head(-5) selects every row except the last 5)
print(hyperparameters.head(-5))

     isalpha  alpha  min_df  max_df  tf_idf
0       True   0.01    0.01    0.70    True
1       True   0.10    0.01    0.70    True
2       True   1.00    0.01    0.70    True
3       True  10.00    0.01    0.70    True
4       True   0.01    0.01    0.74    True
..       ...    ...     ...     ...     ...
566    False   1.00    0.10    0.82   False
567    False  10.00    0.10    0.82   False
568    False   0.01    0.10    0.86   False
569    False   0.10    0.10    0.86   False
570    False   1.00    0.10    0.86   False

[571 rows x 5 columns]


### Preprocesamiento: NLTK

In [0]:
#Callback factory for the parallel Dask processing
def nltk_preprocessor_callback(**kwargs):
    """Build the per-document NLTK preprocessing function.

    Keyword arguments
    -----------------
    isalpha : bool, default True
        When truthy, tokens that are not purely alphabetic are dropped after
        stemming.

    Returns
    -------
    callable
        ``preprocessor(datapoint: str) -> str``: tokenizes, lemmatizes (as
        verbs), removes English stop words, stems, optionally filters
        non-alphabetic tokens and joins the remaining tokens with single spaces.
    """
    keep_alpha_only = kwargs.setdefault('isalpha', True)

    # Load the stop-word list ONCE per callback and keep it in a set.
    # Calling stopwords.words('english') inside the token filter would rebuild
    # the list for every token and search it linearly.
    english_stopwords = frozenset(stopwords.words('english'))

    def preprocessor(datapoint):
        tokens  = tokenizer.tokenize(datapoint)
        lemmas  = [lemmatizer.lemmatize(token, pos='v') for token in tokens]
        content = [token for token in lemmas if token not in english_stopwords]
        stems   = [stemmer.stem(token) for token in content]

        #This filter is skipped depending on the isalpha hyperparameter
        if keep_alpha_only:
            stems = [token for token in stems if token.isalpha()]

        return ' '.join(stems)

    return preprocessor

def run_nltk_preprocessor(hp, dataset=None):
    """Run the NLTK preprocessing stage for one hyperparameter combination.

    With the module-level ``caching`` flag set to True, the raw CSV at
    ``dataset_path`` is preprocessed and pickled to ``get_nltk_cache_path(hp)``
    unless that file already exists; ``dataset`` is ignored and None is returned.

    With caching disabled, ``dataset`` is copied, preprocessed and returned
    (there is no cache file to retrieve it from later), and cache files on
    disk are not consulted.

    Parameters
    ----------
    hp : mapping
        Hyperparameters; only hp['isalpha'] is used here.
    dataset : pandas.DataFrame, optional
        Frame with a 'text' column. Required when caching is disabled.

    Returns
    -------
    pandas.DataFrame or None
        The preprocessed copy when caching is disabled, otherwise None.

    Raises
    ------
    ValueError
        If caching is disabled and no dataset is given.
    """
    print('NLTK Preprocessing...')
    to = time.time()
    cache_path = get_nltk_cache_path(hp)
    use_cache  = caching is True
    result     = None

    #Check whether this combination was already preprocessed.
    #os.path.isfile() is False for missing paths, so no separate exists() check is needed.
    if not (use_cache and os.path.isfile(cache_path)):
        #Read the dataset
        if use_cache:
            print('Cache miss: ', cache_path)
            dataset = pd.read_csv(dataset_path)
        elif dataset is None:
            raise ValueError('run_nltk_preprocessor needs a dataset when caching is disabled')
        else:
            # Work on a copy so the caller's frame is left untouched.
            dataset = dataset.copy()

        preprocessor    = nltk_preprocessor_callback(isalpha=hp['isalpha'])
        ddataset        = dd.from_pandas(dataset, npartitions=os.cpu_count())
        dataset['text'] = ddataset['text'].map_partitions(lambda df: df.apply(preprocessor)).compute(scheduler='multiprocessing')

        if use_cache:
            #Store this run in the cache. Write to a temporary file first so an
            #interrupted run cannot leave a truncated pickle that later looks like a hit.
            tmp_path = cache_path + '.tmp'
            with open(tmp_path, 'wb') as fp:
                pickle.dump(dataset, fp)
            os.replace(tmp_path, cache_path)
        else:
            result = dataset

    tf = time.time()
    print('finished in', (int(tf-to)), 'seconds.')
    return result

In [9]:
# Smoke test: run the NLTK stage on the first hyperparameter row only.
# next() takes that row directly; on an empty frame it raises instead of
# silently leaving `hyperParam` undefined or stale from an earlier run.
# NOTE(review): iterrows() is kept on purpose - other row accessors such as
# .iloc[0] may yield NumPy scalar types, which `is True` checks elsewhere in
# the notebook would not match; confirm before changing.
idx, hyperParam = next(hyperparameters.iterrows())
run_nltk_preprocessor(hyperParam)

NLTK Preprocessing...
finished in 0 seconds.


In [0]:
# Estimated total: 186 * 710 s = total seconds = ~2200 min = 36.68 hours
# 710 s for the first run (local machine)
# 489 s with Colab 12 GB,
# 255 s with Colab 25 GB,
#     with Colab 25 GB //

### Preprocesamiento: sklearn

In [0]:
def run_sklearn_preprocessor(hp, dataset=None):
    """Vectorize the preprocessed texts for one hyperparameter combination.

    With the module-level ``caching`` flag set to True, the NLTK-stage pickle
    (``get_nltk_cache_path(hp)``) is loaded, vectorized, shuffled and pickled to
    ``get_sklearn_cache_path(hp)`` unless that file already exists; ``dataset``
    is ignored and None is returned.

    With caching disabled, ``dataset`` is copied, vectorized, shuffled and the
    resulting matrix is returned; cache files on disk are not consulted.

    Parameters
    ----------
    hp : mapping
        Hyperparameters; 'tf_idf', 'min_df' and 'max_df' are used here.
    dataset : pandas.DataFrame, optional
        Frame with 'text' and 'target' columns. Required when caching is disabled.

    Returns
    -------
    numpy.ndarray or None
        Dense matrix whose last column is the target and whose other columns are
        the vectorizer features (rows shuffled with ``random_seed``) when caching
        is disabled, otherwise None.

    Raises
    ------
    ValueError
        If caching is disabled and no dataset is given.
    """
    print('sklearn preprocessing...')
    to = time.time()
    cache_path = get_sklearn_cache_path(hp)
    use_cache  = caching is True
    result     = None

    #Check whether this combination was already computed.
    #os.path.isfile() is False for missing paths, so no separate exists() check is needed.
    if not (use_cache and os.path.isfile(cache_path)):
        if use_cache:
            print('Cache miss: ', cache_path)
            # NOTE: pickle.load executes arbitrary code from the file; only load
            # cache files produced by this notebook.
            with open(get_nltk_cache_path(hp), 'rb') as fp:
                dataset = pickle.load(fp)
        elif dataset is None:
            raise ValueError('run_sklearn_preprocessor needs a dataset when caching is disabled')
        else:
            dataset = dataset.copy()

        #Run the matching vectorizer. Truthiness is used instead of `is True` so a
        #NumPy boolean does not silently fall through to CountVectorizer.
        vectorizer_cls = TfidfVectorizer if bool(hp['tf_idf']) else CountVectorizer
        V = vectorizer_cls(min_df=hp['min_df'], max_df=hp['max_df'])
        X = V.fit_transform(dataset['text']).toarray()
        Y = np.array([dataset['target'].values]).T  # column vector, one label per row
        D = np.hstack((X, Y))                       # features + label kept together so they shuffle in step

        np.random.seed(seed=random_seed)
        np.random.shuffle(D)

        if use_cache:
            #Write to a temporary file first so an interrupted run cannot leave a
            #truncated pickle that later looks like a cache hit.
            tmp_path = cache_path + '.tmp'
            with open(tmp_path, 'wb') as fp:
                pickle.dump(D, fp)
            os.replace(tmp_path, cache_path)
        else:
            result = D

    tf = time.time()
    print('finished in', (int(tf-to)), 'seconds.')
    return result

In [12]:
# Smoke test: run the sklearn stage on the first hyperparameter row only.
# next() takes that row directly; on an empty frame it raises instead of
# silently leaving `hp2` undefined or stale from an earlier run.
# NOTE(review): iterrows() is kept on purpose - other row accessors such as
# .iloc[0] may yield NumPy scalar types, which `is True` checks elsewhere in
# the notebook would not match; confirm before changing.
idx, hp2 = next(hyperparameters.iterrows())
run_sklearn_preprocessor(hp2)

sklearn preprocessing...
Cache miss:  cache-True-True-0.01-0.7
finished in 1 seconds.


In [13]:
# Run both preprocessing stages for every hyperparameter combination.
# The cache keys do not include 'alpha', so rows that differ only in alpha
# reuse the files produced for the first of them.
print('Preprocessing dataset...')
for index, hp in hyperparameters.iterrows():
    print(hp.to_dict())
    run_nltk_preprocessor(hp)
    run_sklearn_preprocessor(hp)

Preprocessing dataset...
{'isalpha': True, 'alpha': 0.01, 'min_df': 0.01, 'max_df': 0.7, 'tf_idf': True}
NLTK Preprocessing...
finished in 0 seconds.
sklearn preprocessing...
finished in 0 seconds.
{'isalpha': True, 'alpha': 0.1, 'min_df': 0.01, 'max_df': 0.7, 'tf_idf': True}
NLTK Preprocessing...
finished in 0 seconds.
sklearn preprocessing...
finished in 0 seconds.
{'isalpha': True, 'alpha': 1.0, 'min_df': 0.01, 'max_df': 0.7, 'tf_idf': True}
NLTK Preprocessing...
finished in 0 seconds.
sklearn preprocessing...
finished in 0 seconds.
{'isalpha': True, 'alpha': 10.0, 'min_df': 0.01, 'max_df': 0.7, 'tf_idf': True}
NLTK Preprocessing...
finished in 0 seconds.
sklearn preprocessing...
finished in 0 seconds.
{'isalpha': True, 'alpha': 0.01, 'min_df': 0.01, 'max_df': 0.74, 'tf_idf': True}
NLTK Preprocessing...
finished in 0 seconds.
sklearn preprocessing...
Cache miss:  cache-True-True-0.01-0.74
finished in 1 seconds.
{'isalpha': True, 'alpha': 0.1, 'min_df': 0.01, 'max_df': 0.74, 'tf_idf'

### Scores

In [0]:
#Callback factory: builds the function that scores one hyperparameter row
def score_callback(dataset=None):
    """Return ``score_classifier(hp)``, which cross-validates one hyperparameter row.

    ``dataset`` is only used when the module-level ``caching`` flag is not True;
    otherwise the vectorized matrix is loaded from the sklearn cache file.
    The returned function stores the mean cross-validation score in
    ``hp['score']`` and returns ``hp``.
    """
    def score_classifier(hp):
        print(hp.to_dict())

        if caching is not True:
            D = dataset.copy()
        else:
            with open(get_sklearn_cache_path(hp), 'rb') as fp:
                D = pickle.load(fp)

        # The last column holds the labels, every other column is a feature.
        X, Y = D[:, :-1], D[:, -1:].flatten()

        # Only the training part is cross-validated; the held-out part is ignored here.
        X_train, _, Y_train, _ = train_test_split(X, Y, test_size=test_size, shuffle=False)

        #Classifier for this combination
        clf = MultinomialNB(alpha=hp['alpha'], class_prior=None, fit_prior=False)

        #Mean score across the cross-validation folds
        hp['score'] = cross_val_score(clf, X_train, Y_train, cv=cross_sets).mean()
        return hp

    return score_classifier

In [15]:
# Score every hyperparameter combination and time the whole sweep.
print('Evaluating hyperparameters...')
to = time.time()
    
score_classifier = score_callback(dataset)
# NOTE(review): the Dask frame below is built but never used - the scores are
# computed by the plain pandas apply on the next line, i.e. row by row in this
# process. Confirm whether a Dask apply was intended.
dhyperparameters = dd.from_pandas(hyperparameters.copy(), npartitions=os.cpu_count())
# Each row comes back with an extra 'score' entry added by score_classifier.
scores           = hyperparameters.apply(score_classifier, axis=1)

tf = time.time()
print('finished in', (int(tf-to)), 'seconds.')

Evaluating hyperparameters...
{'isalpha': True, 'alpha': 0.01, 'min_df': 0.01, 'max_df': 0.7, 'tf_idf': True}
{'isalpha': True, 'alpha': 0.01, 'min_df': 0.01, 'max_df': 0.7, 'tf_idf': True}
{'isalpha': True, 'alpha': 0.1, 'min_df': 0.01, 'max_df': 0.7, 'tf_idf': True}
{'isalpha': True, 'alpha': 1.0, 'min_df': 0.01, 'max_df': 0.7, 'tf_idf': True}
{'isalpha': True, 'alpha': 10.0, 'min_df': 0.01, 'max_df': 0.7, 'tf_idf': True}
{'isalpha': True, 'alpha': 0.01, 'min_df': 0.01, 'max_df': 0.74, 'tf_idf': True}
{'isalpha': True, 'alpha': 0.1, 'min_df': 0.01, 'max_df': 0.74, 'tf_idf': True}
{'isalpha': True, 'alpha': 1.0, 'min_df': 0.01, 'max_df': 0.74, 'tf_idf': True}
{'isalpha': True, 'alpha': 10.0, 'min_df': 0.01, 'max_df': 0.74, 'tf_idf': True}
{'isalpha': True, 'alpha': 0.01, 'min_df': 0.01, 'max_df': 0.78, 'tf_idf': True}
{'isalpha': True, 'alpha': 0.1, 'min_df': 0.01, 'max_df': 0.78, 'tf_idf': True}
{'isalpha': True, 'alpha': 1.0, 'min_df': 0.01, 'max_df': 0.78, 'tf_idf': True}
{'isalpha

### Training phase

In [16]:
print('Training model with best hyperparameters...')

#Keep the best-scoring combination; 'score' is dropped so only hyperparameters remain.
best_hp = scores.loc[scores['score'].idxmax()]
best_hp = best_hp.drop(['score'])
print(best_hp.to_dict())

#Fetch the vectorized matrix for that combination.
if caching is not True:
    D = dataset.copy()
else:
    cache_path = get_sklearn_cache_path(best_hp)
    with open(cache_path, 'rb') as fp:
        D = pickle.load(fp)

#Last column = labels, remaining columns = features.
X, Y = D[:, :-1], D[:, -1:].flatten()

#Split into train and held-out parts (no shuffling: the rows were shuffled when D was built).
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    test_size=test_size,
    shuffle=False,
)

#Classifier configured with the best hyperparameters
clf = MultinomialNB(
    alpha=best_hp['alpha'],
    class_prior=None,
    fit_prior=False,
)

#Train the model (last expression, so the fitted estimator is displayed)
clf.fit(X_train, Y_train)

Training model with best hyperparameters...
{'isalpha': False, 'alpha': 0.1, 'min_df': 0.01, 'max_df': 0.7, 'tf_idf': True}


MultinomialNB(alpha=0.1, class_prior=None, fit_prior=False)

In [17]:
print('Evaluating best model...')

#Reload the vectorized matrix that belongs to the best hyperparameters.
if caching is not True:
    D = dataset.copy()
else:
    cache_path = get_sklearn_cache_path(best_hp)
    with open(cache_path, 'rb') as fp:
        D = pickle.load(fp)

#Last column = labels, remaining columns = features.
X, Y = D[:, :-1], D[:, -1:].flatten()

#Same deterministic split as in training, so X_test / Y_test were not used for fitting.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    test_size=test_size,
    shuffle=False,
)

#Final score of the model on the held-out part
score = clf.score(X_test, Y_test)
print("accuracy: {:.4}%".format(score*100))

Evaluating best model...
accuracy: 82.21%
