# TAL - Classification de dépêches d’agence avec NLTK

In [1]:
# importing modules
import nltk
import string
import collections

from nltk.corpus import reuters
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from nltk.metrics.scores import (precision, recall, f_measure)

from random import shuffle

In [2]:
# Extract fileids from the reuters corpus
fileids = reuters.fileids()
documents = []
# Loop through each file id and collect each files categories and tokenized words
for file in fileids:
    words = reuters.words(file)
    documents.append((words, reuters.categories(file)))

shuffle(documents)
documents[0]

(['NORTHERN', 'TELECOM', 'PROPOSES', 'TWO', '-', 'FOR', ...], ['earn'])

Nous avons remarqué que dans la tokenisation des mots du corpus, le mots `U.S` est séparé en trois tokens distinct, `U`, `.` et `S`. Nous avon estimé que dans le cadre de ce labo, cela ne devrait pas causer trop de problèmes et nous avons donc laissé cette séparation.

## Classifieur binaire

Pour la classification des documents, nous avons décidé d'utiliser la fréquence des mots. Nous avons donc commencé par déterminer la fréquence de **TOUT** les mots du dataset (i.e. tout les documents), puis les `2000` mots les plus fréquents sont retourné.

> Note: La limite de la fréquence des mots que la fonction retourne est paramètrable.

In [3]:
def document_features(document, word_frequence):
    document_words = set(document)
    features = {}
    for word in word_frequence:
        features['contains({})'.format(word)] = (word in document_words)
    return features

def most_freq_words(documents, limit=2000):
    all_words = nltk.FreqDist(w
        for document in documents
        for w in document[0]
    )
    return list(all_words)[:limit]

In [4]:
def create_dataset(documents, tag, feature_extractor, **kwargs):
    if 'to_lower' in kwargs and kwargs['to_lower']:
        documents = list(map(lambda d: (list(map(str.lower, d[0])), d[1]), documents))

    if 'lemmatizer' in kwargs:
        lemmatizer = kwargs['lemmatizer']
        documents = list(map(lambda d: (list(map(lemmatizer.lemmatize, d[0])), d[1]), documents))
    
    if 'stopwords' in kwargs:
        stopwords = set(kwargs['stopwords'])
        documents = list(map(
            lambda d: (
                list(filter(lambda w: not w.lower() in stopwords and w[0].isalnum(), d[0])), 
                d[1]
            ), documents))
        
    analyzer_res = []
    if 'analyzer' in kwargs:
        analyzer_res = kwargs['analyzer'](documents)

    dataset = []
    for document in documents:
        dataset.append((feature_extractor(document[0], analyzer_res), tag in document[1]))
    
    shuffle(dataset)
    return dataset

def split_dataset(dataset):
    split_ratio = 0.6
    split_ratio2 = 0.8
    
    split = int(len(dataset) * split_ratio)
    split2 = int(len(dataset) * split_ratio2)

    return (dataset[:split], dataset[split:split2], dataset[split2:])

In [5]:
def best_classifier(documents, tag, dataset_creator, hyperparams):
    print('Finding best classifier for {}'.format(tag))
    print('----------')

    best = (None, 0.0)
    for hyperparam in hyperparams:
        dataset = dataset_creator(documents, tag, **hyperparam)
        train_set, test_set, dev_set = split_dataset(dataset)
        classifier = nltk.NaiveBayesClassifier.train(train_set)
        acc = nltk.classify.accuracy(classifier, dev_set)
        
        if acc > best[1]:
            best = (classifier, acc)
        
        print('Accuracy using "{}": {:.2f}%'.format(hyperparam['title'], acc*100))
    return (best[0], test_set)

### Combinaison des différents hyperparamètres

TODO explain the hyperparam we're going to use

In [6]:
hyperparams = [
    {
        'title': 'To lower: no, Lemmatize: no, No stopwords: no',
        'feature_extractor': document_features,
        'analyzer': most_freq_words,
    },
    {
        'title': 'To lower: yes, Lemmatize: no, No stopwords: no',
        'feature_extractor': document_features,
        'analyzer': most_freq_words,
        'to_lower': True,
    },
    {
        'title': 'To lower: no, Lemmatize: yes, No stopwords: no',
        'feature_extractor': document_features,
        'analyzer': most_freq_words,
        'lemmatizer': WordNetLemmatizer(),
    },
    {
        'title': 'To lower: no, Lemmatize: no, No stopwords: yes',
        'feature_extractor': document_features,
        'analyzer': most_freq_words,
        'stopwords': stopwords.words('english'),
    },
    {
        'title': 'To lower: yes, Lemmatize: no, No stopwords: yes',
        'feature_extractor': document_features,
        'analyzer': most_freq_words,
        'to_lower': True,
        'stopwords': stopwords.words('english'),
    },
    {
        'title': 'To lower: no, Lemmatize: yes, No stopwords: yes',
        'feature_extractor': document_features,
        'analyzer': most_freq_words,
        'lemmatizer': WordNetLemmatizer(),
        'stopwords': stopwords.words('english'),
    },
    {
        'title': 'To lower: yes, Lemmatize: yes, No stopwords: no',
        'feature_extractor': document_features,
        'analyzer': most_freq_words,
        'to_lower': True,
        'lemmatizer': WordNetLemmatizer(),
    },
    {
        'title': 'To lower: yes, Lemmatize: yes, No stopwords: yes',
        'feature_extractor': document_features,
        'analyzer': most_freq_words,
        'to_lower': True,
        'stopwords': stopwords.words('english'),
        'lemmatizer': WordNetLemmatizer(),
    },
]

### Classification des documents `money-fx`
TODO Comment results

In [8]:
classifier_moneyfx, moneyfx_testset = best_classifier(documents, 'money-fx', create_dataset, hyperparams)

Finding best classifier for money-fx
----------
Accuracy using "To lower: no, Lemmatize: no, No stopwords: no": 87.72%
Accuracy using "To lower: yes, Lemmatize: no, No stopwords: no": 88.74%
Accuracy using "To lower: no, Lemmatize: yes, No stopwords: no": 87.86%
Accuracy using "To lower: no, Lemmatize: no, No stopwords: yes": 90.73%
Accuracy using "To lower: yes, Lemmatize: no, No stopwords: yes": 90.55%
Accuracy using "To lower: no, Lemmatize: yes, No stopwords: yes": 90.64%
Accuracy using "To lower: yes, Lemmatize: yes, No stopwords: no": 87.53%
Accuracy using "To lower: yes, Lemmatize: yes, No stopwords: yes": 91.71%


### Classification des documents `wheat`
TODO Comment results

In [9]:
classifier_wheat, wheat_testset = best_classifier(documents, 'wheat', create_dataset, hyperparams)

Finding best classifier for wheat
----------
Accuracy using "To lower: no, Lemmatize: no, No stopwords: no": 89.99%
Accuracy using "To lower: yes, Lemmatize: no, No stopwords: no": 89.76%
Accuracy using "To lower: no, Lemmatize: yes, No stopwords: no": 89.99%
Accuracy using "To lower: no, Lemmatize: no, No stopwords: yes": 93.51%
Accuracy using "To lower: yes, Lemmatize: no, No stopwords: yes": 94.86%
Accuracy using "To lower: no, Lemmatize: yes, No stopwords: yes": 93.23%
Accuracy using "To lower: yes, Lemmatize: yes, No stopwords: no": 90.59%
Accuracy using "To lower: yes, Lemmatize: yes, No stopwords: yes": 94.35%


### Classification des documents `gold`
TODO Comment results

In [10]:
classifier_gold, gold_testset = best_classifier(documents, 'gold', create_dataset, hyperparams)

Finding best classifier for gold
----------
Accuracy using "To lower: no, Lemmatize: no, No stopwords: no": 93.79%
Accuracy using "To lower: yes, Lemmatize: no, No stopwords: no": 95.09%
Accuracy using "To lower: no, Lemmatize: yes, No stopwords: no": 96.71%
Accuracy using "To lower: no, Lemmatize: no, No stopwords: yes": 98.75%
Accuracy using "To lower: yes, Lemmatize: no, No stopwords: yes": 98.42%
Accuracy using "To lower: no, Lemmatize: yes, No stopwords: yes": 98.05%
Accuracy using "To lower: yes, Lemmatize: yes, No stopwords: no": 98.19%
Accuracy using "To lower: yes, Lemmatize: yes, No stopwords: yes": 98.56%


In [22]:
def ref_test_sets(testset, classifier):
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(testset):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    return refsets, testsets

In [31]:
moneyfx_refsets, moneyfx_testsets = ref_test_sets(moneyfx_testset, classifier_moneyfx)
wheat_refsets, wheat_testsets = ref_test_sets(wheat_testset, classifier_wheat)
gold_refsets, gold_testsets = ref_test_sets(gold_testset, classifier_gold)

In [32]:
print('Money-fx:')
print('---------')
print('Precision:', precision(moneyfx_refsets[True], moneyfx_testsets[True]))
print('Recall:'   , recall(moneyfx_refsets[True], moneyfx_testsets[True]))
print('F-mesure:' , f_measure(moneyfx_refsets[True], moneyfx_testsets[True]))

print()

print('Wheat:')
print('---------')
print('Precision:', precision(wheat_refsets[True], wheat_testsets[True]))
print('Recall:'   , recall(wheat_refsets[True], wheat_testsets[True]))
print('F-mesure:' , f_measure(wheat_refsets[True], wheat_testsets[True]))

print()

print('Gold:')
print('---------')
print('Precision:', precision(gold_refsets[True], gold_testsets[True]))
print('Recall:'   , recall(gold_refsets[True], gold_testsets[True]))
print('F-mesure:' , f_measure(gold_refsets[True], gold_testsets[True]))

Money-fx:
---------
Precision: 0.3639344262295082
Recall: 0.8283582089552238
F-mesure: 0.5056947608200456

Wheat:
---------
Precision: 0.32098765432098764
Recall: 0.9285714285714286
F-mesure: 0.4770642201834862

Gold:
---------
Precision: 0.5306122448979592
Recall: 0.896551724137931
F-mesure: 0.6666666666666666


## Classifieur multiclasse

In [11]:
def create_multi_dataset(documents, tags, feature_extractor, **kwargs):
    if 'to_lower' in kwargs and kwargs['to_lower']:
        documents = list(map(lambda d: (list(map(str.lower, d[0])), d[1]), documents))

    if 'lemmatizer' in kwargs:
        lemmatizer = kwargs['lemmatizer']
        documents = list(map(lambda d: (list(map(lemmatizer.lemmatize, d[0])), d[1]), documents))
    
    if 'stopwords' in kwargs:
        stopwords = set(kwargs['stopwords'])
        documents = list(map(
            lambda d: (
                list(filter(lambda w: not w.lower() in stopwords and w[0].isalnum(), d[0])), 
                d[1]
            ), documents))
        
    analyzer_res = []
    if 'analyzer' in kwargs:
        analyzer_res = kwargs['analyzer'](documents)

    dataset = []
    for document in documents:
        document_tags = list(set(tags).intersection(document[1]))
        tag = 'other' if document_tags == [] else document_tags[0]

        dataset.append((feature_extractor(document[0], analyzer_res), tag))
    
    shuffle(dataset)
    return dataset

In [19]:
params = [{
    'title': 'mdr',
    'feature_extractor': document_features,
    'analyzer': most_freq_words,
}]

f = ['money-fx', 'wheat', 'gold']
#create_multi_dataset(documents, f, **params)[0]

In [21]:
classifier_multiclass, multiclass_testset = best_classifier(documents, ['money-fx', 'wheat', 'gold'], create_multi_dataset, hyperparams)

Finding best classifier for ['money-fx', 'wheat', 'gold']
----------
Accuracy using "To lower: no, Lemmatize: no, No stopwords: no": 80.77%
Accuracy using "To lower: yes, Lemmatize: no, No stopwords: no": 82.90%
Accuracy using "To lower: no, Lemmatize: yes, No stopwords: no": 81.84%
Accuracy using "To lower: no, Lemmatize: no, No stopwords: yes": 84.62%
Accuracy using "To lower: yes, Lemmatize: no, No stopwords: yes": 86.42%
Accuracy using "To lower: no, Lemmatize: yes, No stopwords: yes": 85.31%
Accuracy using "To lower: yes, Lemmatize: yes, No stopwords: no": 80.63%
Accuracy using "To lower: yes, Lemmatize: yes, No stopwords: yes": 86.93%


In [25]:
# micro / macro average?
# calculate precision, recall & f-score for each tag in multi class
# compare result with corresponding binary classifier