# TAL - Classification de dépêches d’agence avec NLTK

In [1]:
# importing modules
import nltk
import string
import collections

from nltk.corpus import reuters
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from nltk.metrics.scores import (precision, recall, f_measure)

from random import shuffle

In [2]:
# Pair every Reuters document's token sequence with its list of categories
fileids = reuters.fileids()
documents = [
    (reuters.words(fileid), reuters.categories(fileid))
    for fileid in fileids
]

# Randomise the document order in place, then display one (words, categories) sample
shuffle(documents)
documents[0]

(['U', '.', 'S', '.', 'SUGAR', 'QUOTA', 'MAY', 'BE', ...], ['sugar'])

> Note: in the tokenized words of the corpus, the abbreviation `U.S.` is split into four separate tokens (`U`, `.`, `S`, `.`).

## Classifieurs binaires

TODO Explain features we're using

In [14]:
def document_features(document, word_frequence):
    """Build boolean "word is present" features for one document.

    Args:
        document: iterable of tokens making up the document.
        word_frequence: iterable of candidate words; one feature is produced
            per candidate.

    Returns:
        dict mapping 'contains(<word>)' to True when <word> occurs in
        `document`, False otherwise.
    """
    # A set makes each membership test constant-time
    present = set(document)
    return {
        'contains({})'.format(candidate): candidate in present
        for candidate in word_frequence
    }

def most_freq_words(documents, limit=2000):
    """Return the most frequent words across all documents.

    Args:
        documents: iterable of documents; the token sequence is read from
            index 0 of each document (e.g. a (words, categories) tuple).
        limit: maximum number of words to return (slice bound, default 2000).

    Returns:
        list of words ordered from most to least frequent, truncated to `limit`.
    """
    all_words = nltk.FreqDist(
        w
        for document in documents
        for w in document[0]
    )
    # Rank explicitly by count instead of relying on the iteration order of
    # FreqDist, so the result is frequency-sorted by construction.
    ranked = [word for word, _count in all_words.most_common()]
    return ranked[:limit]

In [15]:
def create_dataset(documents, tag, feature_extractor, **kwargs):
    """Turn (words, categories) documents into a shuffled binary dataset.

    Optional preprocessing is driven by keyword arguments and applied in this
    order: lower-casing, lemmatization, stop-word filtering.

    Args:
        documents: sequence of documents; index 0 holds the tokens and
            index 1 the categories.
        tag: category to detect; the label is True when it is among the
            document's categories.
        feature_extractor: callable(tokens, analyzer_result) -> features.
        **kwargs:
            to_lower: when truthy, every token is lower-cased.
            lemmatizer: object whose `lemmatize(token)` is applied to each token.
            stopwords: iterable of words to drop; when given, tokens whose
                first character is not alphanumeric are dropped as well.
            analyzer: callable(documents) run on the preprocessed documents;
                its result is handed to `feature_extractor` (an empty list
                is used when absent).
            Any other keyword is ignored.

    Returns:
        list of (features, bool) pairs in random order.
    """
    def _map_tokens(transform, docs):
        # Apply `transform` to every token while keeping the categories untouched
        return [([transform(token) for token in doc[0]], doc[1]) for doc in docs]

    if kwargs.get('to_lower'):
        documents = _map_tokens(str.lower, documents)

    if 'lemmatizer' in kwargs:
        documents = _map_tokens(kwargs['lemmatizer'].lemmatize, documents)

    if 'stopwords' in kwargs:
        excluded = set(kwargs['stopwords'])
        documents = [
            (
                [
                    token for token in doc[0]
                    # Stop words are matched case-insensitively; the second test
                    # discards punctuation-like tokens
                    if token.lower() not in excluded and token[0].isalnum()
                ],
                doc[1],
            )
            for doc in documents
        ]

    analyzer_res = kwargs['analyzer'](documents) if 'analyzer' in kwargs else []

    dataset = [
        (feature_extractor(doc[0], analyzer_res), tag in doc[1])
        for doc in documents
    ]

    shuffle(dataset)
    return dataset

def split_dataset(dataset, split_ratio=0.6, split_ratio2=0.8):
    """Cut a dataset into three consecutive, non-overlapping parts.

    Args:
        dataset: sliceable sequence with a length.
        split_ratio: fraction of the dataset at which the first part ends
            (default 0.6).
        split_ratio2: fraction of the dataset at which the second part ends
            (default 0.8); the third part is whatever remains.

    Returns:
        tuple (first, second, third) of slices of `dataset`. With the defaults
        these hold about 60 %, 20 % and 20 % of the items.

    Raises:
        ValueError: if the ratios do not satisfy
            0 <= split_ratio <= split_ratio2 <= 1.
    """
    if not 0 <= split_ratio <= split_ratio2 <= 1:
        raise ValueError(
            'expected 0 <= split_ratio <= split_ratio2 <= 1, got {} and {}'.format(
                split_ratio, split_ratio2))

    # Cut points are truncated towards zero so that they are valid indices
    split = int(len(dataset) * split_ratio)
    split2 = int(len(dataset) * split_ratio2)

    return (dataset[:split], dataset[split:split2], dataset[split2:])

In [26]:
def best_classifier(documents, tag, hyperparams):
    """Train one Naive Bayes classifier per configuration and keep the best.

    For every configuration a dataset is built with `create_dataset`, split
    with `split_dataset`, trained on the first part and scored (accuracy) on
    the third part, which serves as development set. The second part is the
    held-out test set.

    Args:
        documents: documents passed on to `create_dataset`.
        tag: category the binary classifier has to detect.
        hyperparams: iterable of dicts; each is expanded as keyword arguments
            of `create_dataset` and must contain a 'title' used for reporting.

    Returns:
        tuple (classifier, test_set) where `classifier` has the highest
        development accuracy (first one wins on ties) and `test_set` is the
        test split built with that same configuration. (None, None) when
        `hyperparams` is empty.
    """
    print('Finding best classifier for {}'.format(tag))
    print('----------')

    best_model = None
    best_acc = None
    best_test_set = None
    for hyperparam in hyperparams:
        dataset = create_dataset(documents, tag, **hyperparam)
        train_set, test_set, dev_set = split_dataset(dataset)
        classifier = nltk.NaiveBayesClassifier.train(train_set)
        acc = nltk.classify.accuracy(classifier, dev_set)

        # Each configuration reshuffles the documents and extracts its own
        # features, so the test set must be remembered together with the
        # classifier: another configuration's test set would have different
        # features and could contain documents this classifier was trained on.
        if best_acc is None or acc > best_acc:
            best_model = classifier
            best_acc = acc
            best_test_set = test_set

        print('Accuracy using "{}": {:.2f}%'.format(hyperparam['title'], acc*100))
    return (best_model, best_test_set)

### Combination of possible hyperparameters

TODO explain the hyperparam we're going to use

In [20]:
def _hyperparam(to_lower, lemmatize, remove_stopwords):
    """Build one `create_dataset` configuration from three on/off switches."""
    yes_no = {False: 'no', True: 'yes'}
    config = {
        'title': 'To lower: {}, Lemmatize: {}, No stopwords: {}'.format(
            yes_no[to_lower], yes_no[lemmatize], yes_no[remove_stopwords]),
        'feature_extractor': document_features,
        'analyzer': most_freq_words,
    }
    # A preprocessing key is only present when the matching switch is on
    if to_lower:
        config['to_lower'] = True
    if lemmatize:
        config['lemmatizer'] = WordNetLemmatizer()
    if remove_stopwords:
        config['stopwords'] = stopwords.words('english')
    return config


# All eight (to_lower, lemmatize, remove_stopwords) combinations, in evaluation order
hyperparams = [
    _hyperparam(to_lower, lemmatize, remove_stopwords)
    for to_lower, lemmatize, remove_stopwords in [
        (False, False, False),
        (True, False, False),
        (False, True, False),
        (False, False, True),
        (True, False, True),
        (False, True, True),
        (True, True, False),
        (True, True, True),
    ]
]

### Classification des documents `money-fx`
TODO Comment results

In [28]:
# Try every configuration in `hyperparams` for the `money-fx` tag and keep the
# selected classifier together with the test set returned alongside it
classifier_moneyfx, moneyfx_testset = best_classifier(documents, 'money-fx', hyperparams)

Finding best classifier for money-fx
----------
Accuracy using "To lower: no, Lemmatize: no, No stopwords: no": 88.69%
Accuracy using "To lower: yes, Lemmatize: no, No stopwords: no": 88.32%
Accuracy using "To lower: no, Lemmatize: yes, No stopwords: no": 87.67%
Accuracy using "To lower: no, Lemmatize: no, No stopwords: yes": 91.38%
Accuracy using "To lower: yes, Lemmatize: no, No stopwords: yes": 90.50%
Accuracy using "To lower: no, Lemmatize: yes, No stopwords: yes": 89.94%
Accuracy using "To lower: yes, Lemmatize: yes, No stopwords: no": 87.77%
Accuracy using "To lower: yes, Lemmatize: yes, No stopwords: yes": 91.15%


### Classification des documents `wheat`
TODO Comment results

In [29]:
# Try every configuration in `hyperparams` for the `wheat` tag and keep the
# selected classifier together with the test set returned alongside it
classifier_wheat, wheat_testset = best_classifier(documents, 'wheat', hyperparams)

Finding best classifier for wheat
----------
Accuracy using "To lower: no, Lemmatize: no, No stopwords: no": 91.01%
Accuracy using "To lower: yes, Lemmatize: no, No stopwords: no": 89.76%
Accuracy using "To lower: no, Lemmatize: yes, No stopwords: no": 91.57%
Accuracy using "To lower: no, Lemmatize: no, No stopwords: yes": 93.70%
Accuracy using "To lower: yes, Lemmatize: no, No stopwords: yes": 95.13%
Accuracy using "To lower: no, Lemmatize: yes, No stopwords: yes": 93.33%
Accuracy using "To lower: yes, Lemmatize: yes, No stopwords: no": 90.64%
Accuracy using "To lower: yes, Lemmatize: yes, No stopwords: yes": 93.10%


### Classification des documents `gold`
TODO Comment results

In [30]:
# Try every configuration in `hyperparams` for the `gold` tag and keep the
# selected classifier together with the test set returned alongside it
classifier_gold, gold_testset = best_classifier(documents, 'gold', hyperparams)

Finding best classifier for gold
----------
Accuracy using "To lower: no, Lemmatize: no, No stopwords: no": 94.07%
Accuracy using "To lower: yes, Lemmatize: no, No stopwords: no": 95.46%
Accuracy using "To lower: no, Lemmatize: yes, No stopwords: no": 95.27%
Accuracy using "To lower: no, Lemmatize: no, No stopwords: yes": 99.12%
Accuracy using "To lower: yes, Lemmatize: no, No stopwords: yes": 97.36%
Accuracy using "To lower: no, Lemmatize: yes, No stopwords: yes": 96.94%
Accuracy using "To lower: yes, Lemmatize: yes, No stopwords: no": 96.48%
Accuracy using "To lower: yes, Lemmatize: yes, No stopwords: yes": 98.01%


In [22]:
def ref_test_sets(testset, classifier):
    """Group test-item indices by expected label and by predicted label.

    Args:
        testset: iterable of (features, label) pairs.
        classifier: object exposing `classify(features)`.

    Returns:
        tuple (refsets, testsets) of defaultdict(set): `refsets[label]` holds
        the positions of the items whose reference label is `label`,
        `testsets[label]` the positions of the items classified as `label`.
    """
    expected_by_label = collections.defaultdict(set)
    predicted_by_label = collections.defaultdict(set)

    position = 0
    for features, expected in testset:
        expected_by_label[expected].add(position)
        predicted_by_label[classifier.classify(features)].add(position)
        position += 1

    return expected_by_label, predicted_by_label

In [31]:
# For each binary classifier, index its test set by reference label and by
# predicted label; these sets are the inputs of the precision/recall/F-measure scores
moneyfx_refsets, moneyfx_testsets = ref_test_sets(moneyfx_testset, classifier_moneyfx)
wheat_refsets, wheat_testsets = ref_test_sets(wheat_testset, classifier_wheat)
gold_refsets, gold_testsets = ref_test_sets(gold_testset, classifier_gold)

In [32]:
# One (heading, reference sets, predicted sets) triple per binary classifier
score_reports = [
    ('Money-fx:', moneyfx_refsets, moneyfx_testsets),
    ('Wheat:', wheat_refsets, wheat_testsets),
    ('Gold:', gold_refsets, gold_testsets),
]

for position, (heading, reference, predicted) in enumerate(score_reports):
    if position > 0:
        print()  # blank line between two consecutive reports

    print(heading)
    print('---------')
    # Scores are computed for the positive class only (label True)
    print('Precision:', precision(reference[True], predicted[True]))
    print('Recall:', recall(reference[True], predicted[True]))
    print('F-mesure:', f_measure(reference[True], predicted[True]))

Money-fx:
---------
Precision: 0.3639344262295082
Recall: 0.8283582089552238
F-mesure: 0.5056947608200456

Wheat:
---------
Precision: 0.32098765432098764
Recall: 0.9285714285714286
F-mesure: 0.4770642201834862

Gold:
---------
Precision: 0.5306122448979592
Recall: 0.896551724137931
F-mesure: 0.6666666666666666


## Classifieur multiclasse

In [7]:
def create_multi_dataset(documents, tags, feature_extractor, **kwargs):
    """Turn (words, categories) documents into a shuffled multi-class dataset.

    Optional preprocessing is driven by keyword arguments and applied in this
    order: lower-casing, lemmatization, stop-word filtering.

    Args:
        documents: sequence of documents; index 0 holds the tokens and
            index 1 the categories.
        tags: categories kept as classes, in priority order. A document
            carrying several of them is labelled with the one that comes
            first in `tags`; a document carrying none is labelled 'other'.
        feature_extractor: callable(tokens, analyzer_result) -> features.
        **kwargs:
            to_lower: when truthy, every token is lower-cased.
            lemmatizer: object whose `lemmatize(token)` is applied to each token.
            stopwords: iterable of words to drop; when given, tokens whose
                first character is not alphanumeric are dropped as well.
            analyzer: callable(documents) run on the preprocessed documents;
                its result is handed to `feature_extractor` (an empty list
                is used when absent).
            Any other keyword is ignored.

    Returns:
        list of (features, label) pairs in random order.
    """
    if 'to_lower' in kwargs and kwargs['to_lower']:
        documents = list(map(lambda d: (list(map(str.lower, d[0])), d[1]), documents))

    if 'lemmatizer' in kwargs:
        lemmatizer = kwargs['lemmatizer']
        documents = list(map(lambda d: (list(map(lemmatizer.lemmatize, d[0])), d[1]), documents))

    if 'stopwords' in kwargs:
        stopwords = set(kwargs['stopwords'])
        documents = list(map(
            lambda d: (
                # Stop words are matched case-insensitively; the second test
                # discards punctuation-like tokens
                list(filter(lambda w: w.lower() not in stopwords and w[0].isalnum(), d[0])),
                d[1]
            ), documents))

    analyzer_res = []
    if 'analyzer' in kwargs:
        analyzer_res = kwargs['analyzer'](documents)

    # Materialise once so that `tags` may be any iterable, including a generator
    tags = list(tags)

    dataset = []
    for document in documents:
        # Choose the label by priority in `tags`. Taking element 0 of a set
        # intersection would depend on string hashing and could label the
        # same multi-category document differently from one run to the next.
        tag = next((candidate for candidate in tags if candidate in document[1]), 'other')

        dataset.append((feature_extractor(document[0], analyzer_res), tag))

    shuffle(dataset)
    return dataset

In [12]:
# Minimal configuration: word-presence features over the most frequent words,
# without lower-casing, lemmatization or stop-word removal
params = {
    'feature_extractor': document_features,
    'analyzer': most_freq_words,
}

# Categories kept as classes; documents matching none of them are labelled 'other'
f = ['money-fx', 'wheat', 'gold']
# Display the first (features, label) pair of the shuffled multi-class dataset
create_multi_dataset(documents, f, **params)[0]

({'contains(.)': True,
  'contains(,)': False,
  'contains(the)': True,
  'contains(of)': True,
  'contains(to)': False,
  'contains(in)': True,
  'contains(said)': True,
  'contains(and)': True,
  'contains(a)': True,
  'contains(mln)': True,
  'contains(vs)': False,
  'contains(-)': False,
  'contains(for)': False,
  'contains(dlrs)': False,
  "contains(')": False,
  'contains(The)': True,
  'contains(000)': False,
  'contains(1)': False,
  'contains(s)': False,
  'contains(pct)': False,
  'contains(it)': True,
  'contains(;)': True,
  'contains(&)': True,
  'contains(lt)': True,
  'contains(on)': True,
  'contains(from)': False,
  'contains(cts)': False,
  'contains(is)': True,
  'contains(>)': True,
  'contains(that)': True,
  'contains(its)': True,
  'contains(by)': False,
  'contains(")': False,
  'contains(at)': False,
  'contains(2)': False,
  'contains(U)': True,
  'contains(S)': True,
  'contains(year)': True,
  'contains(be)': True,
  'contains(with)': True,
  'contains(will

In [25]:
# micro / macro average?
# calculate precision, recall & f-score for each tag in multi class
# compare result with corresponding binary classifier