# TAL - Classification de dépêches d’agence avec NLTK

In [1]:
# importing modules
import nltk
import string
import collections

from nltk.corpus import reuters
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from nltk.metrics.scores import (precision, recall, f_measure)

from random import shuffle

In [76]:
# Extract fileids from the reuters corpus
fileids = reuters.fileids()
documents = []
# Loop through each file id and collect each files categories and tokenized words
for file in fileids:
    words = reuters.words(file)
    documents.append((words, reuters.categories(file)))

shuffle(documents)
documents[0]

(['JAPAN', 'BUYS', 'CANADIAN', 'RAPESEED', 'Japanese', ...],
 ['oilseed', 'rapeseed'])

> Note: We've noticed that in the tokenized words of the corpus, the word `U.S` is split into three tokens

## Classifieurs binaire

In [77]:
def document_features(document, word_frequence):
    document_words = set(document)
    features = {}
    for word in word_frequence:
        features['contains({})'.format(word)] = (word in document_words)
    return features

def most_freq_words(documents, limit=2000):
    all_words = nltk.FreqDist(w
        for document in documents
        for w in document[0]
    )
    return list(all_words)[:limit]

In [78]:
def create_dataset(documents, tag, feature_extractor, **kwargs):
    if 'to_lower' in kwargs and kwargs['to_lower']:
        documents = list(map(lambda d: (list(map(str.lower, d[0])), d[1]), documents))

    if 'lemmatizer' in kwargs:
        lemmatizer = kwargs['lemmatizer']
        documents = list(map(lambda d: (list(map(lemmatizer.lemmatize, d[0])), d[1]), documents))
    
    if 'stopwords' in kwargs:
        stopwords = set(kwargs['stopwords'])
        documents = list(map(
            lambda d: (
                list(filter(lambda w: not w.lower() in stopwords and w[0].isalnum(), d[0])), 
                d[1]
            ), documents))
        
    analyzer_res = []
    if 'analyzer' in kwargs:
        analyzer_res = kwargs['analyzer'](documents)

    dataset = []
    for document in documents:
        dataset.append((feature_extractor(document[0], analyzer_res), tag in document[1]))
    
    shuffle(dataset)
    return dataset

def split_dataset(dataset):
    split_ratio = 0.6
    split_ratio2 = 0.8
    
    split = int(len(dataset) * split_ratio)
    split2 = int(len(dataset) * split_ratio2)

    return (dataset[:split], dataset[split:split2], dataset[split2:])

In [79]:
def best_classifier(documents, tag, hyperparams):
    print('Finding best classifier for {}'.format(tag))
    print('----------')

    best = (None, 0.0)
    for hyperparam in hyperparams:
        dataset = create_dataset(documents, 'money-fx', **hyperparam)
        train_set, test_set, dev_set = split_dataset(dataset)
        classifier = nltk.NaiveBayesClassifier.train(train_set)
        acc = nltk.classify.accuracy(classifier, dev_set)
        
        if acc > best[1]:
            best = (classifier, acc)
        
        print('Accuracy using "{}": {:.2f}%'.format(hyperparam['title'], acc*100))
    return (best[0], test_set)

### Classification des documents `money-fx`

In [89]:
hyperparams = [
    {
        'title': 'Most frequent words',
        'feature_extractor': document_features,
        'analyzer': most_freq_words,
    },
    {
        'title': 'Lowered most frequent words',
        'feature_extractor': document_features,
        'analyzer': most_freq_words,
        'to_lower': True,
    },
    {
        'title': 'Lemmatized most frequent words',
        'feature_extractor': document_features,
        'analyzer': most_freq_words,
        'lemmatizer': WordNetLemmatizer(),
    },
    {
        'title': 'Most frequent words without stopwords',
        'feature_extractor': document_features,
        'analyzer': most_freq_words,
        'stopwords': stopwords.words('english'),
    },
]

best_classifier(documents, 'money-supply', hyperparams)
print()

Finding best classifier for money-supply
----------
Accuracy using "Most frequent words": 89.57%
Accuracy using "Lowered most frequent words": 88.55%
Accuracy using "Lemmatized most frequent words": 88.00%
Accuracy using "Most frequent words without stopwords": 90.64%



In [90]:
hyperparams_wo_sw = [
    {
        'title': 'Lowered most frequent words without stopwords',
        'feature_extractor': document_features,
        'analyzer': most_freq_words,
        'to_lower': True,
        'stopwords': stopwords.words('english'),
    },
    {
        'title': 'Lemmatized most frequent words without stopwords',
        'feature_extractor': document_features,
        'analyzer': most_freq_words,
        'lemmatizer': WordNetLemmatizer(),
        'stopwords': stopwords.words('english'),
    },
    {
        'title': 'Lowered lemmatized most frequent words without stopwords',
        'feature_extractor': document_features,
        'analyzer': most_freq_words,
        'to_lower': True,
        'stopwords': stopwords.words('english'),
        'lemmatizer': WordNetLemmatizer(),
    },
]

classifier_moneyfx, moneyfx_testset = best_classifier(documents, 'money-supply', hyperparams_wo_sw)

Finding best classifier for money-supply
----------
Accuracy using "Lowered most frequent words without stopwords": 90.59%
Accuracy using "Lemmatized most frequent words without stopwords": 92.96%
Accuracy using "Lowered lemmatized most frequent words without stopwords": 91.57%


### Classification des documents `wheat`

In [82]:
hyperparams = [
    {
        'title': 'Most frequent words',
        'feature_extractor': document_features,
        'analyzer': most_freq_words,
    },
    {
        'title': 'Lowered most frequent words',
        'feature_extractor': document_features,
        'analyzer': most_freq_words,
        'to_lower': True,
    },
    {
        'title': 'Lemmatized most frequent words',
        'feature_extractor': document_features,
        'analyzer': most_freq_words,
        'lemmatizer': WordNetLemmatizer(),
    },
    {
        'title': 'Most frequent words without stopwords',
        'feature_extractor': document_features,
        'analyzer': most_freq_words,
        'stopwords': stopwords.words('english'),
    },
]

best_classifier(documents, 'wheat', hyperparams)
print()

Finding best classifier for wheat
----------
Accuracy using "Most frequent words": 89.39%
Accuracy using "Lowered most frequent words": 89.06%
Accuracy using "Lemmatized most frequent words": 89.25%
Accuracy using "Most frequent words without stopwords": 91.43%



In [83]:
hyperparams_wo_sw = [
    {
        'title': 'Lowered most frequent words without stopwords',
        'feature_extractor': document_features,
        'analyzer': most_freq_words,
        'to_lower': True,
        'stopwords': stopwords.words('english'),
    },
    {
        'title': 'Lemmatized most frequent words without stopwords',
        'feature_extractor': document_features,
        'analyzer': most_freq_words,
        'lemmatizer': WordNetLemmatizer(),
        'stopwords': stopwords.words('english'),
    },
    {
        'title': 'Lowered lemmatized most frequent words without stopwords',
        'feature_extractor': document_features,
        'analyzer': most_freq_words,
        'to_lower': True,
        'stopwords': stopwords.words('english'),
        'lemmatizer': WordNetLemmatizer(),
    },
]

classifier_wheat, wheat_testset = best_classifier(documents, 'wheat', hyperparams_wo_sw)

Finding best classifier for wheat
----------
Accuracy using "Lowered most frequent words without stopwords": 90.50%
Accuracy using "Lemmatized most frequent words without stopwords": 91.33%
Accuracy using "Lowered lemmatized most frequent words without stopwords": 91.15%


### Classification des documents `gold`

In [84]:
hyperparams = [
    {
        'title': 'Most frequent words',
        'feature_extractor': document_features,
        'analyzer': most_freq_words,
    },
    {
        'title': 'Lowered most frequent words',
        'feature_extractor': document_features,
        'analyzer': most_freq_words,
        'to_lower': True,
    },
    {
        'title': 'Lemmatized most frequent words',
        'feature_extractor': document_features,
        'analyzer': most_freq_words,
        'lemmatizer': WordNetLemmatizer(),
    },
    {
        'title': 'Most frequent words without stopwords',
        'feature_extractor': document_features,
        'analyzer': most_freq_words,
        'stopwords': stopwords.words('english'),
    },
]

best_classifier(documents, 'gold', hyperparams)
print()

Finding best classifier for gold
----------
Accuracy using "Most frequent words": 88.97%
Accuracy using "Lowered most frequent words": 88.37%
Accuracy using "Lemmatized most frequent words": 87.49%
Accuracy using "Most frequent words without stopwords": 90.22%



In [85]:
hyperparams_wo_sw = [
    {
        'title': 'Lowered most frequent words without stopwords',
        'feature_extractor': document_features,
        'analyzer': most_freq_words,
        'to_lower': True,
        'stopwords': stopwords.words('english'),
    },
    {
        'title': 'Lemmatized most frequent words without stopwords',
        'feature_extractor': document_features,
        'analyzer': most_freq_words,
        'lemmatizer': WordNetLemmatizer(),
        'stopwords': stopwords.words('english'),
    },
    {
        'title': 'Lowered lemmatized most frequent words without stopwords',
        'feature_extractor': document_features,
        'analyzer': most_freq_words,
        'to_lower': True,
        'stopwords': stopwords.words('english'),
        'lemmatizer': WordNetLemmatizer(),
    },
]

classifier_gold, gold_testset = best_classifier(documents, 'gold', hyperparams_wo_sw)

Finding best classifier for gold
----------
Accuracy using "Lowered most frequent words without stopwords": 91.66%
Accuracy using "Lemmatized most frequent words without stopwords": 91.47%
Accuracy using "Lowered lemmatized most frequent words without stopwords": 92.08%


In [86]:
def ref_test_sets(testset, classifier):
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(testset):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    return refsets, testsets

In [91]:
moneyfx_refsets, moneyfx_testsets = ref_test_sets(moneyfx_testset, classifier_moneyfx)
wheat_refsets, wheat_testsets = ref_test_sets(wheat_testset, classifier_wheat)
gold_refsets, gold_testsets = ref_test_sets(gold_testset, classifier_gold)

In [92]:
print('Money-fx:')
print('---------')
print('Precision:', precision(moneyfx_refsets[True], moneyfx_testsets[True]))
print('Recall:'   , recall(moneyfx_refsets[True], moneyfx_testsets[True]))
print('F-mesure:' , f_measure(moneyfx_refsets[True], moneyfx_testsets[True]))

print()

print('Wheat:')
print('---------')
print('Precision:', precision(wheat_refsets[True], wheat_testsets[True]))
print('Recall:'   , recall(wheat_refsets[True], wheat_testsets[True]))
print('F-mesure:' , f_measure(wheat_refsets[True], wheat_testsets[True]))

print()

print('Gold:')
print('---------')
print('Precision:', precision(gold_refsets[True], gold_testsets[True]))
print('Recall:'   , recall(gold_refsets[True], gold_testsets[True]))
print('F-mesure:' , f_measure(gold_refsets[True], gold_testsets[True]))

Money-fx:
---------
Precision: 0.34563758389261745
Recall: 0.7744360902255639
F-mesure: 0.47795823665893267

Wheat:
---------
Precision: 0.38924050632911394
Recall: 0.7884615384615384
F-mesure: 0.5211864406779662

Gold:
---------
Precision: 0.36824324324324326
Recall: 0.8257575757575758
F-mesure: 0.5093457943925235


## Classifieur multiclasse