# TAL - Classification de dépêches d’agence avec NLTK

In [1]:
# importing modules
import nltk
import string

from nltk.corpus import reuters
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer

from random import shuffle

In [2]:
# Extract fileids from the reuters corpus
fileids = reuters.fileids()
documents = []
# Loop through each file id and collect each files categories and tokenized words
for file in fileids:
    words = reuters.words(file)
    documents.append((words, reuters.categories(file)))

shuffle(documents)
documents[0]

(['FLEET', '&', 'lt', ';', 'FLT', '>', 'COULD', 'FACE', ...], ['acq'])

> Note: We've noticed that in the tokenized words of the corpus, the word `U.S` is split into three tokens

## Classifieurs binaire

### Classification des documents `money-fx`

In [9]:
def document_features(document, word_frequence):
    document_words = set(document)
    features = {}
    for word in word_frequence:
        features['contains({})'.format(word)] = (word in document_words)
    return features

def most_freq_words(documents, limit=2000):
    all_words = nltk.FreqDist(w
        for document in documents
        for w in document[0]
    )
    return list(all_words)[:limit]

In [6]:
def create_dataset(documents, tag, feature_extractor, **kwargs):
    if 'to_lower' in kwargs and kwargs['to_lower']:
        documents = list(map(lambda d: (list(map(str.lower, d[0])), d[1]), documents))

    if 'lemmatizer' in kwargs:
        lemmatizer = kwargs['lemmatizer']
        documents = list(map(lambda d: (list(map(lemmatizer.lemmatize, d[0])), d[1]), documents))
    
    if 'stopwords' in kwargs:
        stopwords = set(kwargs['stopwords'])
        documents = list(map(
            lambda d: (
                list(filter(lambda w: not w.lower() in stopwords and w[0].isalnum(), d[0])), 
                d[1]
            ), documents))
        
    analyzer_res = []
    if 'analyzer' in kwargs:
        analyzer_res = kwargs['analyzer'](documents)

    dataset = []
    for document in documents:
        dataset.append((feature_extractor(document[0], analyzer_res), tag in document[1]))
    
    shuffle(dataset)
    return dataset

def split_dataset(dataset):
    split_ratio = 0.6
    split_ratio2 = 0.8
    
    split = int(len(dataset) * split_ratio)
    split2 = int(len(dataset) * split_ratio2)

    return (dataset[:split], dataset[split:split2], dataset[split2:])

In [13]:
hyperparams = [
    {
        'feature_extractor': document_features,
        'analyzer': most_freq_words,
    },
    {
        'feature_extractor': document_features,
        'analyzer': most_freq_words,
        'to_lower': True,
    },
    {
        'feature_extractor': document_features,
        'analyzer': most_freq_words,
        'lemmatizer': WordNetLemmatizer()
    },
    {
        'feature_extractor': document_features,
        'analyzer': most_freq_words,
        'stopwords': stopwords.words('english')
    },
]

for hyperparam in hyperparams:
    dataset = create_dataset(documents, 'money-fx', **hyperparam)
    train_set, test_set, dev_set = split_dataset(dataset)
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print(nltk.classify.accuracy(classifier, dev_set))

0.8730305838739574
0.8748841519925857


### Deuxième classifieur

### Troisième classifieur

## Classifieur multiclasse