# Assignment 1:
The assignment consists in the development, in NLTK, OpenNLP, SketchEngine or GATE/Annie a Naïve Bayes Classifier able to detect a single class in one of the corpora available as attachments to the chosen package, by distinguishing ENGLISH against NON-ENGLISH. In particular the classifier has to be:

- Trained on a split subset of the chosen corpus, by either using an existing partition between sample documents for training and for test or by using a random splitter among the available ones;

- Devised as a pipeline of any chosen format, including the simplest version based on word2vec on a list of words obtained by one of the available lexical resources.

In [1]:
#************************ IMPORTS ************************#
import nltk
import asyncio
import random
import math 
import collections
from tqdm import tqdm 

In [2]:
from nltk.corpus import europarl_raw
# CORPUS DATA 
en_ids = [fileid for fileid in europarl_raw.english.fileids()]
dutch_ids = [fileid for fileid in europarl_raw.dutch.fileids()]
fr_ids = [fileid for fileid in europarl_raw.french.fileids()]

# Loading ENGLISH corpora and respective label 
documents= [(europarl_raw.english.raw(fileid), "English") 
            for fileid in en_ids]
# Loading FRENCH corpora and respective label 
for fileid in fr_ids:
    documents.append((europarl_raw.french.raw(fileid) , "NonEnglish"))
# Loading DUTCH corpora and respective label 
for fileid in dutch_ids:
    documents.append((europarl_raw.dutch.raw(fileid) , "NonEnglish"))
    
random.shuffle(documents)
print("Done")

Done


In [3]:
# STOPWORDS 
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))  
stop_words.add(word for word in stopwords.words("french"))
stop_words.add(word for word in stopwords.words("dutch"))

# STEMMER 
from nltk.stem import PorterStemmer 
stemmer = PorterStemmer()  

# LEMMATIZER
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [4]:
# TOKENIZATION, LEMMATIZING, STEMMMING AND STOP WORDS REMOVAL 

from nltk.probability import FreqDist
from nltk.tokenize import sent_tokenize, word_tokenize
fdist = FreqDist() # freqdist to keep counting w instances for creating BOW 
data = [0 for _ in range(len(documents))]

for i,(text,label) in enumerate(tqdm(documents)):
    appo = ([],label)

    sents = sent_tokenize(text)
    for sent in sents:
        words = word_tokenize(sent) 
        for word in words:
            if word.casefold() not in stop_words:
                stemmed = stemmer.stem(word.lower()) # Stemming 
                lemmatized = lemmatizer.lemmatize(stemmed) # Lemmatization
                fdist[lemmatized] += 1 # Increases Word Counter inside the Bag of Words
                appo[0].append(lemmatized) # Saves the Result

    data[i] = appo 
top_words = list(fdist)[:2000]

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:44<00:00,  1.48s/it]

(['hervat', 'van', 'de', 'zit', 'ik', 'verklaar', 'de', 'zit', 'van', 'het', 'europe', 'parlement', ',', 'die', 'op', 'vrijdag', '21', 'januari', '2000', 'werd', 'onderbroken', ',', 'te', 'zijn', 'hervat', '.', 'mevrouw', 'de', 'voorzitt', ',', 'staat', 'u', 'mij', 'toe', 'erop', 'te', 'wijzen', 'dat', 'het', 'morgen', 'twee', 'jaar', 'geleden', 'dat', 'een', 'amerikaan', 'vliegtuig', 'van', 'de', 'navo-legerbasi', 'aviano', 'een', 'bloedbad', 'veroorzaakt', 'op', 'de', 'berg', 'cermi', 'bij', 'cavales', 'italië', 'door', 'tijden', 'een', 'oefenvlucht', 'bij', 'onvoldoend', 'hoogt', '-', 'lager', 'dan', 'de', 'veiligheidslimiet', '-', 'de', 'kabel', 'van', 'een', 'kabelbaan', 'door', 'te', 'snijden', '.', '20', 'europe', 'burger', 'vonden', 'hierbij', 'de', 'dood', '.', 'de', 'familieleden', 'van', 'de', 'slachtoff', 'hebben', 'nog', 'steed', 'geen', 'financiël', 'tegemoetkom', 'ontvangen', 'van', 'de', 'reger', 'van', 'de', 'verenigd', 'staten', '.', 'bovendien', 'de', 'verantwoordeli




In [5]:
# Feature Extraction: 
def feature_estractor(document,top_words):
    document_set = set(document)
    features = {}
    for word in top_words:
        features['contains({})'.format(word)] = (word in document_set)
    return features

featuresets = [(feature_estractor(d,top_words), c) for (d,c) in tqdm(data)]
train_test_split = math.floor(len(featuresets) * 0.7 )
train_set, test_set = featuresets[:train_test_split], featuresets[train_test_split:]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 237.97it/s]


In [6]:
classifier = nltk.NaiveBayesClassifier.train(train_set) 



In [7]:
from nltk.metrics.scores import (precision, recall)
from nltk.metrics import ConfusionMatrix
print("Testing and Metrics: ")
refsets =  collections.defaultdict(set)
testsets = collections.defaultdict(set)
labels = []
tests = []
for i,(feats,label) in enumerate(test_set):
    refsets[label].add(i)
    result = classifier.classify(feats)
    testsets[result].add(i)
    labels.append(label)
    tests.append(result)
    #print("True value: "+label+" Our value: "+result)
    
cm = ConfusionMatrix(labels, tests)
print("Accuracy:",nltk.classify.accuracy(classifier, test_set))
print( 'Precision:', precision(refsets['English'], testsets['English']) )
print( 'Recall:', recall(refsets['English'], testsets['English']) )
print(cm.pretty_format(sort_by_count=True, show_percents=True, truncate=9))
classifier.show_most_informative_features(35)


Testing and Metrics: 
True value: English Our value: English
True value: NonEnglish Our value: NonEnglish
True value: NonEnglish Our value: NonEnglish
True value: NonEnglish Our value: NonEnglish
True value: NonEnglish Our value: NonEnglish
True value: NonEnglish Our value: NonEnglish
True value: English Our value: NonEnglish
True value: English Our value: English
True value: NonEnglish Our value: NonEnglish
Accuracy: 0.8888888888888888
Precision: 1.0
Recall: 0.6666666666666666
           |      N        |
           |      o        |
           |      n        |
           |      E      E |
           |      n      n |
           |      g      g |
           |      l      l |
           |      i      i |
           |      s      s |
           |      h      h |
-----------+---------------+
NonEnglish | <66.7%>     . |
   English |  11.1% <22.2%>|
-----------+---------------+
(row = reference; col = test)

Most Informative Features
       contains(account) = True           Englis : Non

In [8]:

test = test_set[1]
print(classifier.classify(test[0]),test[1])
test = test_set[3]
print(classifier.classify(test[0]),test[1])
test = test_set[0]
print(classifier.classify(test[0]),test[1])

NonEnglish NonEnglish
NonEnglish NonEnglish
English English
