# Assignment 1:
The assignment consists in the development, in NLTK, OpenNLP, SketchEngine or GATE/Annie a Naïve Bayes Classifier able to detect a single class in one of the corpora available as attachments to the chosen package, by distinguishing ENGLISH against NON-ENGLISH. In particular the classifier has to be:

- Trained on a split subset of the chosen corpus, by either using an existing partition between sample documents for training and for test or by using a random splitter among the available ones;

- Devised as a pipeline of any chosen format, including the simplest version based on word2vec on a list of words obtained by one of the available lexical resources.

In [1]:
#************************ IMPORTS ************************#
import nltk
import asyncio
import random
import math 
import collections
from tqdm import tqdm 
# from sklearn import f1 

In [2]:
from nltk.corpus import europarl_raw # Euro Parlamentars speeches 
from nltk.corpus import state_union as union # America's presidents Union Day speeches 
# CORPUS DATA 

# Creating iterators containing all the needed file ids
en_ids = [fileid for fileid in europarl_raw.english.fileids()]
dutch_ids = [fileid for fileid in europarl_raw.dutch.fileids()]
fr_ids = [fileid for fileid in europarl_raw.french.fileids()]
union_ids = [fileid for fileid in union.fileids()]

# Loading ENGLISH euro_parlcorpora and adding the English label 
documents= [(europarl_raw.english.raw(fileid), "English") for fileid in en_ids]

# Loading America's union speechs corpora and adding the English label 
for fileid in union_ids:
    documents.append((union.raw(fileid) , "English"))

# Loading FRENCH corpora and  label 
for fileid in fr_ids:
    documents.append((europarl_raw.french.raw(fileid) , "NonEnglish"))
    
# Loading DUTCH corpora and respective label 
for fileid in dutch_ids:
    documents.append((europarl_raw.dutch.raw(fileid) , "NonEnglish"))
    
random.shuffle(documents)
print("Done")

Done


In [3]:
# STOPWORDS 
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))  
stop_words.add(word for word in stopwords.words("french"))
stop_words.add(word for word in stopwords.words("dutch"))

# STEMMER 
from nltk.stem import PorterStemmer 
stemmer = PorterStemmer()  

# LEMMATIZER
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [4]:
# TOKENIZATION, LEMMATIZING, STEMMMING AND STOP WORDS REMOVAL 

from nltk.probability import FreqDist
from nltk.tokenize import sent_tokenize, word_tokenize
fdist = FreqDist() # freqdist to keep counting w instances for creating BOW 
data = [0 for _ in range(len(documents))]

for i,(text,label) in enumerate(tqdm(documents)):
    appo = ([],label)

    sents = sent_tokenize(text)
    for sent in sents:
        words = word_tokenize(sent) 
        for word in words:
            if word.casefold() not in stop_words:
                stemmed = stemmer.stem(word.lower()) # Stemming 
                lemmatized = lemmatizer.lemmatize(stemmed) # Lemmatization
                fdist[lemmatized] += 1 # Increases Word Counter inside the Bag of Words
                appo[0].append(lemmatized) # Saves the Result

    data[i] = appo 
top_words = list(fdist)[:2000]

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 95/95 [00:55<00:00,  1.72it/s]


In [5]:
# Feature Extraction: 
def feature_estractor(document,top_words):
    document_set = set(document)
    features = {}
    for word in top_words:
        features['contains({})'.format(word)] = (word in document_set)
    return features

featuresets = [(feature_estractor(d,top_words), c) for (d,c) in tqdm(data)]
train_test_split = math.floor(len(featuresets) * 0.7 )
train_set, test_set = featuresets[:train_test_split], featuresets[train_test_split:]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 95/95 [00:00<00:00, 410.55it/s]


In [6]:
classifier = nltk.NaiveBayesClassifier.train(train_set) 



In [7]:
from nltk.metrics.scores import (precision, recall)
from nltk.metrics import ConfusionMatrix
print("Testing and Metrics: ")
refsets =  collections.defaultdict(set)
testsets = collections.defaultdict(set)
labels = []
tests = []
for i,(feats,label) in enumerate(test_set):
    refsets[label].add(i)
    result = classifier.classify(feats)
    testsets[result].add(i)
    labels.append(label)
    tests.append(result)
    #print("True value: "+label+" Our value: "+result)
    
cm = ConfusionMatrix(labels, tests)
print("Accuracy:",nltk.classify.accuracy(classifier, test_set))
prec = precision(refsets['English'], testsets['English'])
print( 'Precision:', prec )
rec = recall(refsets['English'], testsets['English'])
print( 'Recall:', rec )
f1 = 2 *(prec*rec)/(prec+rec)
print("F1 score:", f1)
print(cm.pretty_format(sort_by_count=True, show_percents=True, truncate=9))
classifier.show_most_informative_features(35)


Testing and Metrics: 
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 score: 1.0
           |             N |
           |             o |
           |             n |
           |      E      E |
           |      n      n |
           |      g      g |
           |      l      l |
           |      i      i |
           |      s      s |
           |      h      h |
-----------+---------------+
   English | <86.2%>     . |
NonEnglish |      . <13.8%>|
-----------+---------------+
(row = reference; col = test)

Most Informative Features
          contains(also) = False          NonEng : Englis =     33.0 : 1.0
          contains(come) = False          NonEng : Englis =     33.0 : 1.0
       contains(countri) = False          NonEng : Englis =     33.0 : 1.0
           contains(day) = False          NonEng : Englis =     33.0 : 1.0
            contains(en) = True           NonEng : Englis =     33.0 : 1.0
         contains(everi) = False          NonEng : Englis =     33.0 : 1.0
          

In [8]:
lenght = len(test_set)
for i in range(lenght):
    test = test_set[i]
    print(f"Sample {i}: Our_result => {classifier.classify(test[0])} True_result => {test[1]}")
    if classifier.classify(test[0]) == test[1] :
        print("Right Value\n")
    else:
        print("Wrong Value\n")


Sample 0: Our_result => English True_result => English
Right Value

Sample 1: Our_result => English True_result => English
Right Value

Sample 2: Our_result => English True_result => English
Right Value

Sample 3: Our_result => English True_result => English
Right Value

Sample 4: Our_result => English True_result => English
Right Value

Sample 5: Our_result => English True_result => English
Right Value

Sample 6: Our_result => English True_result => English
Right Value

Sample 7: Our_result => NonEnglish True_result => NonEnglish
Right Value

Sample 8: Our_result => English True_result => English
Right Value

Sample 9: Our_result => English True_result => English
Right Value

Sample 10: Our_result => NonEnglish True_result => NonEnglish
Right Value

Sample 11: Our_result => English True_result => English
Right Value

Sample 12: Our_result => English True_result => English
Right Value

Sample 13: Our_result => English True_result => English
Right Value

Sample 14: Our_result => English