In [4]:
import random
import pprint
import numpy as np
from nltk.corpus.europarl_raw import german, italian, english, french

pp = pprint.PrettyPrinter(indent=4)

def print_ndistinct_words(word_list):
    """
    Print how many distinct items ``word_list`` contains.

    ``word_list`` is any iterable of hashable items; duplicates are
    collapsed by building a set before counting.
    """
    n_distinct = len(set(word_list))
    message = 'Number of distinct words: {}'
    print(message.format(n_distinct))

    
def sample_data(data, n):
    """
    Return a random sample of ``n`` distinct elements of ``data`` as a list.

    ``data`` may be a sequence (list, tuple, ...) or any other finite
    iterable collection such as a set or a dict view.

    Raises ``ValueError`` (from ``random.sample``) when ``n`` is negative or
    larger than the number of elements in ``data``.
    """
    from collections.abc import Sequence

    # random.sample() only accepts sequences: passing a set is deprecated
    # since Python 3.9 and raises TypeError from 3.11 on, so materialize any
    # non-sequence population first.
    population = data if isinstance(data, Sequence) else list(data)
    return random.sample(population, n)
# get unique words
# Build each vocabulary once so that every corpus is read a single time; the
# sets are reused for the counts, the samples and the labelled dataset.
german_words = set(german.words())
italian_words = set(italian.words())
english_words = set(english.words())
french_words = set(french.words())
vocabularies = (german_words, italian_words, english_words, french_words)

# show lengths (order: german, italian, english, french)
for vocabulary in vocabularies:
    print_ndistinct_words(vocabulary)

# show a bit of data
# random.sample() needs a sequence (sets are rejected on Python 3.11+), so
# sample from a sorted list of each vocabulary.
for vocabulary in vocabularies:
    pp.pprint(sample_data(sorted(vocabulary), 5))

Number of distinct words: 31230
Number of distinct words: 24375
Number of distinct words: 16495
Number of distinct words: 21809
['stammenden', 'Zinsen', 'transparentes', 'Terroranschläge', '+']
['caldo', 'irto', 'misto', 'kosovara', 'terzo']
['team', 'Akkuyu', 'alleviation', 'peninsula', 'cosmopolitan']
['ravages', 'lever', 'tenez', 'instituer', 'réinvesties']


In [5]:
# prepare labelled dataset
# Every token becomes a (label, token) pair, where the label is the position
# of its language in this order: 0=german, 1=italian, 2=english, 3=french.
german_data, italian_data, english_data, french_data = (
    [(label, token) for token in vocabulary]
    for label, vocabulary in enumerate(
        (german_words, italian_words, english_words, french_words)
    )
)

# Concatenate in label order; a token present in several vocabularies
# appears once per language, each time with that language's label.
full_data = [*german_data, *italian_data, *english_data, *french_data]
print(len(full_data))
pp.pprint(sample_data(full_data, 20))

93909
[   (0, 'EU-Perspektive'),
    (1, 'alchimisti'),
    (0, 'Velzen'),
    (3, 'contraignent'),
    (1, 'Bassa-Normandia'),
    (1, 'Creatore'),
    (1, 'sottoporgli'),
    (0, 'gefoltert'),
    (0, 'Personen'),
    (1, 'incomprensibile'),
    (1, 'acute'),
    (2, 'repatriating'),
    (0, 'Schäden'),
    (3, 'Europe-Afrique'),
    (1, 'conforme'),
    (1, 'risposto'),
    (3, 'dupés'),
    (0, 'Transportbereich'),
    (3, '350'),
    (3, 'attestations')]


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
# Features: bag of character n-grams (2- and 3-grams, case preserved).
# Split the (label, token) pairs into the raw tokens and the label vector.
corpus = [token for _, token in full_data]
Y = np.array([label for label, _ in full_data])

cv = CountVectorizer(
    analyzer='char',
    ngram_range=(2, 3),
    lowercase=False,
    preprocessor=None,
)

# NOTE(review): the n-gram vocabulary is fitted on the whole corpus here,
# i.e. before any train/test split is made.
X = cv.fit_transform(corpus)

print(type(X)) #sparse matrix
print(X.shape)
print(Y.shape)

<class 'scipy.sparse.csr.csr_matrix'>
(93909, 16006)
(93909,)


In [7]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

def compute_score(y_true, y_pred):
    """
    Return the macro-averaged F1 score of ``y_pred`` against ``y_true``.

    Thin wrapper around ``sklearn.metrics.f1_score`` with ``average='macro'``.
    """
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    return macro_f1

# Pin the estimator's random state so the fold scores are repeatable from one
# run of the notebook to the next.
clf = LinearSVC(random_state=0)

# 3-fold cross-validation scored with macro-averaged F1 (one score per fold).
scores = cross_val_score(clf, X, Y, cv=3, scoring='f1_macro')
print(scores)

# Mean and standard deviation of the per-fold scores.
print(np.mean(scores), np.std(scores))

[0.81025324 0.80848333 0.8060338 ]
0.8082567890145214 0.0017300137990102621
