# Building the Classifier

## 1. Loading the corpus

In [207]:
import re
from nltk import sent_tokenize

cg_sents = []
smg_sents = []

def remove_duplicate_punctuation(s): # sent_tokenize() gets confused when there's duplicate punctuation 
    return(re.sub(r'(\.|\?|!)\1+', r'\1', s))
    
with open('./Data/cg_twitter.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    lines = [p for p in text.split('\n') if p] # sent_tokenize() doesn't consider a new line a new sentence so this is required.
    for line in lines:
        cg_sents += sent_tokenize(line)
    
with open('./Data/cg_fb.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        cg_sents += sent_tokenize(line)
    
with open('./Data/cg_other.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        cg_sents += sent_tokenize(line)

with open('./Data/smg_twitter.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        smg_sents += sent_tokenize(line)
    
with open('./Data/smg_fb.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        smg_sents += sent_tokenize(line)
    
with open('./Data/smg_other.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        smg_sents += sent_tokenize(line)

cg_sents[:3]

['Πρασινο αυκουι μες το πασχαλινο ποτήρι που έπιασε ο μιτσης #αισχος 🤣🤣🤣   @HARRIS_APOEL https://t.co/y9X7CmBEa5',
 '@HARRIS_APOEL @pirpoitis @vassrules Καμνουν ανακαινιση στα Περβολια φετος.',
 '@MUFCChristian Ελα συγγενη τζιαι εχουμε νεοτερα π το Νικολη.']

## 2. Cleaning the text

In [208]:
import unicodedata
from string import punctuation
from nltk.tokenize import WhitespaceTokenizer

def strip_accents(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')

def get_clean_sent_el(sentence):
    sentence = re.sub(r'^RT', '', sentence)
    sentence = re.sub(r'\&\w*;', '', sentence)
    sentence = re.sub(r'\@\w*', '', sentence)
    sentence = re.sub(r'\$\w*', '', sentence)
    sentence = re.sub(r'https?:\/\/.*\/\w*', '', sentence)
    sentence = ''.join(c for c in sentence if c <= '\uFFFF')
    sentence = strip_accents(sentence)
    sentence = re.sub(r'#\w*', '', sentence)
    tokens = WhitespaceTokenizer().tokenize(sentence)
    new_tokens = []
    for token in tokens:
        if token == 'ο,τι' or token == 'ό,τι' or token == 'o,ti' or token == 'ó,ti':
            new_tokens.append(token)
        else:
            new_tokens.append(token.translate(str.maketrans({key: None for key in punctuation})))
    sentence =' '.join(new_tokens)
    sentence = sentence.strip(' ') # performs lstrip() and rstrip()
    return sentence.lower()

cg_sents_clean = []
smg_sents_clean = []

for sent in cg_sents:
    cg_sents_clean.append(get_clean_sent_el(sent))
for sent in smg_sents:
    smg_sents_clean.append(get_clean_sent_el(sent))

# Remove empty strings left due to sentences ending up being only URLs then getting deleted on cleaning:
cg_sents_clean = list(filter(None, cg_sents_clean))
smg_sents_clean = list(filter(None, smg_sents_clean))
cg_sents_clean[:3]

['πρασινο αυκουι μες το πασχαλινο ποτηρι που επιασε ο μιτσης',
 'καμνουν ανακαινιση στα περβολια φετος',
 'ελα συγγενη τζιαι εχουμε νεοτερα π το νικολη']

## 3. Tokenization

In [209]:
cg_sents_tokens = []
smg_sents_tokens = []

for sent in cg_sents_clean:
    cg_sents_tokens.append(WhitespaceTokenizer().tokenize(sent))
for sent in smg_sents_clean:
    smg_sents_tokens.append(WhitespaceTokenizer().tokenize(sent))
    
cg_sents_tokens[:3]

[['πρασινο',
  'αυκουι',
  'μες',
  'το',
  'πασχαλινο',
  'ποτηρι',
  'που',
  'επιασε',
  'ο',
  'μιτσης'],
 ['καμνουν', 'ανακαινιση', 'στα', 'περβολια', 'φετος'],
 ['ελα', 'συγγενη', 'τζιαι', 'εχουμε', 'νεοτερα', 'π', 'το', 'νικολη']]

## 4. Building the feature extractor

In [210]:
from nltk import ngrams

def get_word_ngrams(tokens, n):
    ngrams_list = []
    ngrams_list.append(list(ngrams(tokens, n)))
    ngrams_flat_tuples = [ngram for ngram_list in ngrams_list for ngram in ngram_list]
    format_string = '%s'
    for i in range(1, n):
        format_string += (' %s')
    ngrams_list_flat = [format_string % ngram_tuple for ngram_tuple in ngrams_flat_tuples]
    return ngrams_list_flat

def get_char_ngrams(word, n):
    ngrams_list = []
    word = re.sub(r'ς', 'σ', word)
    ngrams_list.append(list(ngrams(word, n, pad_left=True, pad_right=True, left_pad_symbol='_', right_pad_symbol='_')))
    
    # Removing redundant ngrams:
    if (n > 2):
        redundant_combinations = n - 2
        ngrams_list = [ngram_list[redundant_combinations : -redundant_combinations] for ngram_list in ngrams_list]
    
    ngrams_flat_tuples = [ngram for ngram_list in ngrams_list for ngram in ngram_list]
    format_string = ''
    for i in range(0, n):
        format_string += ('%s')
    ngrams_list_flat = [format_string % ngram_tuple for ngram_tuple in ngrams_flat_tuples]
    return ngrams_list_flat

In [211]:
# Feature extractor
def get_ngram_features(sentence_tokens):
    features = {}
    
    # Word unigrams
    ngrams = get_word_ngrams(sentence_tokens, 1)
    for ngram in ngrams:
        features[f'word({ngram})'] = features.get(f'word({ngram})', 0) + 1 # The second parameter to .get() is a default value if the key doesn't exist.
    
    # Word bigrams
    ngrams = get_word_ngrams(sentence_tokens, 2)
    for ngram in ngrams:
        features[f'word_bigram({ngram})'] = features.get(f'word_bigram({ngram})', 0) + 1
    
    # Word trigrams
    ngrams = get_word_ngrams(sentence_tokens, 3)
    for ngram in ngrams:
        features[f'word_trigram({ngram})'] = features.get(f'word_trigram({ngram})', 0) + 1
    
    # Word quadrigrams
    ngrams = get_word_ngrams(sentence_tokens, 4)
    for ngram in ngrams:
        features[f'word_quadrigram({ngram})'] = features.get(f'word_quadrigram({ngram})', 0) + 1
    
    # Char unigrams
    for word in sentence_tokens:
        ngrams = get_char_ngrams(word, 1)
        for ngram in ngrams:
            features[f'char({ngram})'] = features.get(f'char({ngram})', 0) + 1
    
    # Char bigrams
    for word in sentence_tokens:
        ngrams = get_char_ngrams(word, 2)
        for ngram in ngrams:
            features[f'char_bigram({ngram})'] = features.get(f'char_bigram({ngram})', 0) + 1
    
    # Char trigrams
    for word in sentence_tokens:
        ngrams = get_char_ngrams(word, 3)
        for ngram in ngrams:
            features[f'char_trigram({ngram})'] = features.get(f'char_trigram({ngram})', 0) + 1
    
    # Char quadrigrams
    for word in sentence_tokens:
        ngrams = get_char_ngrams(word, 4)
        for ngram in ngrams:
            features[f'char_quadrigram({ngram})'] = features.get(f'char_quadrigram({ngram})', 0) + 1
    
    return features

# get_ngram_features(['αυτη', 'ειναι', 'η', 'σπαρτη'])
get_ngram_features(['test', 'sentence'])

  """


{'char(c)': 1,
 'char(e)': 4,
 'char(n)': 2,
 'char(s)': 2,
 'char(t)': 3,
 'char_bigram(_s)': 1,
 'char_bigram(_t)': 1,
 'char_bigram(ce)': 1,
 'char_bigram(e_)': 1,
 'char_bigram(en)': 2,
 'char_bigram(es)': 1,
 'char_bigram(nc)': 1,
 'char_bigram(nt)': 1,
 'char_bigram(se)': 1,
 'char_bigram(st)': 1,
 'char_bigram(t_)': 1,
 'char_bigram(te)': 2,
 'char_quadrigram(_sen)': 1,
 'char_quadrigram(_tes)': 1,
 'char_quadrigram(ence)': 1,
 'char_quadrigram(ente)': 1,
 'char_quadrigram(est_)': 1,
 'char_quadrigram(nce_)': 1,
 'char_quadrigram(nten)': 1,
 'char_quadrigram(sent)': 1,
 'char_quadrigram(tenc)': 1,
 'char_quadrigram(test)': 1,
 'char_trigram(_se)': 1,
 'char_trigram(_te)': 1,
 'char_trigram(ce_)': 1,
 'char_trigram(enc)': 1,
 'char_trigram(ent)': 1,
 'char_trigram(est)': 1,
 'char_trigram(nce)': 1,
 'char_trigram(nte)': 1,
 'char_trigram(sen)': 1,
 'char_trigram(st_)': 1,
 'char_trigram(ten)': 1,
 'char_trigram(tes)': 1,
 'word(sentence)': 1,
 'word(test)': 1,
 'word_bigram(test 

## 5. Labeling the sentences

In [212]:
# cg_sents_features_labeled = [(get_ngram_features(word), 'cg') for word in cg_sents_tokens]
# smg_sents_features_labeled = [(get_ngram_features(word), 'smg') for word in smg_sents_tokens]

# all_sents_features_labeled = cg_sents_features_labeled + smg_sents_features_labeled
# all_sents_features_labeled[0]

all_sents_labeled = ([(sentence, 'cg') for sentence in cg_sents_tokens] + [(sentence, 'smg') for sentence in smg_sents_tokens])
all_sents_labeled[0]                                      

(['πρασινο',
  'αυκουι',
  'μες',
  'το',
  'πασχαλινο',
  'ποτηρι',
  'που',
  'επιασε',
  'ο',
  'μιτσης'],
 'cg')

## 6. Splitting corpus into training and test data

In [213]:
import random
from nltk.classify import apply_features

NO_ALL_SENTENCES = len(all_sents_labeled)
NO_TRAIN_SENTENCES = round(NO_ALL_SENTENCES * .8)

random.shuffle(all_sents_labeled)

print('DATASET\t', 'SENTENCES')
print('All\t', NO_ALL_SENTENCES)
print('Training', NO_TRAIN_SENTENCES)
print('Testing\t', NO_ALL_SENTENCES - NO_TRAIN_SENTENCES)

train_set = apply_features(get_ngram_features, all_sents_labeled[:NO_TRAIN_SENTENCES])
test_set = apply_features(get_ngram_features, all_sents_labeled[NO_TRAIN_SENTENCES:])

DATASET	 SENTENCES
All	 152
Training 122
Testing	 30


In [214]:
train_set[0]

({'char(α)': 18,
  'char(γ)': 1,
  'char(δ)': 2,
  'char(ε)': 24,
  'char(η)': 3,
  'char(θ)': 1,
  'char(ι)': 8,
  'char(κ)': 4,
  'char(λ)': 3,
  'char(μ)': 7,
  'char(ν)': 4,
  'char(ο)': 1,
  'char(π)': 4,
  'char(ρ)': 2,
  'char(σ)': 13,
  'char(τ)': 12,
  'char(υ)': 1,
  'char(φ)': 3,
  'char(ω)': 1,
  'char_bigram(_α)': 2,
  'char_bigram(_γ)': 1,
  'char_bigram(_δ)': 1,
  'char_bigram(_ε)': 6,
  'char_bigram(_κ)': 1,
  'char_bigram(_λ)': 1,
  'char_bigram(_μ)': 2,
  'char_bigram(_ν)': 1,
  'char_bigram(_π)': 1,
  'char_bigram(_τ)': 7,
  'char_bigram(_φ)': 1,
  'char_bigram(α_)': 5,
  'char_bigram(αδ)': 1,
  'char_bigram(αθ)': 1,
  'char_bigram(αι)': 1,
  'char_bigram(ακ)': 1,
  'char_bigram(αλ)': 1,
  'char_bigram(αμ)': 2,
  'char_bigram(ασ)': 3,
  'char_bigram(ατ)': 2,
  'char_bigram(αφ)': 1,
  'char_bigram(γε)': 1,
  'char_bigram(δα)': 1,
  'char_bigram(δε)': 1,
  'char_bigram(ε_)': 7,
  'char_bigram(ει)': 4,
  'char_bigram(εμ)': 2,
  'char_bigram(εν)': 1,
  'char_bigram(επ)':

## 7. Building a Naive Bayes Classifier

In [215]:
from nltk import NaiveBayesClassifier

classifier = NaiveBayesClassifier.train(train_set)

  """


In [216]:
classifier.labels()

['cg', 'smg']

In [217]:
from nltk.classify import accuracy

round(accuracy(classifier, test_set), 2)

0.53

In [218]:
classifier.show_most_informative_features(10)

Most Informative Features
       char_trigram(ρωτ) = 1                 smg : cg     =     13.0 : 1.0
       char_trigram(_δε) = 1                 smg : cg     =     12.9 : 1.0
   char_quadrigram(λου_) = 1                 smg : cg     =     11.0 : 1.0
   char_quadrigram(_ειν) = 1                 smg : cg     =     11.0 : 1.0
   char_quadrigram(ιναι) = 1                 smg : cg     =     11.0 : 1.0
             word(ειναι) = 1                 smg : cg     =     11.0 : 1.0
       char_trigram(εχε) = 1                 smg : cg     =     11.0 : 1.0
         char_bigram(ωτ) = 1                 smg : cg     =     10.2 : 1.0
       char_trigram(σαι) = 1                 smg : cg     =      9.0 : 1.0
         char_bigram(ρυ) = 1                 smg : cg     =      9.0 : 1.0


In [219]:
from io import StringIO
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from nltk import everygrams

def sent_process(sent):
    return [''.join(ng) for ng in everygrams(sent.replace(' ', '_ _'), 1, 4) 
            if ' ' not in ng and '\n' not in ng and ng != ('_',)]

sent_process('αυτο τεστ')

['α',
 'υ',
 'τ',
 'ο',
 'τ',
 'ε',
 'σ',
 'τ',
 'αυ',
 'υτ',
 'το',
 'ο_',
 '_τ',
 'τε',
 'εσ',
 'στ',
 'αυτ',
 'υτο',
 'το_',
 '_τε',
 'τεσ',
 'εστ',
 'αυτο',
 'υτο_',
 '_τεσ',
 'τεστ']

In [220]:
sent1 = "εννεν ετσι ρα εν το πιστευκω εγιω"
sent2 = "εν ετσι εγιω πιστευκω"
sent3 = 'πιστευκω σε εν ετσι'
sent4 = 'οπα ρε φιλε'
sent5 = 'οπα ρε φιλε θα σε σκωτωσω'
with StringIO('\n'.join([sent1, sent2])) as fin:
    # Override the analyzer totally with our preprocess text
    count_vect = CountVectorizer(analyzer=sent_process)
    count_vect.fit_transform(fin)
count_vect.vocabulary_ 

{'_ε': 0,
 '_εγ': 1,
 '_εγι': 2,
 '_εν': 3,
 '_εν_': 4,
 '_ετ': 5,
 '_ετσ': 6,
 '_π': 7,
 '_πι': 8,
 '_πισ': 9,
 '_ρ': 10,
 '_ρα': 11,
 '_ρα_': 12,
 '_τ': 13,
 '_το': 14,
 '_το_': 15,
 'α': 16,
 'α_': 17,
 'γ': 18,
 'γι': 19,
 'γιω': 20,
 'γιω_': 21,
 'ε': 22,
 'εγ': 23,
 'εγι': 24,
 'εγιω': 25,
 'εν': 26,
 'εν_': 27,
 'ενν': 28,
 'εννε': 29,
 'ετ': 30,
 'ετσ': 31,
 'ετσι': 32,
 'ευ': 33,
 'ευκ': 34,
 'ευκω': 35,
 'ι': 36,
 'ι_': 37,
 'ισ': 38,
 'ιστ': 39,
 'ιστε': 40,
 'ιω': 41,
 'ιω_': 42,
 'κ': 43,
 'κω': 44,
 'κω_': 45,
 'ν': 46,
 'ν_': 47,
 'νε': 48,
 'νεν': 49,
 'νεν_': 50,
 'νν': 51,
 'ννε': 52,
 'ννεν': 53,
 'ο': 54,
 'ο_': 55,
 'π': 56,
 'πι': 57,
 'πισ': 58,
 'πιστ': 59,
 'ρ': 60,
 'ρα': 61,
 'ρα_': 62,
 'σ': 63,
 'σι': 64,
 'σι_': 65,
 'στ': 66,
 'στε': 67,
 'στευ': 68,
 'τ': 69,
 'τε': 70,
 'τευ': 71,
 'τευκ': 72,
 'το': 73,
 'το_': 74,
 'τσ': 75,
 'τσι': 76,
 'τσι_': 77,
 'υ': 78,
 'υκ': 79,
 'υκω': 80,
 'υκω_': 81,
 'ω': 82,
 'ω_': 83}

In [221]:
count_vect = CountVectorizer(analyzer=sent_process)

In [222]:
train_set = count_vect.fit_transform([sent1, sent2, sent5])
train_set.toarray()

array([[3, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 6, 0, 1, 1, 1, 3, 2, 1, 1,
        1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1,
        1, 1, 1, 1, 1, 0, 0, 2, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 3, 1, 1,
        1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 2, 1, 0, 0, 0,
        0, 0],
       [2, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 4, 0, 1, 1, 1, 1, 1, 0, 0,
        1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,
        0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
        1, 1, 0, 0, 0, 0, 0, 2, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 2, 1, 1,
        1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 2, 1, 1, 1,
        

In [223]:
# To train the classifier
clf = MultinomialNB()
clf.fit(train_set, ['cg', 'cg', 'smg']) 

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [224]:
test_set = count_vect.transform([sent3, sent4]) # comment on diff. between fit trans and trans
clf.predict(test_set)

array(['cg', 'smg'], dtype='<U3')

In [241]:
def show_most_informative_features(vectorizer, clf, n=10):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    print("\tCG\t\t\t\tSMG\n")
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))
show_most_informative_features(count_vect, clf)

	CG				SMG

	-9.4189	_αγ            		-3.8131	α              
	-9.4189	_αγγ           		-4.1008	ε              
	-9.4189	_αδ            		-4.1408	ο              
	-9.4189	_αδε           		-4.1932	ι              
	-9.4189	_ακ            		-4.3500	τ              
	-9.4189	_ακο           		-4.6567	ν              
	-9.4189	_ακρ           		-4.8442	π              
	-9.4189	_αλη           		-4.9762	ρ              
	-9.4189	_ανο           		-4.9762	λ              
	-9.4189	_απι           		-4.9762	α_             


Will use sklearn cuz NLTK can't handle ints: https://stackoverflow.com/questions/49600319/nltk-classifier-for-integer-features

In [226]:
all_sents_labeled = ([(sentence, 'cg') for sentence in cg_sents_clean] + [(sentence, 'smg') for sentence in smg_sents_clean])
random.shuffle(all_sents_labeled)
train_set = all_sents_labeled[:NO_TRAIN_SENTENCES]
test_set = all_sents_labeled[NO_TRAIN_SENTENCES:]
train_set_sents = [sent[0] for sent in train_set]
train_set_labels = [sent[1] for sent in train_set]
test_set_sents = [sent[0] for sent in test_set]
test_set_labels = [sent[1] for sent in test_set]
print(len(train_set), len(test_set))
print(train_set_sents[0], train_set_labels[0])

122 30
ο ενας παππους αποστολος ο αλλος παππους αντρεας μαντεψε τι εφκαλαν το μωρο cg


In [227]:
train_set_vectors = count_vect.fit_transform(train_set_sents)
import numpy
numpy.set_printoptions(threshold=numpy.nan)
train_set_vectors.toarray()[0]

array([ 0,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  1,  0,  0,  1,
        0,  0,  0,  0,  1,  0,  0,  1,  0,  0,  0,  0,  0,  1,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  1,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  1,  0,  1,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0

In [228]:
train_set_vectors

<122x4425 sparse matrix of type '<class 'numpy.int64'>'
	with 17859 stored elements in Compressed Sparse Row format>

In [229]:
count_vect.vocabulary_ # The numbers are not counts but indices

{'ο': 2838,
 'ε': 1213,
 'ν': 2626,
 'α': 586,
 'ς': 3597,
 'π': 3129,
 'υ': 3991,
 'σ': 3607,
 'τ': 3781,
 'λ': 2275,
 'ρ': 3344,
 'μ': 2455,
 'ψ': 4332,
 'ι': 1807,
 'φ': 4161,
 'κ': 2100,
 'ω': 4353,
 'ο_': 2839,
 '_ε': 114,
 'εν': 1350,
 'να': 2628,
 'ας': 877,
 'ς_': 3598,
 '_π': 381,
 'πα': 3134,
 'απ': 807,
 'ππ': 3301,
 'πο': 3254,
 'ου': 3061,
 'υς': 4107,
 '_α': 1,
 'οσ': 3019,
 'στ': 3728,
 'το': 3906,
 'ολ': 2906,
 'λο': 2406,
 'ος': 3014,
 '_ο': 356,
 'αλ': 700,
 'λλ': 2385,
 'αν': 766,
 'ντ': 2756,
 'τρ': 3939,
 'ρε': 3399,
 'εα': 1215,
 '_μ': 285,
 'μα': 2457,
 'τε': 3807,
 'εψ': 1550,
 'ψε': 4335,
 'ε_': 1214,
 '_τ': 482,
 'τι': 3874,
 'ι_': 1808,
 'εφ': 1523,
 'φκ': 4208,
 'κα': 2102,
 'λα': 2277,
 'ν_': 2627,
 'μω': 2614,
 'ωρ': 4382,
 'ρο': 3505,
 '_εν': 148,
 'ενα': 1352,
 'νας': 2650,
 'ας_': 878,
 '_πα': 383,
 'παπ': 3153,
 'αππ': 828,
 'ππο': 3304,
 'που': 3290,
 'ους': 3090,
 'υς_': 4108,
 '_απ': 24,
 'απο': 821,
 'ποσ': 3283,
 'οστ': 3031,
 'στο': 3745,
 'τολ':

In [230]:
len(count_vect.vocabulary_)

4425

In [231]:
clf = MultinomialNB() # There don't seem to be params for MultinomialDB that prevent overfitting, so my problem is likely caused by the small dataset
clf.fit(train_set_vectors, train_set_labels) 

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [232]:
from sklearn.metrics import *

test_set_vectors = count_vect.transform(test_set_sents)
accuracy_score(test_set_labels, clf.predict(test_set_vectors))

0.8333333333333334

In [242]:
show_most_informative_features(count_vect, clf, n=20)

	CG				SMG

	-9.4189	_αγ            		-3.8131	α              
	-9.4189	_αγγ           		-4.1008	ε              
	-9.4189	_αδ            		-4.1408	ο              
	-9.4189	_αδε           		-4.1932	ι              
	-9.4189	_ακ            		-4.3500	τ              
	-9.4189	_ακο           		-4.6567	ν              
	-9.4189	_ακρ           		-4.8442	π              
	-9.4189	_αλη           		-4.9762	ρ              
	-9.4189	_ανο           		-4.9762	λ              
	-9.4189	_απι           		-4.9762	α_             
	-9.4189	_αρε           		-5.0622	σ              
	-9.4189	_ασπ           		-5.0882	υ              
	-9.4189	_ατ            		-5.1148	η              
	-9.4189	_ατε           		-5.1562	μ              
	-9.4189	_ατο           		-5.1562	κ              
	-9.4189	_αυκ           		-5.2758	ς              
	-9.4189	_αυρ           		-5.3758	ι_             
	-9.4189	_αφη           		-5.4299	ς_             
	-9.4189	_αφκ           		-5.4486	_π             
	-9.4189	_βαρ           		-5.5271	ου  

In [252]:
test_vec = count_vect.transform(['η κυπρος εν που τες πιο ομορφες χωρες', 'η κυπρος ειναι απο τις πιο ομορφες χωρες'])
clf.predict(test_vec)

array(['cg', 'smg'], dtype='<U3')