# Building the Classifier

## 1. Loading the corpus

In [1]:
import re
from nltk import sent_tokenize

cg_sents = []
smg_sents = []

def remove_duplicate_punctuation(s):
    return(re.sub(r'([\.\?!;])\1+', r'\1 ', s))

with open('./Data/cg_twitter.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    text = re.sub(r'([a-zα-ωίϊΐόάέύϋΰήώ])([\.\?!;])([A-ZΑ-ΩΆΈΊΌΎΏΉ])', r'\1\2 \3', text)
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        cg_sents += sent_tokenize(line)
    
with open('./Data/cg_fb.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    text = re.sub(r'([a-zα-ωίϊΐόάέύϋΰήώ])([\.\?!;])([A-ZΑ-ΩΆΈΊΌΎΏΉ])', r'\1\2 \3', text)
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        cg_sents += sent_tokenize(line)
    
with open('./Data/cg_other.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    text = re.sub(r'([a-zα-ωίϊΐόάέύϋΰήώ])([\.\?!;])([A-ZΑ-ΩΆΈΊΌΎΏΉ])', r'\1\2 \3', text)
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        cg_sents += sent_tokenize(line)

with open('./Data/smg_twitter.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    text = re.sub(r'([a-zα-ωίϊΐόάέύϋΰήώ])([\.\?!;])([A-ZΑ-ΩΆΈΊΌΎΏΉ])', r'\1\2 \3', text)
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        smg_sents += sent_tokenize(line)
    
with open('./Data/smg_fb.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    text = re.sub(r'([a-zα-ωίϊΐόάέύϋΰήώ])([\.\?!;])([A-ZΑ-ΩΆΈΊΌΎΏΉ])', r'\1\2 \3', text)
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        smg_sents += sent_tokenize(line)
    
with open('./Data/smg_other.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    text = re.sub(r'([a-zα-ωίϊΐόάέύϋΰήώ])([\.\?!;])([A-ZΑ-ΩΆΈΊΌΎΏΉ])', r'\1\2 \3', text)
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        smg_sents += sent_tokenize(line)

cg_sents[:3]

['Πρασινο αυκουι μες το πασχαλινο ποτήρι που έπιασε ο μιτσης #αισχος 🤣🤣🤣   @HARRIS_APOEL https://t.co/y9X7CmBEa5',
 '@HARRIS_APOEL @pirpoitis @vassrules Καμνουν πολλα ανακαινιση στα Περβολια .',
 '@MUFCChristian Ελα συγγενη τζιαι εχουμε νεοτερα π το Νικολη.']

## 2. Cleaning the text

In [2]:
import unicodedata
from string import punctuation
from nltk.tokenize import WhitespaceTokenizer

punctuation += '´΄’…“”–—―»«'

def strip_accents(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')

def get_clean_sent_el(sentence):
    sentence = re.sub(r'^RT', '', sentence)
    sentence = re.sub(r'\&\w*;', '', sentence)
    sentence = re.sub(r'\@\w*', '', sentence)
    sentence = re.sub(r'\$\w*', '', sentence)
    sentence = re.sub(r'https?:\/\/.*\/\w*', '', sentence)
    sentence = ''.join(c for c in sentence if c <= '\uFFFF')
    sentence = strip_accents(sentence)
    sentence = re.sub(r'#\w*', '', sentence)
    sentence = sentence.lower()
    tokens = WhitespaceTokenizer().tokenize(sentence)
    new_tokens = []
    for token in tokens:
        if token == 'ο,τι' or token == 'ό,τι' or token == 'o,ti' or token == 'ó,ti':
            new_tokens.append(token)
        else:
            token = re.sub(r'(?<=[.,!\?;\'΄´])(?=[^\s])', r' ', token)
            new_token = token.translate(str.maketrans({key: None for key in punctuation}))
            if (new_token != ''):
                new_tokens.append(new_token)
    sentence =' '.join(new_tokens)
    sentence = sentence.replace('\ufeff', '')
    sentence = sentence.strip(' ')
    sentence = sentence.replace('  ', ' ')
    return sentence

cg_sents_clean = []
smg_sents_clean = []

for sent in cg_sents:
    cg_sents_clean.append(get_clean_sent_el(sent))
for sent in smg_sents:
    smg_sents_clean.append(get_clean_sent_el(sent))

cg_sents_clean = list(filter(None, cg_sents_clean))
smg_sents_clean = list(filter(None, smg_sents_clean))
cg_sents_clean[:3]

['πρασινο αυκουι μες το πασχαλινο ποτηρι που επιασε ο μιτσης',
 'καμνουν πολλα ανακαινιση στα περβολια',
 'ελα συγγενη τζιαι εχουμε νεοτερα π το νικολη']

## 3. Building the feature extractor

In [3]:
from nltk import ngrams

def get_word_ngrams(tokens, n):
    ngrams_list = []
    ngrams_list.append(list(ngrams(tokens, n)))
    ngrams_flat_tuples = [ngram for ngram_list in ngrams_list for ngram in ngram_list]
    format_string = '%s'
    for _ in range(1, n):
        format_string += (' %s')
    ngrams_list_flat = [format_string % ngram_tuple for ngram_tuple in ngrams_flat_tuples]
    return ngrams_list_flat

def get_char_ngrams(word, n):
    ngrams_list = []
    word = word.replace('ς', 'σ')
    ngrams_list.append(list(ngrams(word, n, pad_left=True, pad_right=True, left_pad_symbol='_', right_pad_symbol='_')))
    
    # Removing redundant ngrams:
    if (n > 2):
        redundant_combinations = n - 2
        ngrams_list = [ngram_list[redundant_combinations : -redundant_combinations] for ngram_list in ngrams_list]
    
    ngrams_flat_tuples = [ngram for ngram_list in ngrams_list for ngram in ngram_list]
    format_string = ''
    for i in range(0, n):
        format_string += ('%s')
    ngrams_list_flat = [format_string % ngram_tuple for ngram_tuple in ngrams_flat_tuples]
    return ngrams_list_flat

In [4]:
# Feature extractor
def get_ngram_features(sent): # The reason I do not use NLTK's everygrams to extract the features quickly is because the behavior of my n-gram extractor is modified to remove redundant n-grams. Also, I need to label word and char n-grams to avoid ambiguity
    sentence_tokens = WhitespaceTokenizer().tokenize(sent)
    
    features = {}
    
    # Word unigrams
    ngrams = get_word_ngrams(sentence_tokens, 1)
    for ngram in ngrams:
        features[f'word({ngram})'] = features.get(f'word({ngram})', 0) + 1 # The second parameter to .get() is a default value if the key doesn't exist.
    
    # Word bigrams
    ngrams = get_word_ngrams(sentence_tokens, 2)
    for ngram in ngrams:
        features[f'word_bigram({ngram})'] = features.get(f'word_bigram({ngram})', 0) + 1
    
    # Char unigrams
    for word in sentence_tokens:
        ngrams = get_char_ngrams(word, 1)
        for ngram in ngrams:
            features[f'char({ngram})'] = features.get(f'char({ngram})', 0) + 1
    
    # Char bigrams
    for word in sentence_tokens:
        ngrams = get_char_ngrams(word, 2)
        for ngram in ngrams:
            features[f'char_bigram({ngram})'] = features.get(f'char_bigram({ngram})', 0) + 1
    
    # Char trigrams
    for word in sentence_tokens:
        ngrams = get_char_ngrams(word, 3)
        for ngram in ngrams:
            features[f'char_trigram({ngram})'] = features.get(f'char_trigram({ngram})', 0) + 1
    
    return features

get_ngram_features('αυτη ειναι η σπαρτη')

{'char(α)': 3,
 'char(ε)': 1,
 'char(η)': 3,
 'char(ι)': 2,
 'char(ν)': 1,
 'char(π)': 1,
 'char(ρ)': 1,
 'char(σ)': 1,
 'char(τ)': 2,
 'char(υ)': 1,
 'char_bigram(_α)': 1,
 'char_bigram(_ε)': 1,
 'char_bigram(_η)': 1,
 'char_bigram(_σ)': 1,
 'char_bigram(αι)': 1,
 'char_bigram(αρ)': 1,
 'char_bigram(αυ)': 1,
 'char_bigram(ει)': 1,
 'char_bigram(η_)': 3,
 'char_bigram(ι_)': 1,
 'char_bigram(ιν)': 1,
 'char_bigram(να)': 1,
 'char_bigram(πα)': 1,
 'char_bigram(ρτ)': 1,
 'char_bigram(σπ)': 1,
 'char_bigram(τη)': 2,
 'char_bigram(υτ)': 1,
 'char_trigram(_αυ)': 1,
 'char_trigram(_ει)': 1,
 'char_trigram(_η_)': 1,
 'char_trigram(_σπ)': 1,
 'char_trigram(αι_)': 1,
 'char_trigram(αρτ)': 1,
 'char_trigram(αυτ)': 1,
 'char_trigram(ειν)': 1,
 'char_trigram(ινα)': 1,
 'char_trigram(ναι)': 1,
 'char_trigram(παρ)': 1,
 'char_trigram(ρτη)': 1,
 'char_trigram(σπα)': 1,
 'char_trigram(τη_)': 2,
 'char_trigram(υτη)': 1,
 'word(αυτη)': 1,
 'word(ειναι)': 1,
 'word(η)': 1,
 'word(σπαρτη)': 1,
 'word_bigra

In [5]:
# from nltk import everygrams

# def sent_process(sent):
#     return [''.join(ng) for ng in everygrams(sent.replace(' ', '_ _'), 1, 4) 
#             if ' ' not in ng and '\n' not in ng and ng != ('_',)]

# sent_process('αυτη ειναι η σπαρτη')

## 4. Creating the training and test sets

In [6]:
import random

all_sents_labeled = ([(sentence, 'CG') for sentence in cg_sents_clean] + [(sentence, 'SMG') for sentence in smg_sents_clean])
random.shuffle(all_sents_labeled)
all_sents_labeled[0]

('ουτε ενα μαντορινι εν τους αφηννε να κοψουν ρε ο ππιντης', 'CG')

In [7]:
NO_ALL_SENTENCES = len(all_sents_labeled)
NO_TRAIN_SENTENCES = round(NO_ALL_SENTENCES * .8)

train_set = all_sents_labeled[:NO_TRAIN_SENTENCES]
test_set = all_sents_labeled[NO_TRAIN_SENTENCES:]

train_set_sents = [sent[0] for sent in train_set]
train_set_labels = [sent[1] for sent in train_set]
test_set_sents = [sent[0] for sent in test_set]
test_set_labels = [sent[1] for sent in test_set]

print(train_set_sents[0], train_set_labels[0])

ουτε ενα μαντορινι εν τους αφηννε να κοψουν ρε ο ππιντης CG


In [8]:
print('DATASET\t', 'SENTENCES')
print('All\t', NO_ALL_SENTENCES)
print('Training', NO_TRAIN_SENTENCES)
print('Testing\t', NO_ALL_SENTENCES - NO_TRAIN_SENTENCES)

DATASET	 SENTENCES
All	 1039
Training 831
Testing	 208


## 5. Vectorization

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(analyzer=get_ngram_features)

train_set_vectors = count_vect.fit_transform(train_set_sents)
test_set_vectors = count_vect.transform(test_set_sents) # Unlike fit_transform(), transform() does not change the count vectorizer's vocabulary so it should be used for the test set.
train_set_vectors

<831x16007 sparse matrix of type '<class 'numpy.int64'>'
	with 124849 stored elements in Compressed Sparse Row format>

In [10]:
from numpy import set_printoptions, nan
set_printoptions(threshold=nan) # Prints whole array. Required because by default an array with thousands of elements wouldn't be printed in full.

train_set_vectors.toarray()[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,

In [11]:
count_vect.vocabulary_ # The numbers are not counts but indices.

{'word(ουτε)': 5944,
 'word(ενα)': 4569,
 'word(μαντορινι)': 5548,
 'word(εν)': 4568,
 'word(τους)': 6854,
 'word(αφηννε)': 3932,
 'word(να)': 5733,
 'word(κοψουν)': 5371,
 'word(ρε)': 6432,
 'word(ο)': 5842,
 'word(ππιντης)': 6320,
 'word_bigram(ουτε ενα)': 12658,
 'word_bigram(ενα μαντορινι)': 9189,
 'word_bigram(μαντορινι εν)': 11164,
 'word_bigram(εν τους)': 9163,
 'word_bigram(τους αφηννε)': 15417,
 'word_bigram(αφηννε να)': 7877,
 'word_bigram(να κοψουν)': 11923,
 'word_bigram(κοψουν ρε)': 10872,
 'word_bigram(ρε ο)': 13638,
 'word_bigram(ο ππιντης)': 12279,
 'char(ο)': 22,
 'char(υ)': 27,
 'char(τ)': 26,
 'char(ε)': 12,
 'char(ν)': 20,
 'char(α)': 8,
 'char(μ)': 19,
 'char(ρ)': 24,
 'char(ι)': 16,
 'char(σ)': 25,
 'char(φ)': 28,
 'char(η)': 14,
 'char(κ)': 17,
 'char(ψ)': 30,
 'char(π)': 23,
 'char_bigram(_ο)': 50,
 'char_bigram(ου)': 313,
 'char_bigram(υτ)': 408,
 'char_bigram(τε)': 378,
 'char_bigram(ε_)': 134,
 'char_bigram(_ε)': 40,
 'char_bigram(εν)': 146,
 'char_bigram(να)

In [12]:
len(count_vect.vocabulary_) # This is the same as the length of each vector.

16007

## 6. Building the classifiers

In [13]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def show_confusion_matrix(cm):
    print('\t         Predicted')
    print('\t        CG       SMG')
    print('\t     -------- --------')
    print('\tCG  | {:^6} | {:^6}'.format(cm[0][0], cm[0][1]))
    print('Actual\t     -------- --------')
    print('\tSMG | {:^6} | {:^6}'.format(cm[1][0], cm[1][1]))

def show_most_informative_features(vectorizer, clf, n=10):
    print("\t\t    CG\t\t\t\t\t\t    SMG\n")
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%17s\t\t\t%.4f\t%17s" % (coef_1, fn_1, coef_2, fn_2))

### 6.1 Multinomial Naive Bayes classifier

In [14]:
clf_multinomialNB = MultinomialNB() # There are no params for MultinomialDB that prevent overfitting, so any overfitting is caused by the small dataset size.
clf_multinomialNB.fit(train_set_vectors, train_set_labels) 

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [15]:
clf_multinomialNB_predictions = clf_multinomialNB.predict(test_set_vectors)

print('\t\t\tPERFORMANCE\n')
print('Accuracy:', round(accuracy_score(test_set_labels, clf_multinomialNB_predictions), 2), '\n')

print(classification_report(test_set_labels, clf_multinomialNB_predictions))

cmatrix = confusion_matrix(test_set_labels, clf_multinomialNB_predictions)
show_confusion_matrix(cmatrix)

			PERFORMANCE

Accuracy: 0.96 

             precision    recall  f1-score   support

         CG       0.94      0.98      0.96       109
        SMG       0.98      0.93      0.95        99

avg / total       0.96      0.96      0.96       208

	         Predicted
	        CG       SMG
	     -------- --------
	CG  |  107   |   2   
Actual	     -------- --------
	SMG |   7    |   92  


In [16]:
show_most_informative_features(count_vect, clf_multinomialNB, n=20)

		    CG						    SMG

	-11.2815	  char_bigram(αα)			-5.3154	          char(α)
	-11.2815	  char_bigram(β_)			-5.3231	          char(ι)
	-11.2815	  char_bigram(δ_)			-5.3309	          char(ο)
	-11.2815	  char_bigram(δκ)			-5.3335	          char(ε)
	-11.2815	  char_bigram(ηα)			-5.3413	          char(τ)
	-11.2815	  char_bigram(ηβ)			-5.3466	          char(ν)
	-11.2815	  char_bigram(ηε)			-5.3519	          char(σ)
	-11.2815	  char_bigram(ηο)			-5.4010	  char_bigram(α_)
	-11.2815	  char_bigram(θθ)			-5.4122	          char(υ)
	-11.2815	  char_bigram(θκ)			-5.4179	          char(ρ)
	-11.2815	  char_bigram(ιυ)			-5.4293	          char(κ)
	-11.2815	  char_bigram(οζ)			-5.4467	          char(μ)
	-11.2815	  char_bigram(πδ)			-5.4526	          char(π)
	-11.2815	  char_bigram(πκ)			-5.4644	  char_bigram(ι_)
	-11.2815	  char_bigram(ππ)			-5.4885	          char(η)
	-11.2815	  char_bigram(φκ)			-5.4915	          char(λ)
	-11.2815	  char_bigram(ωε)			-5.5008	  char_bigram(_τ)
	-11.2815	char_trigram(_α

### 6.2 Linear Support Vector classifier

In [17]:
clf_linearSVC = LinearSVC(max_iter=1500) # n_samples < n_features in training set so the dual param is kept at its default value of True. Default max_iter = 1000
clf_linearSVC.fit(train_set_vectors, train_set_labels) 

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1500,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [18]:
clf_linearSVC_predictions = clf_linearSVC.predict(test_set_vectors)

print('\t\t\tPERFORMANCE\n')
print('Accuracy:', round(accuracy_score(test_set_labels, clf_linearSVC_predictions), 2), '\n')

print(classification_report(test_set_labels, clf_linearSVC_predictions))

cmatrix = confusion_matrix(test_set_labels, clf_linearSVC_predictions)
show_confusion_matrix(cmatrix)

			PERFORMANCE

Accuracy: 0.91 

             precision    recall  f1-score   support

         CG       0.92      0.90      0.91       109
        SMG       0.89      0.92      0.91        99

avg / total       0.91      0.91      0.91       208

	         Predicted
	        CG       SMG
	     -------- --------
	CG  |   98   |   11  
Actual	     -------- --------
	SMG |   8    |   91  


In [19]:
show_most_informative_features(count_vect, clf_linearSVC, n=20)

		    CG						    SMG

	-0.3471	char_trigram(εν_)			0.1548	        word(και)
	-0.3013	         word(εν)			0.1405	  char_bigram(λυ)
	-0.2699	  char_bigram(τζ)			0.1393	char_trigram(αλλ)
	-0.2593	char_trigram(_τζ)			0.1342	  char_bigram(_β)
	-0.2262	  char_bigram(αμ)			0.1301	        word(δεν)
	-0.1907	  char_bigram(_ε)			0.1301	char_trigram(δεν)
	-0.1864	  char_bigram(θκ)			0.1296	char_trigram(τασ)
	-0.1544	char_trigram(καμ)			0.1294	char_trigram(ινα)
	-0.1523	char_trigram(_ου)			0.1252	  char_bigram(να)
	-0.1464	char_trigram(_εν)			0.1212	  char_bigram(_ξ)
	-0.1449	          char(π)			0.1206	          char(χ)
	-0.1382	char_trigram(θκι)			0.1147	  char_bigram(τα)
	-0.1362	          char(ζ)			0.1132	char_trigram(_τα)
	-0.1335	  char_bigram(_λ)			0.1106	  char_bigram(ικ)
	-0.1331	char_trigram(_λα)			0.1078	char_trigram(ιστ)
	-0.1299	  char_bigram(φκ)			0.1074	          char(δ)
	-0.1257	  char_bigram(νν)			0.1070	char_trigram(θελ)
	-0.1216	char_trigram(αμα)			0.1070	char_trigram(μαλ)
	-0.1

### 6.3 Logistic Regression classifier

In [20]:
clf_logisticRegression = LogisticRegression() # Again, dual = True. Default solver = 'liblinear'. It's recommended for smaller databases. For bigger databases, 'saga' could be used.
clf_logisticRegression.fit(train_set_vectors, train_set_labels) 

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [21]:
clf_logisticRegression_predictions = clf_logisticRegression.predict(test_set_vectors)

print('\t\t\tPERFORMANCE\n')
print('Accuracy:', round(accuracy_score(test_set_labels, clf_logisticRegression_predictions), 2), '\n')

print(classification_report(test_set_labels, clf_logisticRegression_predictions))

cmatrix = confusion_matrix(test_set_labels, clf_logisticRegression_predictions)
show_confusion_matrix(cmatrix)

			PERFORMANCE

Accuracy: 0.91 

             precision    recall  f1-score   support

         CG       0.94      0.89      0.92       109
        SMG       0.89      0.94      0.91        99

avg / total       0.92      0.91      0.91       208

	         Predicted
	        CG       SMG
	     -------- --------
	CG  |   97   |   12  
Actual	     -------- --------
	SMG |   6    |   93  


In [22]:
show_most_informative_features(count_vect, clf_logisticRegression, n=20)

		    CG						    SMG

	-1.2587	char_trigram(εν_)			0.6666	        word(και)
	-1.1392	         word(εν)			0.4642	char_trigram(δεν)
	-1.0289	  char_bigram(τζ)			0.4625	  char_bigram(λυ)
	-0.9605	char_trigram(_τζ)			0.4611	        word(δεν)
	-0.8454	  char_bigram(αμ)			0.4540	char_trigram(ινα)
	-0.6719	  char_bigram(θκ)			0.4383	  char_bigram(_β)
	-0.6625	  char_bigram(_ε)			0.4321	  char_bigram(να)
	-0.6533	char_trigram(_εν)			0.4241	          char(χ)
	-0.5861	char_trigram(καμ)			0.4208	          char(δ)
	-0.5038	          char(ζ)			0.4150	char_trigram(και)
	-0.4929	char_trigram(θκι)			0.4109	char_trigram(αλλ)
	-0.4926	char_trigram(τζι)			0.4021	  char_bigram(τα)
	-0.4795	char_trigram(_ου)			0.3963	char_trigram(_απ)
	-0.4702	  char_bigram(ζι)			0.3951	  char_bigram(ικ)
	-0.4546	  char_bigram(φκ)			0.3829	char_trigram(τασ)
	-0.4521	char_trigram(αμα)			0.3620	char_trigram(ιστ)
	-0.4311	          char(π)			0.3611	      word(ειναι)
	-0.4307	char_trigram(_λα)			0.3548	char_trigram(αυτ)
	-0.4

**It seems that the classification algorithm with the best performance is *Multinomial Naive Bayes***.

## 7. Analyzing misclassifications made by the Multinomial Naive Bayes classifier

In [23]:
print('MISCLASSIFICATIONS\n')

misclassificationCount = 0

for i, sent in enumerate(test_set_sents):
    if test_set_labels[i] != clf_multinomialNB_predictions[i]:
        misclassificationCount += 1
        print(f'{misclassificationCount}.', sent, f'(CORRECT = {test_set_labels[i]},', f'PREDICTED = {clf_multinomialNB_predictions[i]})\n')

MISCLASSIFICATIONS

1. ολοι ετσι κανουν (CORRECT = SMG, PREDICTED = CG)

2. μιλω σας ποιος με ειδεν και δεν με φοβηθηκε (CORRECT = CG, PREDICTED = SMG)

3. το προβλημα με το παρανομο κυνηγι εξω εχει παραγινει (CORRECT = SMG, PREDICTED = CG)

4. αν εμεις δεν καταφεραμε τοτε οσα λαχταρησαμε δεν ξεσπαμε πανω τους (CORRECT = SMG, PREDICTED = CG)

5. παστιτσιο και παντζαρια σαλατα το μενου για σημερα καλη μας ορεξη (CORRECT = SMG, PREDICTED = CG)

6. πολλα φοουμαι πως το ακελ κατερριψε και αυτο (CORRECT = CG, PREDICTED = SMG)

7. ισως γιατι δεν θυμομαστε καλα (CORRECT = SMG, PREDICTED = CG)

8. τελικα οι πορνες δεν ηταν και τοσο αλλοδαπες αλλα τι σημασια εχει η δουλεια εγινε (CORRECT = SMG, PREDICTED = CG)

9. στη ζωη μου εισαι γουρι στο τζατζικι το αγγουρι (CORRECT = SMG, PREDICTED = CG)



## 8. Trying the Multinomial Naive Bayes classifier with custom input

First, a more powerful version of the classifier is built by using all the data available:

In [24]:
full_set_sents = [sent[0] for sent in all_sents_labeled]
full_set_labels = [sent[1] for sent in all_sents_labeled]
full_set_vectors = count_vect.fit_transform(full_set_sents)

clf_super_multinomialNB = MultinomialNB()
clf_super_multinomialNB.fit(full_set_vectors, full_set_labels) 

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

Trying 2 custom sentences:

In [69]:
cgSent = 'Η Κύπρος εν που τες πιο όμορφες χώρες.'
smgSent = 'Η Κύπρος είναι από τις πιο όμορφες χώρες.'

demoSentences = [cgSent, smgSent]

cgSent = get_clean_sent_el(cgSent)
smgSent = get_clean_sent_el(smgSent)

test_vec = count_vect.transform([cgSent, smgSent])

for sentenceNumber, predictionArr in enumerate(clf_super_multinomialNB.predict_proba(test_vec)):
    print(f'SENTENCE {sentenceNumber + 1}: “{demoSentences[sentenceNumber]}”')
    if predictionArr[0] > predictionArr[1]:
        print(f'PREDICTION: Cypriot Greek (Confidence: {predictionArr[0]:.2f})\n')
    else:
        print(f'PREDICTION: Standard Modern Greek (Confidence: {predictionArr[1]:.2f})\n')

SENTENCE 1: “Η Κύπρος εν που τες πιο όμορφες χώρες.”
PREDICTION: Cypriot Greek (Confidence: 1.00)

SENTENCE 2: “Η Κύπρος είναι από τις πιο όμορφες χώρες.”
PREDICTION: Standard Modern Greek (Confidence: 1.00)

