# Building the Classifier

## 1. Loading the corpus

In [88]:
import re
from nltk import sent_tokenize

cg_sents = []
smg_sents = []

def remove_duplicate_punctuation(s):
    return(re.sub(r'(\.|\?|!|;)\1+', r'\1 ', s))

with open('./Data/cg_twitter.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    text = re.sub(r'([a-zα-ωίϊΐόάέύϋΰήώ])(\.|\?|;|!)([A-ZΑ-ΩΆΈΊΌΎΏΉ])', r'\1\2 \3', text)
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        cg_sents += sent_tokenize(line)
    
with open('./Data/cg_fb.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    text = re.sub(r'([a-zα-ωίϊΐόάέύϋΰήώ])(\.|\?|;|!)([A-ZΑ-ΩΆΈΊΌΎΏΉ])', r'\1\2 \3', text)
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        cg_sents += sent_tokenize(line)
    
with open('./Data/cg_other.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    text = re.sub(r'([a-zα-ωίϊΐόάέύϋΰήώ])(\.|\?|;|!)([A-ZΑ-ΩΆΈΊΌΎΏΉ])', r'\1\2 \3', text)
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        cg_sents += sent_tokenize(line)

with open('./Data/smg_twitter.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    text = re.sub(r'([a-zα-ωίϊΐόάέύϋΰήώ])(\.|\?|;|!)([A-ZΑ-ΩΆΈΊΌΎΏΉ])', r'\1\2 \3', text)
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        smg_sents += sent_tokenize(line)
    
with open('./Data/smg_fb.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    text = re.sub(r'([a-zα-ωίϊΐόάέύϋΰήώ])(\.|\?|;|!)([A-ZΑ-ΩΆΈΊΌΎΏΉ])', r'\1\2 \3', text)
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        smg_sents += sent_tokenize(line)
    
with open('./Data/smg_other.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    text = re.sub(r'([a-zα-ωίϊΐόάέύϋΰήώ])(\.|\?|;|!)([A-ZΑ-ΩΆΈΊΌΎΏΉ])', r'\1\2 \3', text)
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        smg_sents += sent_tokenize(line)

cg_sents[:3]

['Πρασινο αυκουι μες το πασχαλινο ποτήρι που έπιασε ο μιτσης #αισχος 🤣🤣🤣   @HARRIS_APOEL https://t.co/y9X7CmBEa5',
 '@HARRIS_APOEL @pirpoitis @vassrules Καμνουν πολλα ανακαινιση στα Περβολια .',
 '@MUFCChristian Ελα συγγενη τζιαι εχουμε νεοτερα π το Νικολη.']

## 2. Cleaning the text

In [89]:
import unicodedata
from string import punctuation
from nltk.tokenize import WhitespaceTokenizer

punctuation += '´΄’…“”–—―»«'

def strip_accents(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')

def get_clean_sent_el(sentence):
    sentence = re.sub(r'^RT', '', sentence)
    sentence = re.sub(r'\&\w*;', '', sentence)
    sentence = re.sub(r'\@\w*', '', sentence)
    sentence = re.sub(r'\$\w*', '', sentence)
    sentence = re.sub(r'https?:\/\/.*\/\w*', '', sentence)
    sentence = ''.join(c for c in sentence if c <= '\uFFFF')
    sentence = strip_accents(sentence)
    sentence = re.sub(r'#\w*', '', sentence)
    sentence = sentence.lower()
    tokens = WhitespaceTokenizer().tokenize(sentence)
    new_tokens = []
    for token in tokens:
        if token == 'ο,τι' or token == 'ό,τι' or token == 'o,ti' or token == 'ó,ti':
            new_tokens.append(token)
        else:
            token = re.sub(r'(?<=[.,!\?;\'΄´])(?=[^\s])', r' ', token)
            new_token = token.translate(str.maketrans({key: None for key in punctuation}))
            if (new_token != ''):
                new_tokens.append(new_token)
    sentence =' '.join(new_tokens)
    sentence = re.sub('\ufeff', '', sentence)
    sentence = sentence.strip(' ')
    sentence = re.sub('  ', ' ', sentence)
    return sentence

cg_sents_clean = []
smg_sents_clean = []

for sent in cg_sents:
    cg_sents_clean.append(get_clean_sent_el(sent))
for sent in smg_sents:
    smg_sents_clean.append(get_clean_sent_el(sent))

cg_sents_clean = list(filter(None, cg_sents_clean))
smg_sents_clean = list(filter(None, smg_sents_clean))
cg_sents_clean[:3]

['πρασινο αυκουι μες το πασχαλινο ποτηρι που επιασε ο μιτσης',
 'καμνουν πολλα ανακαινιση στα περβολια',
 'ελα συγγενη τζιαι εχουμε νεοτερα π το νικολη']

## 3. Building the feature extractor

In [90]:
from nltk import ngrams

def get_word_ngrams(tokens, n):
    ngrams_list = []
    ngrams_list.append(list(ngrams(tokens, n)))
    ngrams_flat_tuples = [ngram for ngram_list in ngrams_list for ngram in ngram_list]
    format_string = '%s'
    for i in range(1, n):
        format_string += (' %s')
    ngrams_list_flat = [format_string % ngram_tuple for ngram_tuple in ngrams_flat_tuples]
    return ngrams_list_flat

def get_char_ngrams(word, n):
    ngrams_list = []
    word = re.sub(r'ς', 'σ', word)
    ngrams_list.append(list(ngrams(word, n, pad_left=True, pad_right=True, left_pad_symbol='_', right_pad_symbol='_')))
    
    # Removing redundant ngrams:
    if (n > 2):
        redundant_combinations = n - 2
        ngrams_list = [ngram_list[redundant_combinations : -redundant_combinations] for ngram_list in ngrams_list]
    
    ngrams_flat_tuples = [ngram for ngram_list in ngrams_list for ngram in ngram_list]
    format_string = ''
    for i in range(0, n):
        format_string += ('%s')
    ngrams_list_flat = [format_string % ngram_tuple for ngram_tuple in ngrams_flat_tuples]
    return ngrams_list_flat

In [91]:
# Feature extractor
def get_ngram_features(sent): # The reason I do not use NLTK's everygrams to extract the features quickly is because the behavior of my n-gram extractor is modified to remove redundant n-grams. Also, I need to label word and char n-grams to avoid ambiguity
    sentence_tokens = WhitespaceTokenizer().tokenize(sent)
    
    features = {}
    
    # Word unigrams
    ngrams = get_word_ngrams(sentence_tokens, 1)
    for ngram in ngrams:
        features[f'word({ngram})'] = features.get(f'word({ngram})', 0) + 1 # The second parameter to .get() is a default value if the key doesn't exist.
    
    # Word bigrams
    ngrams = get_word_ngrams(sentence_tokens, 2)
    for ngram in ngrams:
        features[f'word_bigram({ngram})'] = features.get(f'word_bigram({ngram})', 0) + 1
    
    # Char unigrams
    for word in sentence_tokens:
        ngrams = get_char_ngrams(word, 1)
        for ngram in ngrams:
            features[f'char({ngram})'] = features.get(f'char({ngram})', 0) + 1
    
    # Char bigrams
    for word in sentence_tokens:
        ngrams = get_char_ngrams(word, 2)
        for ngram in ngrams:
            features[f'char_bigram({ngram})'] = features.get(f'char_bigram({ngram})', 0) + 1
    
    # Char trigrams
    for word in sentence_tokens:
        ngrams = get_char_ngrams(word, 3)
        for ngram in ngrams:
            features[f'char_trigram({ngram})'] = features.get(f'char_trigram({ngram})', 0) + 1
    
    return features

get_ngram_features('αυτη ειναι η σπαρτη')

{'char(α)': 3,
 'char(ε)': 1,
 'char(η)': 3,
 'char(ι)': 2,
 'char(ν)': 1,
 'char(π)': 1,
 'char(ρ)': 1,
 'char(σ)': 1,
 'char(τ)': 2,
 'char(υ)': 1,
 'char_bigram(_α)': 1,
 'char_bigram(_ε)': 1,
 'char_bigram(_η)': 1,
 'char_bigram(_σ)': 1,
 'char_bigram(αι)': 1,
 'char_bigram(αρ)': 1,
 'char_bigram(αυ)': 1,
 'char_bigram(ει)': 1,
 'char_bigram(η_)': 3,
 'char_bigram(ι_)': 1,
 'char_bigram(ιν)': 1,
 'char_bigram(να)': 1,
 'char_bigram(πα)': 1,
 'char_bigram(ρτ)': 1,
 'char_bigram(σπ)': 1,
 'char_bigram(τη)': 2,
 'char_bigram(υτ)': 1,
 'char_trigram(_αυ)': 1,
 'char_trigram(_ει)': 1,
 'char_trigram(_η_)': 1,
 'char_trigram(_σπ)': 1,
 'char_trigram(αι_)': 1,
 'char_trigram(αρτ)': 1,
 'char_trigram(αυτ)': 1,
 'char_trigram(ειν)': 1,
 'char_trigram(ινα)': 1,
 'char_trigram(ναι)': 1,
 'char_trigram(παρ)': 1,
 'char_trigram(ρτη)': 1,
 'char_trigram(σπα)': 1,
 'char_trigram(τη_)': 2,
 'char_trigram(υτη)': 1,
 'word(αυτη)': 1,
 'word(ειναι)': 1,
 'word(η)': 1,
 'word(σπαρτη)': 1,
 'word_bigra

In [92]:
# from nltk import everygrams

# def sent_process(sent):
#     return [''.join(ng) for ng in everygrams(sent.replace(' ', '_ _'), 1, 4) 
#             if ' ' not in ng and '\n' not in ng and ng != ('_',)]

# sent_process('αυτη ειναι η σπαρτη')

## 4. Creating the training and test sets

In [93]:
import random

all_sents_labeled = ([(sentence, 'CG') for sentence in cg_sents_clean] + [(sentence, 'SMG') for sentence in smg_sents_clean])
random.shuffle(all_sents_labeled)
all_sents_labeled[0]

('αρεσκει μου που λαλεις αρκετα πολυπλοκα πραματα αλλα με απλη γλωσσαν', 'CG')

In [94]:
NO_ALL_SENTENCES = len(all_sents_labeled)
NO_TRAIN_SENTENCES = round(NO_ALL_SENTENCES * .8)

train_set = all_sents_labeled[:NO_TRAIN_SENTENCES]
test_set = all_sents_labeled[NO_TRAIN_SENTENCES:]

train_set_sents = [sent[0] for sent in train_set]
train_set_labels = [sent[1] for sent in train_set]
test_set_sents = [sent[0] for sent in test_set]
test_set_labels = [sent[1] for sent in test_set]

print(train_set_sents[0], train_set_labels[0])

αρεσκει μου που λαλεις αρκετα πολυπλοκα πραματα αλλα με απλη γλωσσαν CG


In [95]:
print('DATASET\t', 'SENTENCES')
print('All\t', NO_ALL_SENTENCES)
print('Training', NO_TRAIN_SENTENCES)
print('Testing\t', NO_ALL_SENTENCES - NO_TRAIN_SENTENCES)

DATASET	 SENTENCES
All	 1039
Training 831
Testing	 208


## 5. Vectorization

In [96]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(analyzer=get_ngram_features)

train_set_vectors = count_vect.fit_transform(train_set_sents)
test_set_vectors = count_vect.transform(test_set_sents) # Unlike fit_transform(), transform() does not change the count vectorizer's vocabulary so it should be used for the test set.
train_set_vectors

<831x16170 sparse matrix of type '<class 'numpy.int64'>'
	with 125971 stored elements in Compressed Sparse Row format>

In [97]:
from numpy import set_printoptions, nan
set_printoptions(threshold=nan) # Prints whole array. Required because by default an array with thousands of elements wouldn't be printed in full.

train_set_vectors.toarray()[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [98]:
count_vect.vocabulary_ # The numbers are not counts but indices.

{'word(αρεσκει)': 3828,
 'word(μου)': 5668,
 'word(που)': 6293,
 'word(λαλεις)': 5412,
 'word(αρκετα)': 3840,
 'word(πολυπλοκα)': 6264,
 'word(πραματα)': 6317,
 'word(αλλα)': 3616,
 'word(με)': 5559,
 'word(απλη)': 3769,
 'word(γλωσσαν)': 4116,
 'word_bigram(αρεσκει μου)': 7774,
 'word_bigram(μου που)': 11762,
 'word_bigram(που λαλεις)': 13422,
 'word_bigram(λαλεις αρκετα)': 10995,
 'word_bigram(αρκετα πολυπλοκα)': 7789,
 'word_bigram(πολυπλοκα πραματα)': 13261,
 'word_bigram(πραματα αλλα)': 13533,
 'word_bigram(αλλα με)': 7391,
 'word_bigram(με απλη)': 11299,
 'word_bigram(απλη γλωσσαν)': 7678,
 'char(α)': 8,
 'char(ρ)': 24,
 'char(ε)': 12,
 'char(σ)': 25,
 'char(κ)': 17,
 'char(ι)': 16,
 'char(μ)': 19,
 'char(ο)': 22,
 'char(υ)': 27,
 'char(π)': 23,
 'char(λ)': 18,
 'char(τ)': 26,
 'char(η)': 14,
 'char(γ)': 10,
 'char(ω)': 31,
 'char(ν)': 20,
 'char_bigram(_α)': 36,
 'char_bigram(αρ)': 88,
 'char_bigram(ρε)': 338,
 'char_bigram(εσ)': 150,
 'char_bigram(σκ)': 363,
 'char_bigram(κε)':

In [99]:
len(count_vect.vocabulary_) # This is the same as the length of each vector.

16170

## 6. Building the classifiers

In [100]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def show_confusion_matrix(cm):
    print('\t         Predicted')
    print('\t        CG       SMG')
    print('\t     -------- --------')
    print('\tCG  | {:^6} | {:^6}'.format(cm[0][0], cm[1][0]))
    print('Actual\t     -------- --------')
    print('\tSMG | {:^6} | {:^6}'.format(cm[1][0], cm[1][1]))

def show_most_informative_features(vectorizer, clf, n=10):
    print("\t\t    CG\t\t\t\t\t\t    SMG\n")
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%17s\t\t\t%.4f\t%17s" % (coef_1, fn_1, coef_2, fn_2))

### 6.1 Multinomial Naive Bayes classifier

In [101]:
clf_multinomialNB = MultinomialNB() # There are no params for MultinomialDB that prevent overfitting, so any overfitting is caused by the small dataset size.
clf_multinomialNB.fit(train_set_vectors, train_set_labels) 

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [102]:
clf_multinomialNB_predictions = clf_multinomialNB.predict(test_set_vectors)

print('\t\t\tPERFORMANCE\n')
print('Accuracy:', round(accuracy_score(test_set_labels, clf_multinomialNB_predictions), 2), '\n')

print(classification_report(test_set_labels, clf_multinomialNB_predictions))

cmatrix = confusion_matrix(test_set_labels, clf_multinomialNB_predictions)
show_confusion_matrix(cmatrix)

			PERFORMANCE

Accuracy: 0.96 

             precision    recall  f1-score   support

         CG       0.97      0.95      0.96       108
        SMG       0.95      0.97      0.96       100

avg / total       0.96      0.96      0.96       208

	         Predicted
	        CG       SMG
	     -------- --------
	CG  |  103   |   3   
Actual	     -------- --------
	SMG |   3    |   97  


In [103]:
show_most_informative_features(count_vect, clf_multinomialNB, n=20)

		    CG						    SMG

	-11.2933	  char_bigram(αα)			-5.3297	          char(α)
	-11.2933	  char_bigram(αο)			-5.3375	          char(ι)
	-11.2933	  char_bigram(β_)			-5.3401	          char(ο)
	-11.2933	  char_bigram(δ_)			-5.3427	          char(ε)
	-11.2933	  char_bigram(δκ)			-5.3558	          char(τ)
	-11.2933	  char_bigram(ηα)			-5.3611	          char(ν)
	-11.2933	  char_bigram(ηβ)			-5.3717	          char(σ)
	-11.2933	  char_bigram(ηε)			-5.4100	  char_bigram(α_)
	-11.2933	  char_bigram(ηο)			-5.4268	          char(κ)
	-11.2933	  char_bigram(ηυ)			-5.4297	          char(ρ)
	-11.2933	  char_bigram(θθ)			-5.4382	          char(υ)
	-11.2933	  char_bigram(θκ)			-5.4527	          char(μ)
	-11.2933	  char_bigram(ιυ)			-5.4644	          char(π)
	-11.2933	  char_bigram(νχ)			-5.4822	  char_bigram(ι_)
	-11.2933	  char_bigram(οα)			-5.4912	          char(η)
	-11.2933	  char_bigram(οζ)			-5.5126	          char(λ)
	-11.2933	  char_bigram(πδ)			-5.5312	  char_bigram(_τ)
	-11.2933	  char_bigram(π

### 6.2 Linear Support Vector classifier

In [104]:
clf_linearSVC = LinearSVC(max_iter=1500) # n_samples < n_features in training set so the dual param is kept at its default value of True. Default max_iter = 1000
clf_linearSVC.fit(train_set_vectors, train_set_labels) 

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1500,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [105]:
clf_linearSVC_predictions = clf_linearSVC.predict(test_set_vectors)

print('\t\t\tPERFORMANCE\n')
print('Accuracy:', round(accuracy_score(test_set_labels, clf_linearSVC_predictions), 2), '\n')

print(classification_report(test_set_labels, clf_linearSVC_predictions))

cmatrix = confusion_matrix(test_set_labels, clf_linearSVC_predictions)
show_confusion_matrix(cmatrix)

			PERFORMANCE

Accuracy: 0.9 

             precision    recall  f1-score   support

         CG       0.95      0.86      0.90       108
        SMG       0.86      0.95      0.90       100

avg / total       0.91      0.90      0.90       208

	         Predicted
	        CG       SMG
	     -------- --------
	CG  |   93   |   5   
Actual	     -------- --------
	SMG |   5    |   95  


In [106]:
show_most_informative_features(count_vect, clf_linearSVC, n=20)

		    CG						    SMG

	-0.3405	char_trigram(εν_)			0.1580	        word(δεν)
	-0.3014	         word(εν)			0.1580	char_trigram(δεν)
	-0.2532	  char_bigram(τζ)			0.1455	        word(και)
	-0.2526	char_trigram(_τζ)			0.1275	          char(ι)
	-0.1977	char_trigram(καμ)			0.1262	  char_bigram(_ξ)
	-0.1921	  char_bigram(αμ)			0.1202	char_trigram(σαι)
	-0.1842	  char_bigram(_ε)			0.1198	char_trigram(με_)
	-0.1807	char_trigram(_εν)			0.1188	  char_bigram(_β)
	-0.1671	  char_bigram(θκ)			0.1147	char_trigram(_ξε)
	-0.1529	char_trigram(τι_)			0.1146	char_trigram(_ολ)
	-0.1456	  char_bigram(φκ)			0.1126	char_trigram(ινα)
	-0.1413	char_trigram(του)			0.1118	  char_bigram(λο)
	-0.1328	char_trigram(_λα)			0.1096	char_trigram(ολο)
	-0.1267	char_trigram(τζι)			0.1090	  char_bigram(ικ)
	-0.1227	char_trigram(_εσ)			0.1080	  char_bigram(η_)
	-0.1227	char_trigram(θκι)			0.1060	  char_bigram(στ)
	-0.1224	char_trigram(λαλ)			0.1044	          char(ξ)
	-0.1213	  char_bigram(κο)			0.1044	          char(ν)
	-0.1

### 6.3 Logistic Regression classifier

In [107]:
clf_logisticRegression = LogisticRegression() # Again, dual = True. Default solver = 'liblinear'. It's recommended for smaller databases. For bigger databases, 'saga' could be used.
clf_logisticRegression.fit(train_set_vectors, train_set_labels) 

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [108]:
clf_logisticRegression_predictions = clf_logisticRegression.predict(test_set_vectors)

print('\t\t\tPERFORMANCE\n')
print('Accuracy:', round(accuracy_score(test_set_labels, clf_logisticRegression_predictions), 2), '\n')

print(classification_report(test_set_labels, clf_logisticRegression_predictions))

cmatrix = confusion_matrix(test_set_labels, clf_logisticRegression_predictions)
show_confusion_matrix(cmatrix)

			PERFORMANCE

Accuracy: 0.92 

             precision    recall  f1-score   support

         CG       0.96      0.89      0.92       108
        SMG       0.89      0.96      0.92       100

avg / total       0.93      0.92      0.92       208

	         Predicted
	        CG       SMG
	     -------- --------
	CG  |   96   |   4   
Actual	     -------- --------
	SMG |   4    |   96  


In [109]:
show_most_informative_features(count_vect, clf_logisticRegression, n=20)

		    CG						    SMG

	-1.2418	char_trigram(εν_)			0.6104	        word(και)
	-1.1536	         word(εν)			0.5329	char_trigram(δεν)
	-0.9735	  char_bigram(τζ)			0.5308	        word(δεν)
	-0.9216	char_trigram(_τζ)			0.4690	char_trigram(και)
	-0.7476	char_trigram(_εν)			0.4370	  char_bigram(_α)
	-0.7312	  char_bigram(αμ)			0.4269	  char_bigram(στ)
	-0.6460	  char_bigram(_ε)			0.4215	          char(δ)
	-0.6324	char_trigram(καμ)			0.4158	char_trigram(ινα)
	-0.6103	  char_bigram(θκ)			0.3878	  char_bigram(ικ)
	-0.5348	char_trigram(τζι)			0.3740	  char_bigram(_β)
	-0.5227	  char_bigram(φκ)			0.3584	  char_bigram(_ξ)
	-0.4877	  char_bigram(ζι)			0.3435	char_trigram(_ολ)
	-0.4753	char_trigram(θκι)			0.3397	char_trigram(με_)
	-0.4476	char_trigram(τι_)			0.3329	char_trigram(_ξε)
	-0.4426	  char_bigram(σι)			0.3303	  char_bigram(γα)
	-0.4396	char_trigram(_λα)			0.3250	          char(ξ)
	-0.4389	char_trigram(του)			0.3237	char_trigram(_δε)
	-0.4302	char_trigram(αν_)			0.3234	  char_bigram(λυ)
	-0.4

**It seems that the classification algorithm with the best performance is *Multinomial Naive Bayes***.

## 7. Analyzing misclassifications made by the Multinomial Naive Bayes classifier

In [112]:
print('MISCLASSIFICATIONS\n')

misclassificationCount = 0

for i, sent in enumerate(test_set_sents):
    if test_set_labels[i] != clf_multinomialNB_predictions[i]:
        misclassificationCount += 1
        print(f'{misclassificationCount}.', sent, f'(CORRECT = {test_set_labels[i]},', f'PREDICTED = {clf_multinomialNB_predictions[i]})\n')

MISCLASSIFICATIONS

1. ισως γιατι δεν θυμομαστε καλα (CORRECT = SMG, PREDICTED = CG)

2. πολλα φοουμαι πως το ακελ κατερριψε και αυτο (CORRECT = CG, PREDICTED = SMG)

3. εκτος που τον ταταν του δεν τον υποστηριζουν καν οι φιλελευθεροι (CORRECT = CG, PREDICTED = SMG)

4. φιλε μου σου ζητω ταπεινα συγγνωμη (CORRECT = SMG, PREDICTED = CG)

5. μπορουμε να ακουμε μουσικη την ωρα που δουλευουμε (CORRECT = SMG, PREDICTED = CG)

6. εν καθηγητης ο πελλος (CORRECT = CG, PREDICTED = SMG)

7. κατ εμεναν η απαντηση στο πρωτον ερωτημαν εν προφανως ναι (CORRECT = CG, PREDICTED = SMG)

8. επαρχια λεμεσου κ εχτες και σημερα ηβραμε (CORRECT = CG, PREDICTED = SMG)



## 8. Trying the Multinomial Naive Bayes classifier with custom input

In [113]:
cgSent = 'Η Κύπρος εν που τες πιο όμορφες χώρες.'
smgSent = 'Η Κύπρος είναι από τις πιο όμορφες χώρες.'

cgSent = get_clean_sent_el(cgSent)
smgSent = get_clean_sent_el(smgSent)

test_vec = count_vect.transform([cgSent, smgSent])
clf_multinomialNB.predict(test_vec)

array(['CG', 'SMG'], dtype='<U3')