# Building the Classifier

## 1. Loading the corpus

In [1]:
import re
from nltk import sent_tokenize

cg_sents = []
smg_sents = []

def remove_duplicate_punctuation(s):
    return(re.sub(r'(\.|\?|!|;)\1+', r'\1 ', s))

with open('./Data/cg_twitter.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    text = re.sub(r'([a-zα-ωίϊΐόάέύϋΰήώ])(\.|\?|;|!)([A-ZΑ-ΩΆΈΊΌΎΏΉ])', r'\1\2 \3', text)
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        cg_sents += sent_tokenize(line)
    
with open('./Data/cg_fb.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    text = re.sub(r'([a-zα-ωίϊΐόάέύϋΰήώ])(\.|\?|;|!)([A-ZΑ-ΩΆΈΊΌΎΏΉ])', r'\1\2 \3', text)
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        cg_sents += sent_tokenize(line)
    
with open('./Data/cg_other.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    text = re.sub(r'([a-zα-ωίϊΐόάέύϋΰήώ])(\.|\?|;|!)([A-ZΑ-ΩΆΈΊΌΎΏΉ])', r'\1\2 \3', text)
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        cg_sents += sent_tokenize(line)

with open('./Data/smg_twitter.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    text = re.sub(r'([a-zα-ωίϊΐόάέύϋΰήώ])(\.|\?|;|!)([A-ZΑ-ΩΆΈΊΌΎΏΉ])', r'\1\2 \3', text)
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        smg_sents += sent_tokenize(line)
    
with open('./Data/smg_fb.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    text = re.sub(r'([a-zα-ωίϊΐόάέύϋΰήώ])(\.|\?|;|!)([A-ZΑ-ΩΆΈΊΌΎΏΉ])', r'\1\2 \3', text)
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        smg_sents += sent_tokenize(line)
    
with open('./Data/smg_other.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    text = re.sub(r'([a-zα-ωίϊΐόάέύϋΰήώ])(\.|\?|;|!)([A-ZΑ-ΩΆΈΊΌΎΏΉ])', r'\1\2 \3', text)
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        smg_sents += sent_tokenize(line)

cg_sents[:3]

['Πρασινο αυκουι μες το πασχαλινο ποτήρι που έπιασε ο μιτσης #αισχος 🤣🤣🤣   @HARRIS_APOEL https://t.co/y9X7CmBEa5',
 '@HARRIS_APOEL @pirpoitis @vassrules Καμνουν πολλα ανακαινιση στα Περβολια .',
 '@MUFCChristian Ελα συγγενη τζιαι εχουμε νεοτερα π το Νικολη.']

## 2. Cleaning the text

In [2]:
import unicodedata
from string import punctuation
from nltk.tokenize import WhitespaceTokenizer

punctuation += '´΄’…“”–—―»«'

def strip_accents(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')

def get_clean_sent_el(sentence):
    sentence = re.sub(r'^RT', '', sentence)
    sentence = re.sub(r'\&\w*;', '', sentence)
    sentence = re.sub(r'\@\w*', '', sentence)
    sentence = re.sub(r'\$\w*', '', sentence)
    sentence = re.sub(r'https?:\/\/.*\/\w*', '', sentence)
    sentence = ''.join(c for c in sentence if c <= '\uFFFF')
    sentence = strip_accents(sentence)
    sentence = re.sub(r'#\w*', '', sentence)
    sentence = sentence.lower()
    tokens = WhitespaceTokenizer().tokenize(sentence)
    new_tokens = []
    for token in tokens:
        if token == 'ο,τι' or token == 'ό,τι' or token == 'o,ti' or token == 'ó,ti':
            new_tokens.append(token)
        else:
            token = re.sub(r'(?<=[.,!\?;\'΄´])(?=[^\s])', r' ', token)
            new_token = token.translate(str.maketrans({key: None for key in punctuation}))
            if (new_token != ''):
                new_tokens.append(new_token)
    sentence =' '.join(new_tokens)
    sentence = re.sub('\ufeff', '', sentence)
    sentence = sentence.strip(' ')
    sentence = re.sub('  ', ' ', sentence)
    return sentence

cg_sents_clean = []
smg_sents_clean = []

for sent in cg_sents:
    cg_sents_clean.append(get_clean_sent_el(sent))
for sent in smg_sents:
    smg_sents_clean.append(get_clean_sent_el(sent))

cg_sents_clean = list(filter(None, cg_sents_clean))
smg_sents_clean = list(filter(None, smg_sents_clean))
cg_sents_clean[:3]

['πρασινο αυκουι μες το πασχαλινο ποτηρι που επιασε ο μιτσης',
 'καμνουν πολλα ανακαινιση στα περβολια',
 'ελα συγγενη τζιαι εχουμε νεοτερα π το νικολη']

## 3. Building the feature extractor

In [3]:
from nltk import ngrams

def get_word_ngrams(tokens, n):
    ngrams_list = []
    ngrams_list.append(list(ngrams(tokens, n)))
    ngrams_flat_tuples = [ngram for ngram_list in ngrams_list for ngram in ngram_list]
    format_string = '%s'
    for i in range(1, n):
        format_string += (' %s')
    ngrams_list_flat = [format_string % ngram_tuple for ngram_tuple in ngrams_flat_tuples]
    return ngrams_list_flat

def get_char_ngrams(word, n):
    ngrams_list = []
    word = re.sub(r'ς', 'σ', word)
    ngrams_list.append(list(ngrams(word, n, pad_left=True, pad_right=True, left_pad_symbol='_', right_pad_symbol='_')))
    
    # Removing redundant ngrams:
    if (n > 2):
        redundant_combinations = n - 2
        ngrams_list = [ngram_list[redundant_combinations : -redundant_combinations] for ngram_list in ngrams_list]
    
    ngrams_flat_tuples = [ngram for ngram_list in ngrams_list for ngram in ngram_list]
    format_string = ''
    for i in range(0, n):
        format_string += ('%s')
    ngrams_list_flat = [format_string % ngram_tuple for ngram_tuple in ngrams_flat_tuples]
    return ngrams_list_flat

In [4]:
# Feature extractor
def get_ngram_features(sent): # The reason I do not use NLTK's everygrams to extract the features quickly is because the behavior of my n-gram extractor is modified to remove redundant n-grams. Also, I need to label word and char n-grams to avoid ambiguity
    sentence_tokens = WhitespaceTokenizer().tokenize(sent)
    
    features = {}
    
    # Word unigrams
    ngrams = get_word_ngrams(sentence_tokens, 1)
    for ngram in ngrams:
        features[f'word({ngram})'] = features.get(f'word({ngram})', 0) + 1 # The second parameter to .get() is a default value if the key doesn't exist.
    
    # Word bigrams
    ngrams = get_word_ngrams(sentence_tokens, 2)
    for ngram in ngrams:
        features[f'word_bigram({ngram})'] = features.get(f'word_bigram({ngram})', 0) + 1
    
    # Char unigrams
    for word in sentence_tokens:
        ngrams = get_char_ngrams(word, 1)
        for ngram in ngrams:
            features[f'char({ngram})'] = features.get(f'char({ngram})', 0) + 1
    
    # Char bigrams
    for word in sentence_tokens:
        ngrams = get_char_ngrams(word, 2)
        for ngram in ngrams:
            features[f'char_bigram({ngram})'] = features.get(f'char_bigram({ngram})', 0) + 1
    
    # Char trigrams
    for word in sentence_tokens:
        ngrams = get_char_ngrams(word, 3)
        for ngram in ngrams:
            features[f'char_trigram({ngram})'] = features.get(f'char_trigram({ngram})', 0) + 1
    
    return features

get_ngram_features('αυτη ειναι η σπαρτη')

{'char(α)': 3,
 'char(ε)': 1,
 'char(η)': 3,
 'char(ι)': 2,
 'char(ν)': 1,
 'char(π)': 1,
 'char(ρ)': 1,
 'char(σ)': 1,
 'char(τ)': 2,
 'char(υ)': 1,
 'char_bigram(_α)': 1,
 'char_bigram(_ε)': 1,
 'char_bigram(_η)': 1,
 'char_bigram(_σ)': 1,
 'char_bigram(αι)': 1,
 'char_bigram(αρ)': 1,
 'char_bigram(αυ)': 1,
 'char_bigram(ει)': 1,
 'char_bigram(η_)': 3,
 'char_bigram(ι_)': 1,
 'char_bigram(ιν)': 1,
 'char_bigram(να)': 1,
 'char_bigram(πα)': 1,
 'char_bigram(ρτ)': 1,
 'char_bigram(σπ)': 1,
 'char_bigram(τη)': 2,
 'char_bigram(υτ)': 1,
 'char_trigram(_αυ)': 1,
 'char_trigram(_ει)': 1,
 'char_trigram(_η_)': 1,
 'char_trigram(_σπ)': 1,
 'char_trigram(αι_)': 1,
 'char_trigram(αρτ)': 1,
 'char_trigram(αυτ)': 1,
 'char_trigram(ειν)': 1,
 'char_trigram(ινα)': 1,
 'char_trigram(ναι)': 1,
 'char_trigram(παρ)': 1,
 'char_trigram(ρτη)': 1,
 'char_trigram(σπα)': 1,
 'char_trigram(τη_)': 2,
 'char_trigram(υτη)': 1,
 'word(αυτη)': 1,
 'word(ειναι)': 1,
 'word(η)': 1,
 'word(σπαρτη)': 1,
 'word_bigra

In [5]:
# from nltk import everygrams

# def sent_process(sent):
#     return [''.join(ng) for ng in everygrams(sent.replace(' ', '_ _'), 1, 4) 
#             if ' ' not in ng and '\n' not in ng and ng != ('_',)]

# sent_process('αυτη ειναι η σπαρτη')

## 4. Creating the training and test sets

In [6]:
import random

all_sents_labeled = ([(sentence, 'CG') for sentence in cg_sents_clean] + [(sentence, 'SMG') for sentence in smg_sents_clean])
random.shuffle(all_sents_labeled)
all_sents_labeled[0]

('μια απο τις πιο κοινες δικαιολογιες που οδηγει σε μεγαλες συμφορες ειναι τρεις λεξεις',
 'SMG')

In [7]:
NO_ALL_SENTENCES = len(all_sents_labeled)
NO_TRAIN_SENTENCES = round(NO_ALL_SENTENCES * .8)

train_set = all_sents_labeled[:NO_TRAIN_SENTENCES]
test_set = all_sents_labeled[NO_TRAIN_SENTENCES:]

train_set_sents = [sent[0] for sent in train_set]
train_set_labels = [sent[1] for sent in train_set]
test_set_sents = [sent[0] for sent in test_set]
test_set_labels = [sent[1] for sent in test_set]

print(train_set_sents[0], train_set_labels[0])

μια απο τις πιο κοινες δικαιολογιες που οδηγει σε μεγαλες συμφορες ειναι τρεις λεξεις SMG


In [8]:
print('DATASET\t', 'SENTENCES')
print('All\t', NO_ALL_SENTENCES)
print('Training', NO_TRAIN_SENTENCES)
print('Testing\t', NO_ALL_SENTENCES - NO_TRAIN_SENTENCES)

DATASET	 SENTENCES
All	 1039
Training 831
Testing	 208


## 5. Vectorization

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(analyzer=get_ngram_features)

train_set_vectors = count_vect.fit_transform(train_set_sents)
test_set_vectors = count_vect.transform(test_set_sents) # Unlike fit_transform(), transform() does not change the count vectorizer's vocabulary so it should be used for the test set.
train_set_vectors

<831x16241 sparse matrix of type '<class 'numpy.int64'>'
	with 126203 stored elements in Compressed Sparse Row format>

In [10]:
from numpy import set_printoptions, nan
set_printoptions(threshold=nan) # Prints whole array. Required because by default an array with thousands of elements wouldn't be printed in full.

train_set_vectors.toarray()[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,

In [11]:
count_vect.vocabulary_ # The numbers are not counts but indices.

{'word(μια)': 5665,
 'word(απο)': 3786,
 'word(τις)': 6864,
 'word(πιο)': 6230,
 'word(κοινες)': 5323,
 'word(δικαιολογιες)': 4289,
 'word(που)': 6333,
 'word(οδηγει)': 5890,
 'word(σε)': 6504,
 'word(μεγαλες)': 5607,
 'word(συμφορες)': 6701,
 'word(ειναι)': 4452,
 'word(τρεις)': 6919,
 'word(λεξεις)': 5486,
 'word_bigram(μια απο)': 11608,
 'word_bigram(απο τις)': 7730,
 'word_bigram(τις πιο)': 15143,
 'word_bigram(πιο κοινες)': 13160,
 'word_bigram(κοινες δικαιολογιες)': 10893,
 'word_bigram(δικαιολογιες που)': 8533,
 'word_bigram(που οδηγει)': 13527,
 'word_bigram(οδηγει σε)': 12448,
 'word_bigram(σε μεγαλες)': 13942,
 'word_bigram(μεγαλες συμφορες)': 11453,
 'word_bigram(συμφορες ειναι)': 14410,
 'word_bigram(ειναι τρεις)': 8873,
 'word_bigram(τρεις λεξεις)': 15759,
 'char(μ)': 19,
 'char(ι)': 16,
 'char(α)': 8,
 'char(π)': 23,
 'char(ο)': 22,
 'char(τ)': 26,
 'char(σ)': 25,
 'char(κ)': 17,
 'char(ν)': 20,
 'char(ε)': 12,
 'char(δ)': 11,
 'char(λ)': 18,
 'char(γ)': 10,
 'char(υ)': 2

In [12]:
len(count_vect.vocabulary_) # This is the same as the length of each vector.

16241

## 6. Building the classifiers

In [13]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def show_confusion_matrix(cm):
    print('\t         Predicted')
    print('\t        CG       SMG')
    print('\t     -------- --------')
    print('\tCG  | {:^6} | {:^6}'.format(cm[0][0], cm[0][1]))
    print('Actual\t     -------- --------')
    print('\tSMG | {:^6} | {:^6}'.format(cm[1][0], cm[1][1]))

def show_most_informative_features(vectorizer, clf, n=10):
    print("\t\t    CG\t\t\t\t\t\t    SMG\n")
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%17s\t\t\t%.4f\t%17s" % (coef_1, fn_1, coef_2, fn_2))

### 6.1 Multinomial Naive Bayes classifier

In [14]:
clf_multinomialNB = MultinomialNB() # There are no params for MultinomialDB that prevent overfitting, so any overfitting is caused by the small dataset size.
clf_multinomialNB.fit(train_set_vectors, train_set_labels) 

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [15]:
clf_multinomialNB_predictions = clf_multinomialNB.predict(test_set_vectors)

print('\t\t\tPERFORMANCE\n')
print('Accuracy:', round(accuracy_score(test_set_labels, clf_multinomialNB_predictions), 2), '\n')

print(classification_report(test_set_labels, clf_multinomialNB_predictions))

cmatrix = confusion_matrix(test_set_labels, clf_multinomialNB_predictions)
show_confusion_matrix(cmatrix)

			PERFORMANCE

Accuracy: 0.97 

             precision    recall  f1-score   support

         CG       0.99      0.96      0.97       113
        SMG       0.95      0.99      0.97        95

avg / total       0.97      0.97      0.97       208

	         Predicted
	        CG       SMG
	     -------- --------
	CG  |  108   |   5   
Actual	     -------- --------
	SMG |   1    |   94  


In [16]:
show_most_informative_features(count_vect, clf_multinomialNB, n=20)

		    CG						    SMG

	-11.3058	  char_bigram(αα)			-5.3294	          char(α)
	-11.3058	  char_bigram(β_)			-5.3371	          char(ι)
	-11.3058	  char_bigram(δ_)			-5.3396	          char(ο)
	-11.3058	  char_bigram(δκ)			-5.3474	          char(ε)
	-11.3058	  char_bigram(ηα)			-5.3578	          char(τ)
	-11.3058	  char_bigram(ηβ)			-5.3604	          char(σ)
	-11.3058	  char_bigram(ηε)			-5.3656	          char(ν)
	-11.3058	  char_bigram(ηο)			-5.4059	  char_bigram(α_)
	-11.3058	  char_bigram(ηυ)			-5.4309	          char(υ)
	-11.3058	  char_bigram(θθ)			-5.4337	          char(κ)
	-11.3058	  char_bigram(θκ)			-5.4365	          char(ρ)
	-11.3058	  char_bigram(θν)			-5.4450	          char(μ)
	-11.3058	  char_bigram(νχ)			-5.4710	          char(π)
	-11.3058	  char_bigram(οζ)			-5.4739	          char(η)
	-11.3058	  char_bigram(πδ)			-5.4857	  char_bigram(ι_)
	-11.3058	  char_bigram(πκ)			-5.4946	          char(λ)
	-11.3058	  char_bigram(ππ)			-5.5250	  char_bigram(_τ)
	-11.3058	  char_bigram(τ

### 6.2 Linear Support Vector classifier

In [17]:
clf_linearSVC = LinearSVC(max_iter=1500) # n_samples < n_features in training set so the dual param is kept at its default value of True. Default max_iter = 1000
clf_linearSVC.fit(train_set_vectors, train_set_labels) 

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1500,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [18]:
clf_linearSVC_predictions = clf_linearSVC.predict(test_set_vectors)

print('\t\t\tPERFORMANCE\n')
print('Accuracy:', round(accuracy_score(test_set_labels, clf_linearSVC_predictions), 2), '\n')

print(classification_report(test_set_labels, clf_linearSVC_predictions))

cmatrix = confusion_matrix(test_set_labels, clf_linearSVC_predictions)
show_confusion_matrix(cmatrix)

			PERFORMANCE

Accuracy: 0.94 

             precision    recall  f1-score   support

         CG       0.96      0.92      0.94       113
        SMG       0.91      0.96      0.93        95

avg / total       0.94      0.94      0.94       208

	         Predicted
	        CG       SMG
	     -------- --------
	CG  |  104   |   9   
Actual	     -------- --------
	SMG |   4    |   91  


In [19]:
show_most_informative_features(count_vect, clf_linearSVC, n=20)

		    CG						    SMG

	-0.3235	char_trigram(εν_)			0.1709	  char_bigram(_β)
	-0.3091	         word(εν)			0.1686	        word(και)
	-0.2636	char_trigram(_τζ)			0.1492	        word(δεν)
	-0.2424	  char_bigram(τζ)			0.1492	char_trigram(δεν)
	-0.2183	  char_bigram(αμ)			0.1471	          char(λ)
	-0.1940	  char_bigram(θκ)			0.1340	  char_bigram(ικ)
	-0.1782	char_trigram(καμ)			0.1334	          char(ι)
	-0.1665	  char_bigram(λλ)			0.1187	char_trigram(_ολ)
	-0.1660	  char_bigram(_ε)			0.1163	  char_bigram(η_)
	-0.1648	char_trigram(θκι)			0.1161	  char_bigram(_ξ)
	-0.1584	          char(π)			0.1147	  char_bigram(τι)
	-0.1536	char_trigram(_ου)			0.1135	char_trigram(τασ)
	-0.1469	  char_bigram(σ_)			0.1127	  char_bigram(λε)
	-0.1457	  char_bigram(φκ)			0.1086	  char_bigram(λυ)
	-0.1436	char_trigram(σιε)			0.1081	  char_bigram(ολ)
	-0.1407	char_trigram(τι_)			0.1078	char_trigram(ινα)
	-0.1342	char_trigram(ιαι)			0.1033	  char_bigram(ο_)
	-0.1284	char_trigram(_εν)			0.1006	char_trigram(_ξε)
	-0.1

### 6.3 Logistic Regression classifier

In [20]:
clf_logisticRegression = LogisticRegression() # Again, dual = True. Default solver = 'liblinear'. It's recommended for smaller databases. For bigger databases, 'saga' could be used.
clf_logisticRegression.fit(train_set_vectors, train_set_labels) 

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [21]:
clf_logisticRegression_predictions = clf_logisticRegression.predict(test_set_vectors)

print('\t\t\tPERFORMANCE\n')
print('Accuracy:', round(accuracy_score(test_set_labels, clf_logisticRegression_predictions), 2), '\n')

print(classification_report(test_set_labels, clf_logisticRegression_predictions))

cmatrix = confusion_matrix(test_set_labels, clf_logisticRegression_predictions)
show_confusion_matrix(cmatrix)

			PERFORMANCE

Accuracy: 0.94 

             precision    recall  f1-score   support

         CG       0.96      0.93      0.95       113
        SMG       0.92      0.96      0.94        95

avg / total       0.94      0.94      0.94       208

	         Predicted
	        CG       SMG
	     -------- --------
	CG  |  105   |   8   
Actual	     -------- --------
	SMG |   4    |   91  


In [22]:
show_most_informative_features(count_vect, clf_logisticRegression, n=20)

		    CG						    SMG

	-1.2187	char_trigram(εν_)			0.7035	        word(και)
	-1.1447	         word(εν)			0.5346	  char_bigram(_β)
	-0.9638	char_trigram(_τζ)			0.5117	char_trigram(δεν)
	-0.9328	  char_bigram(τζ)			0.5087	        word(δεν)
	-0.8040	  char_bigram(αμ)			0.4960	char_trigram(και)
	-0.6861	  char_bigram(θκ)			0.4360	  char_bigram(ικ)
	-0.6145	char_trigram(καμ)			0.4219	          char(λ)
	-0.6108	  char_bigram(_ε)			0.4078	  char_bigram(λυ)
	-0.5917	char_trigram(_εν)			0.4021	          char(δ)
	-0.5797	char_trigram(θκι)			0.3775	  char_bigram(η_)
	-0.5575	  char_bigram(λλ)			0.3762	char_trigram(ινα)
	-0.5103	  char_bigram(φκ)			0.3685	  char_bigram(να)
	-0.5042	char_trigram(τζι)			0.3667	          char(γ)
	-0.4878	          char(π)			0.3616	char_trigram(τασ)
	-0.4704	char_trigram(σιε)			0.3585	          char(χ)
	-0.4624	char_trigram(ιαι)			0.3582	char_trigram(_ολ)
	-0.4607	char_trigram(τι_)			0.3574	  char_bigram(ολ)
	-0.4538	char_trigram(_ου)			0.3369	  char_bigram(_ξ)
	-0.4

**It seems that the classification algorithm with the best performance is *Multinomial Naive Bayes***.

## 7. Analyzing misclassifications made by the Multinomial Naive Bayes classifier

In [23]:
print('MISCLASSIFICATIONS\n')

misclassificationCount = 0

for i, sent in enumerate(test_set_sents):
    if test_set_labels[i] != clf_multinomialNB_predictions[i]:
        misclassificationCount += 1
        print(f'{misclassificationCount}.', sent, f'(CORRECT = {test_set_labels[i]},', f'PREDICTED = {clf_multinomialNB_predictions[i]})\n')

MISCLASSIFICATIONS

1. μα ελογαριαζαν χωρις τον ξενοδοχο ενα εχω να τους πω τα λεφτα της αδειας και για την κοροιδια και μονο εν να την πληρωσουν ακριβα γιατι ο καθενας εν να βλεπει το συμφερον του τωρα και να παει (CORRECT = CG, PREDICTED = SMG)

2. πολλα πραματα που σημερα θεωρουμεν δεδομενα πιθανον να μην ειναι πολλα συντομα (CORRECT = CG, PREDICTED = SMG)

3. ειμαι σιουρος οτι αν δεν ενοχλησω ατομα με τα μηνυματα περι θεων αγγελων και τυχης ο θεος εν θα με τιμωρησει ουτε εννα χασω το θαυμα που θα εγινετουν σε αντιθετη περιπτωση (CORRECT = CG, PREDICTED = SMG)

4. και μετα λαλουν για τες γυναικες οτι δινουν μικτα σηματα (CORRECT = CG, PREDICTED = SMG)

5. να τα γαμας ολα για το κερδος αλλα να το παιζεις αριστος (CORRECT = SMG, PREDICTED = CG)

6. αλλες χρονιες εβρισκες ολοχρονα εκυνηγουσαν εν ηταν τοσο το δηλητηριο (CORRECT = CG, PREDICTED = SMG)



## 8. Trying the Multinomial Naive Bayes classifier with custom input

First, a more powerful version of the classifier is built by using all the data available:

In [24]:
full_set_sents = [sent[0] for sent in all_sents_labeled]
full_set_labels = [sent[1] for sent in all_sents_labeled]
full_set_vectors = count_vect.fit_transform(full_set_sents)

clf_super_multinomialNB = MultinomialNB()
clf_super_multinomialNB.fit(full_set_vectors, full_set_labels) 

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

Trying 2 custom sentences:

In [25]:
cgSent = 'Η Κύπρος εν που τες πιο όμορφες χώρες.'
smgSent = 'Η Κύπρος είναι από τις πιο όμορφες χώρες.'

cgSent = get_clean_sent_el(cgSent)
smgSent = get_clean_sent_el(smgSent)

test_vec = count_vect.transform([cgSent, smgSent])
clf_super_multinomialNB.predict(test_vec)

array(['CG', 'SMG'], dtype='<U3')