# Building the Classifier

## 1. Loading the corpus

In [87]:
import re
from nltk import sent_tokenize

cg_sents = []
smg_sents = []

def remove_duplicate_punctuation(s):
    return(re.sub(r'(\.|\?|!|;)\1+', r'\1 ', s))

with open('./Data/cg_twitter.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        cg_sents += sent_tokenize(line)
    
with open('./Data/cg_fb.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        cg_sents += sent_tokenize(line)
    
with open('./Data/cg_other.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        cg_sents += sent_tokenize(line)

with open('./Data/smg_twitter.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        smg_sents += sent_tokenize(line)
    
with open('./Data/smg_fb.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        smg_sents += sent_tokenize(line)
    
with open('./Data/smg_other.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        smg_sents += sent_tokenize(line)

cg_sents[:3]

['Πρασινο αυκουι μες το πασχαλινο ποτήρι που έπιασε ο μιτσης #αισχος 🤣🤣🤣   @HARRIS_APOEL https://t.co/y9X7CmBEa5',
 '@HARRIS_APOEL @pirpoitis @vassrules Καμνουν πολλα ανακαινιση στα Περβολια .',
 '@MUFCChristian Ελα συγγενη τζιαι εχουμε νεοτερα π το Νικολη.']

## 2. Cleaning the text

In [113]:
import unicodedata
from string import punctuation
from nltk.tokenize import WhitespaceTokenizer

punctuation += '΄…“”–—―'

def strip_accents(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')

def get_clean_sent_el(sentence):
    sentence = re.sub(r'^RT', '', sentence)
    sentence = re.sub(r'\&\w*;', '', sentence)
    sentence = re.sub(r'\@\w*', '', sentence)
    sentence = re.sub(r'\$\w*', '', sentence)
    sentence = re.sub(r'https?:\/\/.*\/\w*', '', sentence)
    sentence = ''.join(c for c in sentence if c <= '\uFFFF')
    sentence = strip_accents(sentence)
    sentence = re.sub(r'#\w*', '', sentence)
    sentence = sentence.lower()
    tokens = WhitespaceTokenizer().tokenize(sentence)
    new_tokens = []
    for token in tokens:
        if token == 'ο,τι' or token == 'ό,τι' or token == 'o,ti' or token == 'ó,ti':
            new_tokens.append(token.replace(' ', ''))
        else:
            token = re.sub(r'(?<=[.,!\?;\'΄])(?=[^\s])', r' ', token)
            new_token = token.translate(str.maketrans({key: None for key in punctuation}))
            if (new_token != ''):
                new_tokens.append(new_token)
    sentence =' '.join(new_tokens)
    sentence = sentence.strip(' ')
    return sentence

cg_sents_clean = []
smg_sents_clean = []

for sent in cg_sents:
    cg_sents_clean.append(get_clean_sent_el(sent))
for sent in smg_sents:
    smg_sents_clean.append(get_clean_sent_el(sent))

cg_sents_clean = list(filter(None, cg_sents_clean))
smg_sents_clean = list(filter(None, smg_sents_clean))
cg_sents_clean[:3]

['πρασινο αυκουι μες το πασχαλινο ποτηρι που επιασε ο μιτσης',
 'καμνουν πολλα ανακαινιση στα περβολια',
 'ελα συγγενη τζιαι εχουμε νεοτερα π το νικολη']

## 3. Building the feature extractor

In [90]:
from nltk import ngrams

def get_word_ngrams(tokens, n):
    ngrams_list = []
    ngrams_list.append(list(ngrams(tokens, n)))
    ngrams_flat_tuples = [ngram for ngram_list in ngrams_list for ngram in ngram_list]
    format_string = '%s'
    for i in range(1, n):
        format_string += (' %s')
    ngrams_list_flat = [format_string % ngram_tuple for ngram_tuple in ngrams_flat_tuples]
    return ngrams_list_flat

def get_char_ngrams(word, n):
    ngrams_list = []
    word = re.sub(r'ς', 'σ', word)
    ngrams_list.append(list(ngrams(word, n, pad_left=True, pad_right=True, left_pad_symbol='_', right_pad_symbol='_')))
    
    # Removing redundant ngrams:
    if (n > 2):
        redundant_combinations = n - 2
        ngrams_list = [ngram_list[redundant_combinations : -redundant_combinations] for ngram_list in ngrams_list]
    
    ngrams_flat_tuples = [ngram for ngram_list in ngrams_list for ngram in ngram_list]
    format_string = ''
    for i in range(0, n):
        format_string += ('%s')
    ngrams_list_flat = [format_string % ngram_tuple for ngram_tuple in ngrams_flat_tuples]
    return ngrams_list_flat

In [91]:
# Feature extractor
def get_ngram_features(sent): # The reason I do not use NLTK's everygrams to extract the features quickly is because the behavior of my n-gram extractor is modified to remove redundant n-grams. Also, I need to label word and char n-grams to avoid ambiguity
    sentence_tokens = WhitespaceTokenizer().tokenize(sent)
    
    features = {}
    
    # Word unigrams
    ngrams = get_word_ngrams(sentence_tokens, 1)
    for ngram in ngrams:
        features[f'word({ngram})'] = features.get(f'word({ngram})', 0) + 1 # The second parameter to .get() is a default value if the key doesn't exist.
    
    # Word bigrams
    ngrams = get_word_ngrams(sentence_tokens, 2)
    for ngram in ngrams:
        features[f'word_bigram({ngram})'] = features.get(f'word_bigram({ngram})', 0) + 1
    
    # Char unigrams
    for word in sentence_tokens:
        ngrams = get_char_ngrams(word, 1)
        for ngram in ngrams:
            features[f'char({ngram})'] = features.get(f'char({ngram})', 0) + 1
    
    # Char bigrams
    for word in sentence_tokens:
        ngrams = get_char_ngrams(word, 2)
        for ngram in ngrams:
            features[f'char_bigram({ngram})'] = features.get(f'char_bigram({ngram})', 0) + 1
    
    # Char trigrams
    for word in sentence_tokens:
        ngrams = get_char_ngrams(word, 3)
        for ngram in ngrams:
            features[f'char_trigram({ngram})'] = features.get(f'char_trigram({ngram})', 0) + 1
    
    return features

get_ngram_features('αυτη ειναι η σπαρτη')

{'char(α)': 3,
 'char(ε)': 1,
 'char(η)': 3,
 'char(ι)': 2,
 'char(ν)': 1,
 'char(π)': 1,
 'char(ρ)': 1,
 'char(σ)': 1,
 'char(τ)': 2,
 'char(υ)': 1,
 'char_bigram(_α)': 1,
 'char_bigram(_ε)': 1,
 'char_bigram(_η)': 1,
 'char_bigram(_σ)': 1,
 'char_bigram(αι)': 1,
 'char_bigram(αρ)': 1,
 'char_bigram(αυ)': 1,
 'char_bigram(ει)': 1,
 'char_bigram(η_)': 3,
 'char_bigram(ι_)': 1,
 'char_bigram(ιν)': 1,
 'char_bigram(να)': 1,
 'char_bigram(πα)': 1,
 'char_bigram(ρτ)': 1,
 'char_bigram(σπ)': 1,
 'char_bigram(τη)': 2,
 'char_bigram(υτ)': 1,
 'char_trigram(_αυ)': 1,
 'char_trigram(_ει)': 1,
 'char_trigram(_η_)': 1,
 'char_trigram(_σπ)': 1,
 'char_trigram(αι_)': 1,
 'char_trigram(αρτ)': 1,
 'char_trigram(αυτ)': 1,
 'char_trigram(ειν)': 1,
 'char_trigram(ινα)': 1,
 'char_trigram(ναι)': 1,
 'char_trigram(παρ)': 1,
 'char_trigram(ρτη)': 1,
 'char_trigram(σπα)': 1,
 'char_trigram(τη_)': 2,
 'char_trigram(υτη)': 1,
 'word(αυτη)': 1,
 'word(ειναι)': 1,
 'word(η)': 1,
 'word(σπαρτη)': 1,
 'word_bigra

In [92]:
# from nltk import everygrams

# def sent_process(sent):
#     return [''.join(ng) for ng in everygrams(sent.replace(' ', '_ _'), 1, 4) 
#             if ' ' not in ng and '\n' not in ng and ng != ('_',)]

# sent_process('αυτη ειναι η σπαρτη')

## 4. Creating the training and test sets

In [93]:
import random

all_sents_labeled = ([(sentence, 'CG') for sentence in cg_sents_clean] + [(sentence, 'SMG') for sentence in smg_sents_clean])
random.shuffle(all_sents_labeled)
all_sents_labeled[0]

('και δεν θες να το χασεις με τιποτα', 'SMG')

In [94]:
NO_ALL_SENTENCES = len(all_sents_labeled)
NO_TRAIN_SENTENCES = round(NO_ALL_SENTENCES * .8)

train_set = all_sents_labeled[:NO_TRAIN_SENTENCES]
test_set = all_sents_labeled[NO_TRAIN_SENTENCES:]

train_set_sents = [sent[0] for sent in train_set]
train_set_labels = [sent[1] for sent in train_set]
test_set_sents = [sent[0] for sent in test_set]
test_set_labels = [sent[1] for sent in test_set]

print(train_set_sents[0], train_set_labels[0])

και δεν θες να το χασεις με τιποτα SMG


In [95]:
print('DATASET\t', 'SENTENCES')
print('All\t', NO_ALL_SENTENCES)
print('Training', NO_TRAIN_SENTENCES)
print('Testing\t', NO_ALL_SENTENCES - NO_TRAIN_SENTENCES)

DATASET	 SENTENCES
All	 1605
Training 1284
Testing	 321


## 5. Vectorization

In [96]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(analyzer=get_ngram_features)

train_set_vectors = count_vect.fit_transform(train_set_sents)
test_set_vectors = count_vect.transform(test_set_sents) # Unlike fit_transform(), transform() does not change the count vectorizer's vocabulary so it should be used for the test set.
train_set_vectors

<1284x22024 sparse matrix of type '<class 'numpy.int64'>'
	with 178594 stored elements in Compressed Sparse Row format>

In [97]:
from numpy import set_printoptions, nan
set_printoptions(threshold=nan) # Prints whole array. Required because by default an array with thousands of elements wouldn't be printed in full.

train_set_vectors.toarray()[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [98]:
count_vect.vocabulary_ # The numbers are not counts but indices.

{'word(και)': 7913,
 'word(δεν)': 6828,
 'word(θες)': 7784,
 'word(να)': 8834,
 'word(το)': 10265,
 'word(χασεις)': 10639,
 'word(με)': 8584,
 'word(τιποτα)': 10256,
 'word_bigram(και δεν)': 14696,
 'word_bigram(δεν θες)': 12507,
 'word_bigram(θες να)': 14479,
 'word_bigram(να το)': 16765,
 'word_bigram(το χασεις)': 20953,
 'word_bigram(χασεις με)': 21827,
 'word_bigram(με τιποτα)': 15973,
 'char(κ)': 50,
 'char(α)': 41,
 'char(ι)': 49,
 'char(δ)': 44,
 'char(ε)': 45,
 'char(ν)': 53,
 'char(θ)': 48,
 'char(σ)': 58,
 'char(τ)': 59,
 'char(ο)': 55,
 'char(χ)': 62,
 'char(μ)': 52,
 'char(π)': 56,
 'char_bigram(_κ)': 189,
 'char_bigram(κα)': 711,
 'char_bigram(αι)': 544,
 'char_bigram(ι_)': 681,
 'char_bigram(_δ)': 183,
 'char_bigram(δε)': 593,
 'char_bigram(εν)': 618,
 'char_bigram(ν_)': 767,
 'char_bigram(_θ)': 187,
 'char_bigram(θε)': 669,
 'char_bigram(εσ)': 623,
 'char_bigram(σ_)': 863,
 'char_bigram(_ν)': 192,
 'char_bigram(να)': 769,
 'char_bigram(α_)': 534,
 'char_bigram(_τ)': 198,

In [99]:
len(count_vect.vocabulary_) # This is the same as the length of each vector.

22024

## 6. Building the classifiers

In [100]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def show_confusion_matrix(cm):
    print('\t         Predicted')
    print('\t        CG       SMG')
    print('\t     -------- --------')
    print('\tCG  | {:^6} | {:^6}'.format(cm[0][0], cm[1][0]))
    print('Actual\t     -------- --------')
    print('\tSMG | {:^6} | {:^6}'.format(cm[1][0], cm[1][1]))

def show_most_informative_features(vectorizer, clf, n=10):
    print("\t\t    CG\t\t\t\t\t\t    SMG\n")
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%17s\t\t\t%.4f\t%17s" % (coef_1, fn_1, coef_2, fn_2))

### 6.1 Multinomial Naive Bayes classifier

In [101]:
clf_multinomialNB = MultinomialNB() # There are no params for MultinomialDB that prevent overfitting, so any overfitting is caused by the small dataset size.
clf_multinomialNB.fit(train_set_vectors, train_set_labels) 

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [102]:
clf_multinomialNB_predictions = clf_multinomialNB.predict(test_set_vectors)

print('\t\t\tPERFORMANCE\n')
print('Accuracy:', round(accuracy_score(test_set_labels, clf_multinomialNB_predictions), 2), '\n')

print(classification_report(test_set_labels, clf_multinomialNB_predictions))

cmatrix = confusion_matrix(test_set_labels, clf_multinomialNB_predictions)
show_confusion_matrix(cmatrix)

			PERFORMANCE

Accuracy: 0.94 

             precision    recall  f1-score   support

         CG       0.94      0.90      0.92       121
        SMG       0.94      0.96      0.95       200

avg / total       0.94      0.94      0.94       321

	         Predicted
	        CG       SMG
	     -------- --------
	CG  |  109   |   7   
Actual	     -------- --------
	SMG |   7    |  193  


In [103]:
show_most_informative_features(count_vect, clf_multinomialNB, n=20)

		    CG						    SMG

	-11.8585	          char(΄)			-5.1726	          char(α)
	-11.8585	  char_bigram(eμ)			-5.1928	          char(ο)
	-11.8585	  char_bigram(eξ)			-5.1954	          char(ι)
	-11.8585	  char_bigram(yr)			-5.1954	          char(ε)
	-11.8585	  char_bigram(zω)			-5.2018	          char(τ)
	-11.8585	  char_bigram(΄_)			-5.2304	          char(σ)
	-11.8585	  char_bigram(΄ο)			-5.2545	          char(ν)
	-11.8585	  char_bigram(δκ)			-5.2848	  char_bigram(_τ)
	-11.8585	  char_bigram(ζ΄)			-5.3160	          char(ρ)
	-11.8585	  char_bigram(ηα)			-5.3262	  char_bigram(ο_)
	-11.8585	  char_bigram(ηβ)			-5.3349	  char_bigram(το)
	-11.8585	  char_bigram(ηζ)			-5.3379	          char(κ)
	-11.8585	  char_bigram(ηη)			-5.3662	          char(π)
	-11.8585	  char_bigram(ηο)			-5.3769	          char(υ)
	-11.8585	  char_bigram(θθ)			-5.3815	          char(μ)
	-11.8585	  char_bigram(θκ)			-5.3831	          char(η)
	-11.8585	  char_bigram(ι΄)			-5.3892	          char(λ)
	-11.8585	  char_bigram(ι

### 6.2 Linear Support Vector classifier

In [17]:
clf_linearSVC = LinearSVC(max_iter=1500) # n_samples < n_features in training set so the dual param is kept at its default value of True. Default max_iter = 1000
clf_linearSVC.fit(train_set_vectors, train_set_labels) 

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1500,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [18]:
clf_linearSVC_predictions = clf_linearSVC.predict(test_set_vectors)

print('\t\t\tPERFORMANCE\n')
print('Accuracy:', round(accuracy_score(test_set_labels, clf_linearSVC_predictions), 2), '\n')

print(classification_report(test_set_labels, clf_linearSVC_predictions))

cmatrix = confusion_matrix(test_set_labels, clf_linearSVC_predictions)
show_confusion_matrix(cmatrix)

			PERFORMANCE

Accuracy: 0.92 

             precision    recall  f1-score   support

         CG       0.95      0.96      0.96       122
        SMG       0.69      0.65      0.67        17

avg / total       0.92      0.92      0.92       139

	         Predicted
	        CG       SMG
	     -------- --------
	CG  |  117   |   6   
Actual	     -------- --------
	SMG |   6    |   11  


In [19]:
show_most_informative_features(count_vect, clf_linearSVC, n=20)

		    CG						    SMG

	-0.2330	char_trigram(εν_)			0.1491	  char_bigram(_μ)
	-0.1910	         word(εν)			0.1263	char_trigram(_στ)
	-0.1718	  char_bigram(_ε)			0.1258	  char_bigram(_β)
	-0.1675	  char_bigram(αμ)			0.1208	  char_bigram(στ)
	-0.1668	  char_bigram(σι)			0.1061	  char_bigram(λυ)
	-0.1394	  char_bigram(τζ)			0.1051	        word(ολα)
	-0.1351	char_trigram(_τζ)			0.1050	char_trigram(σαι)
	-0.1336	          char(ο)			0.1027	char_trigram(_λε)
	-0.1258	char_trigram(καμ)			0.0978	          char(δ)
	-0.1165	          char(ε)			0.0962	char_trigram(ιστ)
	-0.1148	  char_bigram(ω_)			0.0936	char_trigram(ετε)
	-0.1127	  char_bigram(θκ)			0.0934	  char_bigram(_α)
	-0.1101	char_trigram(_λα)			0.0934	char_trigram(_βρ)
	-0.1073	char_trigram(_εν)			0.0908	  char_bigram(ισ)
	-0.1033	  char_bigram(εν)			0.0895	char_trigram(τε_)
	-0.0992	char_trigram(_τε)			0.0889	char_trigram(ινα)
	-0.0947	          char(τ)			0.0883	char_trigram(_πρ)
	-0.0925	      word(τζιαι)			0.0883	  char_bigram(να)
	-0.0

### 6.3 Logistic Regression classifier

In [20]:
clf_logisticRegression = LogisticRegression() # Again, dual = True. Default solver = 'liblinear'. It's recommended for smaller databases. For bigger databases, 'saga' could be used.
clf_logisticRegression.fit(train_set_vectors, train_set_labels) 

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [21]:
clf_logisticRegression_predictions = clf_logisticRegression.predict(test_set_vectors)

print('\t\t\tPERFORMANCE\n')
print('Accuracy:', round(accuracy_score(test_set_labels, clf_logisticRegression_predictions), 2), '\n')

print(classification_report(test_set_labels, clf_logisticRegression_predictions))

cmatrix = confusion_matrix(test_set_labels, clf_logisticRegression_predictions)
show_confusion_matrix(cmatrix)

			PERFORMANCE

Accuracy: 0.94 

             precision    recall  f1-score   support

         CG       0.95      0.98      0.96       122
        SMG       0.79      0.65      0.71        17

avg / total       0.93      0.94      0.93       139

	         Predicted
	        CG       SMG
	     -------- --------
	CG  |  119   |   6   
Actual	     -------- --------
	SMG |   6    |   11  


**It seems that the classification algorithm with the best performance is *Linear Support Vector Machines***.

## 7. Analyzing misclassifications made by the Linear Support Vector classifier

In [22]:
print('MISCLASSIFICATIONS\n')

misclassificationCount = 0

for i, sent in enumerate(test_set_sents):
    if test_set_labels[i] != clf_linearSVC_predictions[i]:
        misclassificationCount += 1
        print(f'{misclassificationCount}.', sent, f'(CORRECT = {test_set_labels[i]},', f'PREDICTED = {clf_linearSVC_predictions[i]})\n')

MISCLASSIFICATIONS

1. δηλαδη εσας τζιαι βιλλο να σας προσφερουν για λυση παλι ναι θα πειτε ατε (CORRECT = CG, PREDICTED = SMG)

2. εκατσε ο υπουργος με τεσσεροις βλακες που καμνουν πως καταλαβουν που κυνηγη και ο υπουργος εκαμνε πως ηξερε που κυνηγη πως καταλαβει τουτοι ουλοι πληρωνοντε με ενα σορο λεφτα που μπορουν να ζησουν πολλες οικογενεις τζε ηβραν την λυση για να σωσουν την κατασταση να κοψουμε τεσσερις εξορμησεις και ελυθηκε το προβλημα (CORRECT = CG, PREDICTED = SMG)

3. ενας δικος μου κυπραιος πρεπει να το διαβασει οπωσδηποτε θα τρελαθει (CORRECT = SMG, PREDICTED = CG)

4. αισθητη η βελτιωση που παρουσιαζει ο αποελ (CORRECT = SMG, PREDICTED = CG)

5. ποια ειναι η καλυτερη μπαλα που χρησιμοποιηθηκε σε μουντιαλ (CORRECT = SMG, PREDICTED = CG)

6. αυτο που ηθελα να θιξω χωρις να συμφωνησω η να διαφωνησω με τους αντρες που θωρουν πορνο ηταν το ποσο πολυ πονο προκαλουν στις συντροφους τους τζιαι επροσπαθησα να το δω λλιο πιο σφαιρικα το θεμα (CORRECT = CG, PREDICTED = SMG)

7. οι 

## 8. Trying the Linear Support Vector classifier with custom input

In [24]:
cgSent = 'Η Κύπρος εν που τες πιο όμορφες χώρες.'
smgSent = 'Η Κύπρος είναι από τις πιο όμορφες χώρες.'

cgSent = get_clean_sent_el(cgSent)
smgSent = get_clean_sent_el(smgSent)

test_vec = count_vect.transform([cgSent, smgSent])
clf_linearSVC.predict(test_vec)

array(['CG', 'SMG'], dtype='<U3')