# Building the Classifier

## 1. Loading the corpus

In [1]:
import re
from nltk import sent_tokenize

cg_sents = []
smg_sents = []

def remove_duplicate_punctuation(s):
    return(re.sub(r'(\.|\?|!)\1+', r'\1 ', s))

with open('./Data/cg_twitter.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        cg_sents += sent_tokenize(line)
    
with open('./Data/cg_fb.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        cg_sents += sent_tokenize(line)
    
with open('./Data/cg_other.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        cg_sents += sent_tokenize(line)

with open('./Data/smg_twitter.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        smg_sents += sent_tokenize(line)
    
with open('./Data/smg_fb.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        smg_sents += sent_tokenize(line)
    
with open('./Data/smg_other.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        smg_sents += sent_tokenize(line)

cg_sents[:3]

['Πρασινο αυκουι μες το πασχαλινο ποτήρι που έπιασε ο μιτσης #αισχος 🤣🤣🤣   @HARRIS_APOEL https://t.co/y9X7CmBEa5',
 '@HARRIS_APOEL @pirpoitis @vassrules Καμνουν ανακαινιση στα Περβολια φετος.',
 '@MUFCChristian Ελα συγγενη τζιαι εχουμε νεοτερα π το Νικολη.']

## 2. Cleaning the text

In [2]:
import unicodedata
from string import punctuation
from nltk.tokenize import WhitespaceTokenizer

def strip_accents(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')

def get_clean_sent_el(sentence):
    sentence = re.sub(r'^RT', '', sentence)
    sentence = re.sub(r'\&\w*;', '', sentence)
    sentence = re.sub(r'\@\w*', '', sentence)
    sentence = re.sub(r'\$\w*', '', sentence)
    sentence = re.sub(r'https?:\/\/.*\/\w*', '', sentence)
    sentence = ''.join(c for c in sentence if c <= '\uFFFF')
    sentence = strip_accents(sentence)
    sentence = re.sub(r'#\w*', '', sentence)
    tokens = WhitespaceTokenizer().tokenize(sentence)
    new_tokens = []
    for token in tokens:
        if token == 'ο,τι' or token == 'ό,τι' or token == 'o,ti' or token == 'ó,ti':
            new_tokens.append(token)
        else:
            new_tokens.append(token.translate(str.maketrans({key: None for key in punctuation})))
    sentence =' '.join(new_tokens)
    sentence = sentence.strip(' ')
    return sentence.lower()

cg_sents_clean = []
smg_sents_clean = []

for sent in cg_sents:
    cg_sents_clean.append(get_clean_sent_el(sent))
for sent in smg_sents:
    smg_sents_clean.append(get_clean_sent_el(sent))

cg_sents_clean = list(filter(None, cg_sents_clean))
smg_sents_clean = list(filter(None, smg_sents_clean))
cg_sents_clean[:3]

['πρασινο αυκουι μες το πασχαλινο ποτηρι που επιασε ο μιτσης',
 'καμνουν ανακαινιση στα περβολια φετος',
 'ελα συγγενη τζιαι εχουμε νεοτερα π το νικολη']

## 3. Building the feature extractor

In [3]:
from nltk import ngrams

def get_word_ngrams(tokens, n):
    ngrams_list = []
    ngrams_list.append(list(ngrams(tokens, n)))
    ngrams_flat_tuples = [ngram for ngram_list in ngrams_list for ngram in ngram_list]
    format_string = '%s'
    for i in range(1, n):
        format_string += (' %s')
    ngrams_list_flat = [format_string % ngram_tuple for ngram_tuple in ngrams_flat_tuples]
    return ngrams_list_flat

def get_char_ngrams(word, n):
    ngrams_list = []
    word = re.sub(r'ς', 'σ', word)
    ngrams_list.append(list(ngrams(word, n, pad_left=True, pad_right=True, left_pad_symbol='_', right_pad_symbol='_')))
    
    # Removing redundant ngrams:
    if (n > 2):
        redundant_combinations = n - 2
        ngrams_list = [ngram_list[redundant_combinations : -redundant_combinations] for ngram_list in ngrams_list]
    
    ngrams_flat_tuples = [ngram for ngram_list in ngrams_list for ngram in ngram_list]
    format_string = ''
    for i in range(0, n):
        format_string += ('%s')
    ngrams_list_flat = [format_string % ngram_tuple for ngram_tuple in ngrams_flat_tuples]
    return ngrams_list_flat

In [4]:
# Feature extractor
def get_ngram_features(sent): # The reason I do not use NLTK's everygrams to extract the features quickly is because the behavior of my n-gram extractor is modified to remove redundant n-grams. Also, I need to label word and char n-grams to avoid ambiguity
    sentence_tokens = WhitespaceTokenizer().tokenize(sent)
    
    features = {}
    
    # Word unigrams
    ngrams = get_word_ngrams(sentence_tokens, 1)
    for ngram in ngrams:
        features[f'word({ngram})'] = features.get(f'word({ngram})', 0) + 1 # The second parameter to .get() is a default value if the key doesn't exist.
    
    # Word bigrams
    ngrams = get_word_ngrams(sentence_tokens, 2)
    for ngram in ngrams:
        features[f'word_bigram({ngram})'] = features.get(f'word_bigram({ngram})', 0) + 1
    
    # Char unigrams
    for word in sentence_tokens:
        ngrams = get_char_ngrams(word, 1)
        for ngram in ngrams:
            features[f'char({ngram})'] = features.get(f'char({ngram})', 0) + 1
    
    # Char bigrams
    for word in sentence_tokens:
        ngrams = get_char_ngrams(word, 2)
        for ngram in ngrams:
            features[f'char_bigram({ngram})'] = features.get(f'char_bigram({ngram})', 0) + 1
    
    # Char trigrams
    for word in sentence_tokens:
        ngrams = get_char_ngrams(word, 3)
        for ngram in ngrams:
            features[f'char_trigram({ngram})'] = features.get(f'char_trigram({ngram})', 0) + 1
    
    return features

get_ngram_features('αυτη ειναι η σπαρτη')

{'char(α)': 3,
 'char(ε)': 1,
 'char(η)': 3,
 'char(ι)': 2,
 'char(ν)': 1,
 'char(π)': 1,
 'char(ρ)': 1,
 'char(σ)': 1,
 'char(τ)': 2,
 'char(υ)': 1,
 'char_bigram(_α)': 1,
 'char_bigram(_ε)': 1,
 'char_bigram(_η)': 1,
 'char_bigram(_σ)': 1,
 'char_bigram(αι)': 1,
 'char_bigram(αρ)': 1,
 'char_bigram(αυ)': 1,
 'char_bigram(ει)': 1,
 'char_bigram(η_)': 3,
 'char_bigram(ι_)': 1,
 'char_bigram(ιν)': 1,
 'char_bigram(να)': 1,
 'char_bigram(πα)': 1,
 'char_bigram(ρτ)': 1,
 'char_bigram(σπ)': 1,
 'char_bigram(τη)': 2,
 'char_bigram(υτ)': 1,
 'char_trigram(_αυ)': 1,
 'char_trigram(_ει)': 1,
 'char_trigram(_η_)': 1,
 'char_trigram(_σπ)': 1,
 'char_trigram(αι_)': 1,
 'char_trigram(αρτ)': 1,
 'char_trigram(αυτ)': 1,
 'char_trigram(ειν)': 1,
 'char_trigram(ινα)': 1,
 'char_trigram(ναι)': 1,
 'char_trigram(παρ)': 1,
 'char_trigram(ρτη)': 1,
 'char_trigram(σπα)': 1,
 'char_trigram(τη_)': 2,
 'char_trigram(υτη)': 1,
 'word(αυτη)': 1,
 'word(ειναι)': 1,
 'word(η)': 1,
 'word(σπαρτη)': 1,
 'word_bigra

In [5]:
# from nltk import everygrams

# def sent_process(sent):
#     return [''.join(ng) for ng in everygrams(sent.replace(' ', '_ _'), 1, 4) 
#             if ' ' not in ng and '\n' not in ng and ng != ('_',)]

# sent_process('αυτη ειναι η σπαρτη')

## 4. Creating the training and test sets

In [6]:
import random

all_sents_labeled = ([(sentence, 'CG') for sentence in cg_sents_clean] + [(sentence, 'SMG') for sentence in smg_sents_clean])
random.shuffle(all_sents_labeled)
all_sents_labeled[0]

('μολις ψηφιστηκε στη γαλλια νομος π ποινικοποιει τους πελατες της πορνειας',
 'SMG')

In [8]:
NO_ALL_SENTENCES = len(all_sents_labeled)
NO_TRAIN_SENTENCES = round(NO_ALL_SENTENCES * .8)

train_set = all_sents_labeled[:NO_TRAIN_SENTENCES]
test_set = all_sents_labeled[NO_TRAIN_SENTENCES:]

train_set_sents = [sent[0] for sent in train_set]
train_set_labels = [sent[1] for sent in train_set]
test_set_sents = [sent[0] for sent in test_set]
test_set_labels = [sent[1] for sent in test_set]

print(train_set_sents[0], train_set_labels[0])

μολις ψηφιστηκε στη γαλλια νομος π ποινικοποιει τους πελατες της πορνειας SMG


In [9]:
print('DATASET\t', 'SENTENCES')
print('All\t', NO_ALL_SENTENCES)
print('Training', NO_TRAIN_SENTENCES)
print('Testing\t', NO_ALL_SENTENCES - NO_TRAIN_SENTENCES)

DATASET	 SENTENCES
All	 162
Training 130
Testing	 32


## 5. Vectorization

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(analyzer=get_ngram_features)

train_set_vectors = count_vect.fit_transform(train_set_sents)
test_set_vectors = count_vect.transform(test_set_sents) # Unlike fit_transform(), transform() does not change the count vectorizer's vocabulary so it should be used for the test set.
train_set_vectors

<130x4006 sparse matrix of type '<class 'numpy.int64'>'
	with 16502 stored elements in Compressed Sparse Row format>

In [11]:
from numpy import set_printoptions, nan
set_printoptions(threshold=nan) # Prints whole array. Required because by default an array with thousands of elements wouldn't be printed in full.

train_set_vectors.toarray()[0]

array([0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,

In [12]:
count_vect.vocabulary_ # The numbers are not counts but indices.

{'word(μολις)': 2408,
 'word(ψηφιστηκε)': 2721,
 'word(στη)': 2600,
 'word(γαλλια)': 2070,
 'word(νομος)': 2436,
 'word(π)': 2473,
 'word(ποινικοποιει)': 2526,
 'word(τους)': 2651,
 'word(πελατες)': 2499,
 'word(της)': 2640,
 'word(πορνειας)': 2531,
 'word_bigram(μολις ψηφιστηκε)': 3357,
 'word_bigram(ψηφιστηκε στη)': 3999,
 'word_bigram(στη γαλλια)': 3735,
 'word_bigram(γαλλια νομος)': 2823,
 'word_bigram(νομος π)': 3452,
 'word_bigram(π ποινικοποιει)': 3535,
 'word_bigram(ποινικοποιει τους)': 3590,
 'word_bigram(τους πελατες)': 3929,
 'word_bigram(πελατες της)': 3565,
 'word_bigram(της πορνειας)': 3854,
 'char(μ)': 13,
 'char(ο)': 16,
 'char(λ)': 12,
 'char(ι)': 10,
 'char(σ)': 19,
 'char(ψ)': 24,
 'char(η)': 8,
 'char(φ)': 22,
 'char(τ)': 20,
 'char(κ)': 11,
 'char(ε)': 6,
 'char(γ)': 4,
 'char(α)': 2,
 'char(ν)': 14,
 'char(π)': 17,
 'char(υ)': 21,
 'char(ρ)': 18,
 'char_bigram(_μ)': 37,
 'char_bigram(μο)': 219,
 'char_bigram(ολ)': 252,
 'char_bigram(λι)': 203,
 'char_bigram(ισ)': 

In [13]:
len(count_vect.vocabulary_) # This is the same as the length of each vector.

4006

## 6. Building the classifiers

In [68]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def show_confusion_matrix(cm):
    print('\t         Predicted')
    print('\t        CG       SMG')
    print('\t     -------- --------')
    print('\tCG  | {:^6} | {:^6}'.format(cm[0][0], cm[1][0]))
    print('Actual\t     -------- --------')
    print('\tSMG | {:^6} | {:^6}'.format(cm[1][0], cm[1][1]))

def show_most_informative_features(vectorizer, clf, n=10):
    print("\t\t    CG\t\t\t\t\t\t    SMG\n")
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%17s\t\t\t%.4f\t%17s" % (coef_1, fn_1, coef_2, fn_2))

### 6.1 Multinomial Naive Bayes classifier

In [15]:
clf_multinomialNB = MultinomialNB() # There are no params for MultinomialDB that prevent overfitting, so any overfitting is caused by the small dataset size.
clf_multinomialNB.fit(train_set_vectors, train_set_labels) 

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [17]:
clf_multinomialNB_predictions = clf_multinomialNB.predict(test_set_vectors)

print('\t\t\tPERFORMANCE\n')
print('Accuracy:', round(accuracy_score(test_set_labels, clf_multinomialNB_predictions), 2), '\n')

print(classification_report(test_set_labels, clf_multinomialNB_predictions))

cmatrix = confusion_matrix(test_set_labels, clf_multinomialNB_predictions)
show_confusion_matrix(cmatrix)

			PERFORMANCE

Accuracy: 0.84 

             precision    recall  f1-score   support

         CG       0.84      1.00      0.91        26
        SMG       1.00      0.17      0.29         6

avg / total       0.87      0.84      0.79        32

	         Predicted
	        CG       SMG
	     -------- --------
	CG  |   26   |   5   
Actual	     -------- --------
	SMG |   5    |   1   


In [69]:
show_most_informative_features(count_vect, clf_multinomialNB, n=20)

		    CG						    SMG

	-9.0807	  char_bigram(_ω)			-5.5253	          char(ι)
	-9.0807	  char_bigram(αη)			-5.5253	          char(α)
	-9.0807	  char_bigram(αο)			-5.5543	          char(ο)
	-9.0807	  char_bigram(αω)			-5.5842	  char_bigram(α_)
	-9.0807	  char_bigram(β_)			-5.5842	          char(τ)
	-9.0807	  char_bigram(βη)			-5.6150	          char(σ)
	-9.0807	  char_bigram(βι)			-5.6150	          char(ν)
	-9.0807	  char_bigram(γλ)			-5.6467	          char(υ)
	-9.0807	  char_bigram(γν)			-5.6467	          char(λ)
	-9.0807	  char_bigram(γω)			-5.6467	          char(ε)
	-9.0807	  char_bigram(δρ)			-5.6795	          char(κ)
	-9.0807	  char_bigram(εω)			-5.7134	          char(ρ)
	-9.0807	  char_bigram(ζα)			-5.7134	          char(π)
	-9.0807	  char_bigram(ζη)			-5.7134	          char(μ)
	-9.0807	  char_bigram(ηβ)			-5.7485	          char(η)
	-9.0807	  char_bigram(ηε)			-5.8226	  char_bigram(ου)
	-9.0807	  char_bigram(ηλ)			-5.8226	  char_bigram(ι_)
	-9.0807	  char_bigram(ηξ)			-5.8618	  cha

### 6.2 Linear Support Vector classifier

In [21]:
clf_linearSVC = LinearSVC(max_iter=1500) # n_samples < n_features in training set so the dual param is kept at its default value of True. Default max_iter = 1000
clf_linearSVC.fit(train_set_vectors, train_set_labels) 

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1500,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [23]:
clf_linearSVC_predictions = clf_linearSVC.predict(test_set_vectors)

print('\t\t\tPERFORMANCE\n')
print('Accuracy:', round(accuracy_score(test_set_labels, clf_linearSVC_predictions), 2), '\n')

print(classification_report(test_set_labels, clf_linearSVC_predictions))

cmatrix = confusion_matrix(test_set_labels, clf_linearSVC_predictions)
show_confusion_matrix(cmatrix)

			PERFORMANCE

Accuracy: 0.91 

             precision    recall  f1-score   support

         CG       0.93      0.96      0.94        26
        SMG       0.80      0.67      0.73         6

avg / total       0.90      0.91      0.90        32

	         Predicted
	        CG       SMG
	     -------- --------
	CG  |   25   |   2   
Actual	     -------- --------
	SMG |   2    |   4   


In [70]:
show_most_informative_features(count_vect, clf_linearSVC, n=20)

		    CG						    SMG

	-0.1454	          char(ε)			0.1053	  char_bigram(στ)
	-0.1316	char_trigram(εν_)			0.0994	  char_bigram(_μ)
	-0.1262	         word(εν)			0.0869	char_trigram(_απ)
	-0.1194	char_trigram(καμ)			0.0855	  char_bigram(μι)
	-0.1156	  char_bigram(_ε)			0.0833	  char_bigram(_ν)
	-0.1072	  char_bigram(αμ)			0.0778	  char_bigram(λα)
	-0.1009	  char_bigram(τζ)			0.0777	          char(ι)
	-0.0977	char_trigram(_τζ)			0.0775	char_trigram(σαι)
	-0.0867	  char_bigram(σ_)			0.0738	          char(ξ)
	-0.0862	  char_bigram(μα)			0.0693	char_trigram(το_)
	-0.0774	          char(γ)			0.0688	  char_bigram(τα)
	-0.0751	char_trigram(τζι)			0.0688	  char_bigram(ικ)
	-0.0732	char_trigram(ον_)			0.0673	  char_bigram(ιζ)
	-0.0705	  char_bigram(ασ)			0.0655	char_trigram(στο)
	-0.0696	          char(μ)			0.0648	word_bigram(σαι καλα)
	-0.0670	char_trigram(_το)			0.0648	word_bigram(να σαι)
	-0.0665	char_trigram(_εν)			0.0648	        word(σαι)
	-0.0662	  char_bigram(ε_)			0.0628	  char_bigram(θα)

### 6.3 Logistic Regression classifier

In [72]:
clf_logisticRegression = LogisticRegression() # Again, dual = True. Default solver = 'liblinear'. It's recommended for smaller databases. For bigger databases, 'saga' could be used.
clf_logisticRegression.fit(train_set_vectors, train_set_labels) 

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [73]:
clf_logisticRegression_predictions = clf_logisticRegression.predict(test_set_vectors)

print('\t\t\tPERFORMANCE\n')
print('Accuracy:', round(accuracy_score(test_set_labels, clf_logisticRegression_predictions), 2), '\n')

print(classification_report(test_set_labels, clf_logisticRegression_predictions))

cmatrix = confusion_matrix(test_set_labels, clf_logisticRegression_predictions)
show_confusion_matrix(cmatrix)

			PERFORMANCE

Accuracy: 0.88 

             precision    recall  f1-score   support

         CG       0.89      0.96      0.93        26
        SMG       0.75      0.50      0.60         6

avg / total       0.87      0.88      0.86        32

	         Predicted
	        CG       SMG
	     -------- --------
	CG  |   25   |   3   
Actual	     -------- --------
	SMG |   3    |   3   


**It seems that the classification algorithm with the best performance is *Linear Support Vector Machines***.

## 7. Analyzing misclassifications made by the Linear Support Vector classifier

In [79]:
print('MISCLASSIFICATIONS\n')

misclassificationCount = 0

for i, sent in enumerate(test_set_sents):
    if test_set_labels[i] != clf_linearSVC_predictions[i]:
        misclassificationCount += 1
        print(f'{misclassificationCount}.', sent, f'(CORRECT = {test_set_labels[i]},', f'PREDICTED = {clf_linearSVC_predictions[i]})\n')

MISCLASSIFICATIONS

1. γινεται οταν εχετε περιοδο να πηγαινετε καπου αλλου μακρυα (CORRECT = SMG, PREDICTED = CG)

2. οριστε μας βρηκαν τα χριστουγεννα κι ακομη να χασουμε τα κιλα των διακοπων (CORRECT = SMG, PREDICTED = CG)

3. τον λογο καταλαβαινετε τον νομιζω και σιουρα εν θα επιαννα τον κουμπαρο μου στο λαιμο μου για ενναν βλακα αφου ηταν απο το επαγγελματικο του περιβαλλον και σημερα ξερετε τα ουλλοι οτι οι δουλειες ειναι αφαντες (CORRECT = CG, PREDICTED = SMG)



## 8. Trying the Linear Support Vector classifier with custom input

In [80]:
cgSent = 'Η Κύπρος εν που τες πιο όμορφες χώρες.'
smgSent = 'Η Κύπρος είναι από τις πιο όμορφες χώρες.'

cgSent = get_clean_sent_el(cgSent)
smgSent = get_clean_sent_el(smgSent)

test_vec = count_vect.transform([cgSent, smgSent])
clf_linearSVC.predict(test_vec)

array(['CG', 'SMG'], dtype='<U3')