# Building the Classifier

## 1. Loading the corpus

In [1]:
import re
from nltk import sent_tokenize

cg_sents = []
smg_sents = []

def remove_duplicate_punctuation(s):
    return(re.sub(r'(\.|\?|!)\1+', r'\1 ', s))

with open('./Data/cg_twitter.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        cg_sents += sent_tokenize(line)
    
with open('./Data/cg_fb.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        cg_sents += sent_tokenize(line)
    
with open('./Data/cg_other.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        cg_sents += sent_tokenize(line)

with open('./Data/smg_twitter.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        smg_sents += sent_tokenize(line)
    
with open('./Data/smg_fb.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        smg_sents += sent_tokenize(line)
    
with open('./Data/smg_other.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        smg_sents += sent_tokenize(line)

cg_sents[:3]

['Πρασινο αυκουι μες το πασχαλινο ποτήρι που έπιασε ο μιτσης #αισχος 🤣🤣🤣   @HARRIS_APOEL https://t.co/y9X7CmBEa5',
 '@HARRIS_APOEL @pirpoitis @vassrules Καμνουν ανακαινιση στα Περβολια φετος.',
 '@MUFCChristian Ελα συγγενη τζιαι εχουμε νεοτερα π το Νικολη.']

## 2. Cleaning the text

In [2]:
import unicodedata
from string import punctuation
from nltk.tokenize import WhitespaceTokenizer

def strip_accents(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')

def get_clean_sent_el(sentence):
    sentence = re.sub(r'^RT', '', sentence)
    sentence = re.sub(r'\&\w*;', '', sentence)
    sentence = re.sub(r'\@\w*', '', sentence)
    sentence = re.sub(r'\$\w*', '', sentence)
    sentence = re.sub(r'https?:\/\/.*\/\w*', '', sentence)
    sentence = ''.join(c for c in sentence if c <= '\uFFFF')
    sentence = strip_accents(sentence)
    sentence = re.sub(r'#\w*', '', sentence)
    tokens = WhitespaceTokenizer().tokenize(sentence)
    new_tokens = []
    for token in tokens:
        if token == 'ο,τι' or token == 'ό,τι' or token == 'o,ti' or token == 'ó,ti':
            new_tokens.append(token)
        else:
            new_tokens.append(token.translate(str.maketrans({key: None for key in punctuation})))
    sentence =' '.join(new_tokens)
    sentence = sentence.strip(' ')
    return sentence.lower()

cg_sents_clean = []
smg_sents_clean = []

for sent in cg_sents:
    cg_sents_clean.append(get_clean_sent_el(sent))
for sent in smg_sents:
    smg_sents_clean.append(get_clean_sent_el(sent))

cg_sents_clean = list(filter(None, cg_sents_clean))
smg_sents_clean = list(filter(None, smg_sents_clean))
cg_sents_clean[:3]

['πρασινο αυκουι μες το πασχαλινο ποτηρι που επιασε ο μιτσης',
 'καμνουν ανακαινιση στα περβολια φετος',
 'ελα συγγενη τζιαι εχουμε νεοτερα π το νικολη']

## 3. Building the feature extractor

In [3]:
from nltk import ngrams

def get_word_ngrams(tokens, n):
    ngrams_list = []
    ngrams_list.append(list(ngrams(tokens, n)))
    ngrams_flat_tuples = [ngram for ngram_list in ngrams_list for ngram in ngram_list]
    format_string = '%s'
    for i in range(1, n):
        format_string += (' %s')
    ngrams_list_flat = [format_string % ngram_tuple for ngram_tuple in ngrams_flat_tuples]
    return ngrams_list_flat

def get_char_ngrams(word, n):
    ngrams_list = []
    word = re.sub(r'ς', 'σ', word)
    ngrams_list.append(list(ngrams(word, n, pad_left=True, pad_right=True, left_pad_symbol='_', right_pad_symbol='_')))
    
    # Removing redundant ngrams:
    if (n > 2):
        redundant_combinations = n - 2
        ngrams_list = [ngram_list[redundant_combinations : -redundant_combinations] for ngram_list in ngrams_list]
    
    ngrams_flat_tuples = [ngram for ngram_list in ngrams_list for ngram in ngram_list]
    format_string = ''
    for i in range(0, n):
        format_string += ('%s')
    ngrams_list_flat = [format_string % ngram_tuple for ngram_tuple in ngrams_flat_tuples]
    return ngrams_list_flat

In [4]:
# Feature extractor
def get_ngram_features(sent): # The reason I do not use NLTK's everygrams to extract the features quickly is because the behavior of my n-gram extractor is modified to remove redundant n-grams. Also, I need to label word and char n-grams to avoid ambiguity
    sentence_tokens = WhitespaceTokenizer().tokenize(sent)
    
    features = {}
    
    # Word unigrams
    ngrams = get_word_ngrams(sentence_tokens, 1)
    for ngram in ngrams:
        features[f'word({ngram})'] = features.get(f'word({ngram})', 0) + 1 # The second parameter to .get() is a default value if the key doesn't exist.
    
    # Word bigrams
    ngrams = get_word_ngrams(sentence_tokens, 2)
    for ngram in ngrams:
        features[f'word_bigram({ngram})'] = features.get(f'word_bigram({ngram})', 0) + 1
    
    # Word trigrams
    ngrams = get_word_ngrams(sentence_tokens, 3)
    for ngram in ngrams:
        features[f'word_trigram({ngram})'] = features.get(f'word_trigram({ngram})', 0) + 1
    
    # Word quadrigrams
    ngrams = get_word_ngrams(sentence_tokens, 4)
    for ngram in ngrams:
        features[f'word_quadrigram({ngram})'] = features.get(f'word_quadrigram({ngram})', 0) + 1
    
    # Char unigrams
    for word in sentence_tokens:
        ngrams = get_char_ngrams(word, 1)
        for ngram in ngrams:
            features[f'char({ngram})'] = features.get(f'char({ngram})', 0) + 1
    
    # Char bigrams
    for word in sentence_tokens:
        ngrams = get_char_ngrams(word, 2)
        for ngram in ngrams:
            features[f'char_bigram({ngram})'] = features.get(f'char_bigram({ngram})', 0) + 1
    
    # Char trigrams
    for word in sentence_tokens:
        ngrams = get_char_ngrams(word, 3)
        for ngram in ngrams:
            features[f'char_trigram({ngram})'] = features.get(f'char_trigram({ngram})', 0) + 1
    
    # Char quadrigrams
    for word in sentence_tokens:
        ngrams = get_char_ngrams(word, 4)
        for ngram in ngrams:
            features[f'char_quadrigram({ngram})'] = features.get(f'char_quadrigram({ngram})', 0) + 1
    
    return features

get_ngram_features('αυτη ειναι η σπαρτη')

{'char(α)': 3,
 'char(ε)': 1,
 'char(η)': 3,
 'char(ι)': 2,
 'char(ν)': 1,
 'char(π)': 1,
 'char(ρ)': 1,
 'char(σ)': 1,
 'char(τ)': 2,
 'char(υ)': 1,
 'char_bigram(_α)': 1,
 'char_bigram(_ε)': 1,
 'char_bigram(_η)': 1,
 'char_bigram(_σ)': 1,
 'char_bigram(αι)': 1,
 'char_bigram(αρ)': 1,
 'char_bigram(αυ)': 1,
 'char_bigram(ει)': 1,
 'char_bigram(η_)': 3,
 'char_bigram(ι_)': 1,
 'char_bigram(ιν)': 1,
 'char_bigram(να)': 1,
 'char_bigram(πα)': 1,
 'char_bigram(ρτ)': 1,
 'char_bigram(σπ)': 1,
 'char_bigram(τη)': 2,
 'char_bigram(υτ)': 1,
 'char_quadrigram(_αυτ)': 1,
 'char_quadrigram(_ειν)': 1,
 'char_quadrigram(_σπα)': 1,
 'char_quadrigram(αρτη)': 1,
 'char_quadrigram(αυτη)': 1,
 'char_quadrigram(εινα)': 1,
 'char_quadrigram(ιναι)': 1,
 'char_quadrigram(ναι_)': 1,
 'char_quadrigram(παρτ)': 1,
 'char_quadrigram(ρτη_)': 1,
 'char_quadrigram(σπαρ)': 1,
 'char_quadrigram(υτη_)': 1,
 'char_trigram(_αυ)': 1,
 'char_trigram(_ει)': 1,
 'char_trigram(_η_)': 1,
 'char_trigram(_σπ)': 1,
 'char_trig

In [5]:
# from nltk import everygrams

# def sent_process(sent):
#     return [''.join(ng) for ng in everygrams(sent.replace(' ', '_ _'), 1, 4) 
#             if ' ' not in ng and '\n' not in ng and ng != ('_',)]

# sent_process('αυτη ειναι η σπαρτη')

## 4. Creating the training and test sets

In [6]:
import random

all_sents_labeled = ([(sentence, 'cg') for sentence in cg_sents_clean] + [(sentence, 'smg') for sentence in smg_sents_clean])
random.shuffle(all_sents_labeled)
all_sents_labeled[0]

('αηστους τσιαμε εγιω σιερομαι που εν ετσι μαππουροι τσιλλιαραες', 'cg')

In [7]:
NO_ALL_SENTENCES = len(all_sents_labeled)
NO_TRAIN_SENTENCES = round(NO_ALL_SENTENCES * .8)

train_set = all_sents_labeled[:NO_TRAIN_SENTENCES]
test_set = all_sents_labeled[NO_TRAIN_SENTENCES:]

train_set_sents = [sent[0] for sent in train_set]
train_set_labels = [sent[1] for sent in train_set]
test_set_sents = [sent[0] for sent in test_set]
test_set_labels = [sent[1] for sent in test_set]

print(train_set_sents[0], train_set_labels[0])

αηστους τσιαμε εγιω σιερομαι που εν ετσι μαππουροι τσιλλιαραες cg


In [8]:
print('DATASET\t', 'SENTENCES')
print('All\t', NO_ALL_SENTENCES)
print('Training', NO_TRAIN_SENTENCES)
print('Testing\t', NO_ALL_SENTENCES - NO_TRAIN_SENTENCES)

DATASET	 SENTENCES
All	 162
Training 130
Testing	 32


## 5. Vectorization

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(analyzer=get_ngram_features)

train_set_vectors = count_vect.fit_transform(train_set_sents)
test_set_vectors = count_vect.transform(test_set_sents) # Unlike fit_transform(), transform() does not change the count vectorizer's vocabulary so it should be used for the test set.
train_set_vectors

  """


<130x9299 sparse matrix of type '<class 'numpy.int64'>'
	with 24990 stored elements in Compressed Sparse Row format>

In [10]:
import numpy
numpy.set_printoptions(threshold=numpy.nan)
train_set_vectors.toarray()[0]

array([0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [11]:
count_vect.vocabulary_ # The numbers are not counts but indices

{'word(αηστους)': 4718,
 'word(τσιαμε)': 5440,
 'word(εγιω)': 4863,
 'word(σιερομαι)': 5354,
 'word(που)': 5315,
 'word(εν)': 4908,
 'word(ετσι)': 4957,
 'word(μαππουροι)': 5131,
 'word(τσιλλιαραες)': 5441,
 'word_bigram(αηστους τσιαμε)': 5502,
 'word_bigram(τσιαμε εγιω)': 6795,
 'word_bigram(εγιω σιερομαι)': 5680,
 'word_bigram(σιερομαι που)': 6545,
 'word_bigram(που εν)': 6461,
 'word_bigram(εν ετσι)': 5747,
 'word_bigram(ετσι μαππουροι)': 5838,
 'word_bigram(μαππουροι τσιλλιαραες)': 6091,
 'word_trigram(αηστους τσιαμε εγιω)': 8015,
 'word_trigram(τσιαμε εγιω σιερομαι)': 9246,
 'word_trigram(εγιω σιερομαι που)': 8190,
 'word_trigram(σιερομαι που εν)': 9019,
 'word_trigram(που εν ετσι)': 8935,
 'word_trigram(εν ετσι μαππουροι)': 8250,
 'word_trigram(ετσι μαππουροι τσιλλιαραες)': 8342,
 'word_quadrigram(αηστους τσιαμε εγιω σιερομαι)': 6855,
 'word_quadrigram(τσιαμε εγιω σιερομαι που)': 7965,
 'word_quadrigram(εγιω σιερομαι που εν)': 7019,
 'word_quadrigram(σιερομαι που εν ετσι)': 7761,

In [12]:
len(count_vect.vocabulary_)

9299

## 6. Building the classifiers

In [13]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import *

def show_most_informative_features(vectorizer, clf, n=10):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))

### 6.1 Multinomial Naive Bayes classifier

In [14]:
clf_multinomialNB = MultinomialNB() # Thereare no params for MultinomialDB that prevent overfitting, so any overfitting is caused by the small dataset size.
clf_multinomialNB.fit(train_set_vectors, train_set_labels) 

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [15]:
print('PERFORMANCE')
print('Accuracy:', round(accuracy_score(test_set_labels, clf_multinomialNB.predict(test_set_vectors)), 2))

PERFORMANCE
Accuracy: 0.84


In [16]:
print("\tCG\t\t\t\tSMG\n")
show_most_informative_features(count_vect, clf_multinomialNB, n=20)

	CG				SMG

	-9.7298	char_bigram(αη)		-6.1463	char(ι)        
	-9.7298	char_bigram(αο)		-6.1463	char(α)        
	-9.7298	char_bigram(αω)		-6.2034	char_bigram(α_)
	-9.7298	char_bigram(β_)		-6.2034	char(τ)        
	-9.7298	char_bigram(βη)		-6.2034	char(ο)        
	-9.7298	char_bigram(βι)		-6.2333	char(σ)        
	-9.7298	char_bigram(γλ)		-6.2333	char(ν)        
	-9.7298	char_bigram(γω)		-6.2333	char(ε)        
	-9.7298	char_bigram(εω)		-6.2641	char(λ)        
	-9.7298	char_bigram(ζα)		-6.2958	char(υ)        
	-9.7298	char_bigram(ζη)		-6.2958	char(μ)        
	-9.7298	char_bigram(ζι)		-6.2958	char(κ)        
	-9.7298	char_bigram(ηβ)		-6.2958	char(η)        
	-9.7298	char_bigram(ηε)		-6.3286	char(ρ)        
	-9.7298	char_bigram(ηξ)		-6.3286	char(π)        
	-9.7298	char_bigram(ηρ)		-6.3976	char_bigram(ι_)
	-9.7298	char_bigram(ηχ)		-6.4717	char_bigram(ου)
	-9.7298	char_bigram(ηψ)		-6.5109	char_bigram(_μ)
	-9.7298	char_bigram(θκ)		-6.5109	char_bigram(_κ)
	-9.7298	char_bigram(θρ)		-6.5109	char

### 6.2 Linear Support Vector classifier

### 6.3 Logistic Regression classifier

## 7. Analyzing misclassifications made by XYZ

## 8. Trying the XYZ classifier with custom input

In [41]:
cgSent = 'Η Κύπρος εν που τες πιο όμορφες χώρες.'
smgSent = 'Η Κύπρος είναι από τις πιο όμορφες χώρες.'

cgSent = get_clean_sent_el(cgSent)
smgSent = get_clean_sent_el(smgSent)

test_vec = count_vect.transform([cgSent, smgSent])
clf_XYZ.predict(test_vec)

NameError: name 'XYZclf' is not defined