# Building the Classifier

## 1. Loading the sentences

In [1]:
import re
from nltk import sent_tokenize


def remove_duplicate_end_punctuation(s):
    return re.sub(r'([\.\?!;Õæ])\1+', r'\1 ', s)


def fix_new_sentence_spacing(s):
    return re.sub(r'([a-zŒ±-œâŒØœäŒêœåŒ¨Œ≠œçœãŒ∞ŒÆœé])([\.\?!;Õæ‚Ä¶])([A-ZŒë-Œ©ŒÜŒàŒäŒåŒéŒèŒâ])', r'\1\2 \3', s)


def substitute_greek_question_marks(s):
    return s.replace(';', '?').replace('Õæ', '?')


def clean_punctuation(s):
    s = remove_duplicate_end_punctuation(s)
    s = fix_new_sentence_spacing(s)
    s = substitute_greek_question_marks(s)
    return s


def split_text_into_lines(text):
    text_lines = [p for p in text.split('\n') if p]
    return text_lines


def load_corpus_sentences(paths_list):
    sentences = []
    for path in paths_list:
        with open(path, 'r', encoding='utf-8') as in_file:
            text = clean_punctuation(in_file.read())
            lines = split_text_into_lines(text)
            for line in lines:
                sentences += sent_tokenize(line)
    return sentences


CG_CORPUS_FILE_PATHS = ['./Data/cg_twitter.txt', './Data/cg_fb.txt', './Data/cg_other.txt']

SMG_CORPUS_FILE_PATHS = ['./Data/smg_twitter.txt', './Data/smg_fb.txt', './Data/smg_other.txt']

cg_sents = load_corpus_sentences(CG_CORPUS_FILE_PATHS)
smg_sents = load_corpus_sentences(SMG_CORPUS_FILE_PATHS)

cg_sents[:3]

['Œ†œÅŒ±œÉŒπŒΩŒø Œ±œÖŒ∫ŒøœÖŒπ ŒºŒµœÇ œÑŒø œÄŒ±œÉœáŒ±ŒªŒπŒΩŒø œÄŒøœÑŒÆœÅŒπ œÄŒøœÖ Œ≠œÄŒπŒ±œÉŒµ Œø ŒºŒπœÑœÉŒ∑œÇ #Œ±ŒπœÉœáŒøœÇ ü§£ü§£ü§£   @HARRIS_APOEL https://t.co/y9X7CmBEa5',
 '@HARRIS_APOEL @pirpoitis @vassrules ŒöŒ±ŒºŒΩŒøœÖŒΩ œÄŒøŒªŒªŒ± Œ±ŒΩŒ±Œ∫Œ±ŒπŒΩŒπœÉŒ∑ œÉœÑŒ± Œ†ŒµœÅŒ≤ŒøŒªŒπŒ± .',
 '@MUFCChristian ŒïŒªŒ± œÉœÖŒ≥Œ≥ŒµŒΩŒ∑ œÑŒ∂ŒπŒ±Œπ ŒµœáŒøœÖŒºŒµ ŒΩŒµŒøœÑŒµœÅŒ± œÄ œÑŒø ŒùŒπŒ∫ŒøŒªŒ∑.']

## 2. Cleaning the sentences

In [2]:
import unicodedata
from string import punctuation
from nltk.tokenize import WhitespaceTokenizer

punctuation += '¬¥ŒÑ‚Äì‚Äî‚Äï‚Ä¶‚Äú‚Äù‚Äò‚Äô¬´¬ª¬∑Õæ'


def fix_whitespace_and_strip_punctuation(s):
    tokens = WhitespaceTokenizer().tokenize(s)
    new_tokens = []

    for token in tokens:
        if token == 'Œø,œÑŒπ' or token == 'œå,œÑŒπ' or token == 'o,ti' or token == '√≥,ti':
            new_tokens.append(token)
        else:
            token = re.sub(r'(?<=[.,!\?;\'ŒÑ¬¥‚Äô‚Äî‚Ä¶¬∑Õæ])(?=[^\s])', r' ', token)
            new_token = token.translate(str.maketrans(dict.fromkeys(punctuation, None)))
            if new_token != '':
                new_tokens.append(new_token)

    return ' '.join(new_tokens)


def strip_accents(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')


def get_clean_sent_el(sentence):
    sentence = sentence.lower()
    sentence = sentence.replace('œÇ', 'œÉ')
    sentence = strip_accents(sentence)
    sentence = re.sub(r'^RT', '', sentence)
    sentence = re.sub(r'@\w*', '', sentence)
    sentence = re.sub(r'#\w*', '', sentence)
    sentence = re.sub(r'\$\w*', '', sentence)
    sentence = re.sub(r'&\w*;', '', sentence)
    sentence = re.sub(r'https?:\/\/.*\/\w*', '', sentence)
    sentence = ''.join(c for c in sentence if c <= '\uffff')
    sentence = fix_whitespace_and_strip_punctuation(sentence)

    sentence = sentence.replace('\ufeff', '')
    sentence = sentence.strip(' ')
    sentence = sentence.replace('  ', ' ')
    return sentence


cg_sents_clean = []
smg_sents_clean = []

for sent in cg_sents:
    cg_sents_clean.append(get_clean_sent_el(sent))
for sent in smg_sents:
    smg_sents_clean.append(get_clean_sent_el(sent))

cg_sents_clean = list(filter(None, cg_sents_clean))
smg_sents_clean = list(filter(None, smg_sents_clean))

cg_sents_clean[:3]

['œÄœÅŒ±œÉŒπŒΩŒø Œ±œÖŒ∫ŒøœÖŒπ ŒºŒµœÉ œÑŒø œÄŒ±œÉœáŒ±ŒªŒπŒΩŒø œÄŒøœÑŒ∑œÅŒπ œÄŒøœÖ ŒµœÄŒπŒ±œÉŒµ Œø ŒºŒπœÑœÉŒ∑œÉ',
 'Œ∫Œ±ŒºŒΩŒøœÖŒΩ œÄŒøŒªŒªŒ± Œ±ŒΩŒ±Œ∫Œ±ŒπŒΩŒπœÉŒ∑ œÉœÑŒ± œÄŒµœÅŒ≤ŒøŒªŒπŒ±',
 'ŒµŒªŒ± œÉœÖŒ≥Œ≥ŒµŒΩŒ∑ œÑŒ∂ŒπŒ±Œπ ŒµœáŒøœÖŒºŒµ ŒΩŒµŒøœÑŒµœÅŒ± œÄ œÑŒø ŒΩŒπŒ∫ŒøŒªŒ∑']

## 3. Building the feature extractor

NLTK comes with an `everygrams()` function that is able to return n-gram features. However, we will create custom functions for generating word and character n-grams. This is because we want to be able to remove redundant n-grams and also label word and character n-grams to avoid ambiguity.

In [3]:
from nltk import ngrams


def get_word_ngrams(tokens, n):
    ngrams_list = []
    ngrams_list.append(list(ngrams(tokens, n)))
    ngrams_flat_tuples = [ngram for ngram_list in ngrams_list for ngram in ngram_list]
    format_string = '%s'
    for _ in range(1, n):
        format_string += ' %s'
    ngrams_list_flat = [format_string % ngram_tuple for ngram_tuple in ngrams_flat_tuples]
    return ngrams_list_flat


def get_char_ngrams(word, n):
    ngrams_list = []
    word = word.replace('œÇ', 'œÉ')
    ngrams_list.append(
        list(
            ngrams(
                word, n, pad_left=True, pad_right=True, left_pad_symbol='_', right_pad_symbol='_'
            )
        )
    )

    # Remove redundant n-grams:
    if n > 2:
        redundant_combinations = n - 2
        ngrams_list = [
            ngram_list[redundant_combinations:-redundant_combinations] for ngram_list in ngrams_list
        ]

    ngrams_flat_tuples = [ngram for ngram_list in ngrams_list for ngram in ngram_list]
    format_string = ''
    for _ in range(0, n):
        format_string += '%s'
    ngrams_list_flat = [format_string % ngram_tuple for ngram_tuple in ngrams_flat_tuples]
    return ngrams_list_flat

We now proceed with writing the feature extractor function.

In [4]:
def extract_ngram_features(sent):
    sentence_tokens = WhitespaceTokenizer().tokenize(sent)

    features = {}

    # Word unigrams
    ngrams = get_word_ngrams(sentence_tokens, 1)
    for ngram in ngrams:
        features[f'word({ngram})'] = features.get(f'word({ngram})', 0) + 1

    # Word bigrams
    ngrams = get_word_ngrams(sentence_tokens, 2)
    for ngram in ngrams:
        features[f'word_bigram({ngram})'] = features.get(f'word_bigram({ngram})', 0) + 1

    # Char unigrams
    for word in sentence_tokens:
        ngrams = get_char_ngrams(word, 1)
        for ngram in ngrams:
            features[f'char({ngram})'] = features.get(f'char({ngram})', 0) + 1

    # Char bigrams
    for word in sentence_tokens:
        ngrams = get_char_ngrams(word, 2)
        for ngram in ngrams:
            features[f'char_bigram({ngram})'] = features.get(f'char_bigram({ngram})', 0) + 1

    # Char trigrams
    for word in sentence_tokens:
        ngrams = get_char_ngrams(word, 3)
        for ngram in ngrams:
            features[f'char_trigram({ngram})'] = features.get(f'char_trigram({ngram})', 0) + 1

    return features


test_features = extract_ngram_features('Œ±œÖœÑŒ∑ ŒµŒπŒΩŒ±Œπ Œ∑ œÉœÄŒ±œÅœÑŒ∑')

print('EXAMPLE N-GRAM FEATURES FOR THE SENTENCE "ŒëŒ•Œ§Œó ŒïŒôŒùŒëŒô Œó Œ£Œ†ŒëŒ°Œ§Œó"')
print('word(œÉœÄŒ±œÅœÑŒ∑):', test_features['word(œÉœÄŒ±œÅœÑŒ∑)'])
print('word_bigram(Œ±œÖœÑŒ∑ ŒµŒπŒΩŒ±Œπ):', test_features['word_bigram(Œ±œÖœÑŒ∑ ŒµŒπŒΩŒ±Œπ)'])
print('char(Œ±):', test_features['char(Œ±)'])
print('char_bigram(œÑŒ∑):', test_features['char_bigram(œÑŒ∑)'])
print('char_trigram(_ŒµŒπ):', test_features['char_trigram(_ŒµŒπ)'])

EXAMPLE N-GRAM FEATURES FOR THE SENTENCE "ŒëŒ•Œ§Œó ŒïŒôŒùŒëŒô Œó Œ£Œ†ŒëŒ°Œ§Œó"
word(œÉœÄŒ±œÅœÑŒ∑): 1
word_bigram(Œ±œÖœÑŒ∑ ŒµŒπŒΩŒ±Œπ): 1
char(Œ±): 3
char_bigram(œÑŒ∑): 2
char_trigram(_ŒµŒπ): 1


## 4. Creating the training and test sets

In [5]:
import random

RNG_SEED = 42

all_sents_labeled = [(sentence, 'CG') for sentence in cg_sents_clean] + [
    (sentence, 'SMG') for sentence in smg_sents_clean
]
random.Random(RNG_SEED).shuffle(all_sents_labeled)

all_sents_labeled[0]

('œâœÉœÄŒøœÖ ŒΩŒ± œÜŒ±ŒµŒπœÉ œÑŒø Œ∫œÅŒ±ŒºœÄŒπ œÉŒøœÖ ŒµŒΩ ŒΩŒ± œÑŒµŒªŒµŒπœâœÉœâ ŒºŒµŒΩ œÜŒøŒ±œÉŒ±Œπ', 'CG')

In [6]:
TRAIN_TEST_SPLIT = 0.8
TOTAL_SENTENCE_COUNT = len(all_sents_labeled)
TRAIN_SENTENCE_COUNT = round(TOTAL_SENTENCE_COUNT * TRAIN_TEST_SPLIT)

train_set = all_sents_labeled[:TRAIN_SENTENCE_COUNT]
test_set = all_sents_labeled[TRAIN_SENTENCE_COUNT:]

train_set_sents = [sent[0] for sent in train_set]
train_set_labels = [sent[1] for sent in train_set]
test_set_sents = [sent[0] for sent in test_set]
test_set_labels = [sent[1] for sent in test_set]

print(train_set_sents[0], train_set_labels[0])

œâœÉœÄŒøœÖ ŒΩŒ± œÜŒ±ŒµŒπœÉ œÑŒø Œ∫œÅŒ±ŒºœÄŒπ œÉŒøœÖ ŒµŒΩ ŒΩŒ± œÑŒµŒªŒµŒπœâœÉœâ ŒºŒµŒΩ œÜŒøŒ±œÉŒ±Œπ CG


In [7]:
print('DATASET\t', 'SENTENCES')
print('All\t', TOTAL_SENTENCE_COUNT)
print('Training', TRAIN_SENTENCE_COUNT)
print('Test\t', TOTAL_SENTENCE_COUNT - TRAIN_SENTENCE_COUNT)

DATASET	 SENTENCES
All	 1055
Training 844
Test	 211


## 5. Vectorization

We now proceed with vectorizing our data. We will use scikit-learn‚Äôs `CountVectorizer.fit_transform()` on the training set, and `CountVectorizer.transform()` on the test set. Unlike `fit_transform()`, `transform()` does not change the count vectorizer‚Äôs vocabulary, making it ideal for the test set.

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(analyzer=extract_ngram_features)

train_set_vectors = count_vect.fit_transform(train_set_sents)
test_set_vectors = count_vect.transform(test_set_sents)
train_set_vectors

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 128550 stored elements and shape (844, 16460)>

In [9]:
train_set_vectors.toarray()[0]

array([0, 0, 0, ..., 0, 0, 0], shape=(16460,))

Every sentence in the training set is now represented as a vector in `train_set_vectors`. The length of each vector is the total number of n-gram features found in the training set:

In [10]:
len(count_vect.vocabulary_)

16460

In a given vector, each element represents the count of one of the n-gram features in the particular sentence. We can look at the indices of some of these n-gram features:

In [11]:
print('EXAMPLE N-GRAM FEATURE INDICES IN OUR MATRIX')
print('word(ŒΩŒ±):', count_vect.vocabulary_['word(ŒΩŒ±)'])
print('word_bigram(Œ±œÄŒø œÑŒø):', count_vect.vocabulary_['word_bigram(Œ±œÄŒø œÑŒø)'])
print('char(Œ∏):', count_vect.vocabulary_['char(Œ∏)'])
print('char_bigram(ŒøœÖ):', count_vect.vocabulary_['char_bigram(ŒøœÖ)'])
print('char_trigram(œÄŒøŒª):', count_vect.vocabulary_['char_trigram(œÄŒøŒª)'])

EXAMPLE N-GRAM FEATURE INDICES IN OUR MATRIX
word(ŒΩŒ±): 5815
word_bigram(Œ±œÄŒø œÑŒø): 7806
char(Œ∏): 15
char_bigram(ŒøœÖ): 313
char_trigram(œÄŒøŒª): 2650


## 6. Building the classifiers

We will build three different classifiers and compare their performances using the following functions:

In [12]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


def show_performance_header_and_accuracy(predictions):
    print('\t\t\tPERFORMANCE\n')
    print('Accuracy:', round(accuracy_score(test_set_labels, predictions), 2), '\n')


def show_confusion_matrix(cm):
    print('\t         Predicted')
    print('\t        CG       SMG')
    print('\t     -------- --------')
    print('\tCG  | {:^6} | {:^6}'.format(cm[0][0], cm[0][1]))
    print('Actual\t     -------- --------')
    print('\tSMG | {:^6} | {:^6}'.format(cm[1][0], cm[1][1]))


def show_most_informative_features(vectorizer, clf, n=10):
    print("\t\t    CG\t\t\t\t\t\t    SMG\n")
    feature_names = vectorizer.get_feature_names_out()

    # https://stackoverflow.com/q/74618563/4304516
    coefs = (
        clf.coef_[0]
        if hasattr(clf, 'coef_')
        else clf.feature_log_prob_[1] - clf.feature_log_prob_[0]
    )
    coefs_with_fns = sorted(zip(coefs, feature_names))

    top = zip(coefs_with_fns[:n], coefs_with_fns[: -(n + 1) : -1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%17s\t\t\t%.4f\t%17s" % (coef_1, fn_1, coef_2, fn_2))

### 6.1 Multinomial Naive Bayes classifier

Our first classifier will be a naive Bayes (NB) classifier, specifically a **multinomial NB classifier** since  multinomial distributions work well for data in the form of counts.

The multinomial NB classifier comes with one [hyperparameter](https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning)) called $ \alpha $, which takes a value in the range $ [0, 1] $. The [multinomial NB formula](https://en.wikipedia.org/wiki/Naive_Bayes_classifier#Multinomial_naive_Bayes) assigns a probability of $ 0 $ to features not present in the training set, which is problematic because it will wipe out all information in the other probabilities when they are multiplied. The $ \alpha $ hyperparameter solves this by incorporating a pseudocount (i.e., a small correction) in a process called [additive smoothing](https://en.wikipedia.org/wiki/Additive_smoothing).

While we could use [hyperparameter optimization](https://en.wikipedia.org/wiki/Hyperparameter_optimization) to find the optimal value for $ \alpha $ [as described in this Stack Overflow answer](https://stackoverflow.com/a/72485324/4304516), we will simply keep the default value of $ 1 $. Because our dataset is small, we can expect many possible words and n-gram features to be missing from our training set, so moving the probabilities closer to a uniform distribution is desirable.

In [13]:
clf_multinomialNB = MultinomialNB()
clf_multinomialNB.fit(train_set_vectors, train_set_labels)

0,1,2
,"alpha  alpha: float or array-like of shape (n_features,), default=1.0 Additive (Laplace/Lidstone) smoothing parameter (set alpha=0 and force_alpha=True, for no smoothing).",1.0
,"force_alpha  force_alpha: bool, default=True If False and alpha is less than 1e-10, it will set alpha to 1e-10. If True, alpha will remain unchanged. This may cause numerical errors if alpha is too close to 0. .. versionadded:: 1.2 .. versionchanged:: 1.4  The default value of `force_alpha` changed to `True`.",True
,"fit_prior  fit_prior: bool, default=True Whether to learn class prior probabilities or not. If false, a uniform prior will be used.",True
,"class_prior  class_prior: array-like of shape (n_classes,), default=None Prior probabilities of the classes. If specified, the priors are not adjusted according to the data.",


In [14]:
clf_multinomialNB_predictions = clf_multinomialNB.predict(test_set_vectors)

show_performance_header_and_accuracy(clf_multinomialNB_predictions)

print(classification_report(test_set_labels, clf_multinomialNB_predictions))

c_matrix = confusion_matrix(test_set_labels, clf_multinomialNB_predictions)
show_confusion_matrix(c_matrix)

			PERFORMANCE

Accuracy: 0.96 

              precision    recall  f1-score   support

          CG       0.96      0.97      0.96       118
         SMG       0.96      0.95      0.95        93

    accuracy                           0.96       211
   macro avg       0.96      0.96      0.96       211
weighted avg       0.96      0.96      0.96       211

	         Predicted
	        CG       SMG
	     -------- --------
	CG  |  114   |   4   
Actual	     -------- --------
	SMG |   5    |   88  


In [15]:
show_most_informative_features(count_vect, clf_multinomialNB, n=20)

		    CG						    SMG

	-4.6954	char_trigram(ŒπŒ±Œπ)			2.7406	  char_bigram(œÉŒ∏)
	-4.6764	      word(œÑŒ∂ŒπŒ±Œπ)			2.6761	         word(Œ¥Œµ)
	-4.5483	char_trigram(_œÑŒ∂)			2.6761	char_trigram(ŒπœÉŒ∏)
	-4.2216	char_trigram(œÑŒ∂Œπ)			2.6071	char_trigram(œáŒµœÑ)
	-4.0750	  char_bigram(Œ∏Œ∫)			2.5891	        word(Œ±œÄŒø)
	-4.0393	char_trigram(œÉŒπŒµ)			2.5330	       word(œÄŒøŒªœÖ)
	-3.8874	  char_bigram(œÑŒ∂)			2.5330	          word(Œ∫)
	-3.8606	  char_bigram(œÜŒ∫)			2.5330	    word(Œ≥œÖŒΩŒ±ŒπŒ∫Œ±)
	-3.8606	char_trigram(Œ∏Œ∫Œπ)			2.5330	char_trigram(œÉŒ∏Œ∑)
	-3.8161	char_trigram(Œ∂Œ±Œπ)			2.5330	char_trigram(ŒªœÖ_)
	-3.7208	char_trigram(Œ±ŒºŒΩ)			2.5330	char_trigram(_œÄŒ∑)
	-3.7208	       word(œÑŒ∂Œ±Œπ)			2.5330	char_trigram(_Œ∫_)
	-3.6955	char_trigram(ŒªŒ±Œª)			2.4529	       word(Œ±œÖœÑŒ±)
	-3.6210	         word(ŒµŒΩ)			2.4529	char_trigram(œáŒπœÉ)
	-3.5285	char_trigram(œÖŒªŒª)			2.4529	char_trigram(ŒπœâŒΩ)
	-3.4332	char_trigram(ŒµœÉŒπ)			2.3659	     word(Œ∫Œ±ŒΩŒµŒπœÉ)
	-3.3993	        w

### 6.2 Linear Support Vector classifier

Next, we will try a support vector machine. Since our dataset is small, which results in a relatively small number of features, it is [justifiable](https://stats.stackexchange.com/questions/73032/linear-kernel-and-non-linear-kernel-for-support-vector-machine) to opt for the simplest model, the **linear support vector classifier (SVC)**.

The linear SVC comes with [several hyperparameters](https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html):

The `penalty`, `loss`, and `tol` (tolerance) parameters will be left at their standard values. The `dual` parameter will be kept at its default `'auto'` value, which will choose the value of the parameter automatically. In our case, the value of `True` will be chosen because $ n\_samples < n\_features $ in the training set. We will also keep the `C` regularization parameter at its default value of `1` to avoid overfitting due to our small dataset size. The `multi_class` parameter will not be touched since it is only relevant if there are more than two classes. We will leave `fit_intercept` at its default value of `True` to determine the line of best fit to separate our two classes. We will also increase `max_iter` to `1500` from its default value of `1000`. Finally, we will keep `intercept_scaling` and `class_weight` at their default values since our dataset is balanced.

There are two more parameters that are not hyperparameters: `verbose` and `random_state`. The parameter `random_state` takes a seed for RNG. We will provide a value to ensure that our results are reproducible. 

In [16]:
clf_linearSVC = LinearSVC(random_state=RNG_SEED, max_iter=1500)
clf_linearSVC.fit(train_set_vectors, train_set_labels)

0,1,2
,"penalty  penalty: {'l1', 'l2'}, default='l2' Specifies the norm used in the penalization. The 'l2' penalty is the standard used in SVC. The 'l1' leads to ``coef_`` vectors that are sparse.",'l2'
,"loss  loss: {'hinge', 'squared_hinge'}, default='squared_hinge' Specifies the loss function. 'hinge' is the standard SVM loss (used e.g. by the SVC class) while 'squared_hinge' is the square of the hinge loss. The combination of ``penalty='l1'`` and ``loss='hinge'`` is not supported.",'squared_hinge'
,"dual  dual: ""auto"" or bool, default=""auto"" Select the algorithm to either solve the dual or primal optimization problem. Prefer dual=False when n_samples > n_features. `dual=""auto""` will choose the value of the parameter automatically, based on the values of `n_samples`, `n_features`, `loss`, `multi_class` and `penalty`. If `n_samples` < `n_features` and optimizer supports chosen `loss`, `multi_class` and `penalty`, then dual will be set to True, otherwise it will be set to False. .. versionchanged:: 1.3  The `""auto""` option is added in version 1.3 and will be the default  in version 1.5.",'auto'
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"C  C: float, default=1.0 Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. For an intuitive visualization of the effects of scaling the regularization parameter C, see :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`.",1.0
,"multi_class  multi_class: {'ovr', 'crammer_singer'}, default='ovr' Determines the multi-class strategy if `y` contains more than two classes. ``""ovr""`` trains n_classes one-vs-rest classifiers, while ``""crammer_singer""`` optimizes a joint objective over all classes. While `crammer_singer` is interesting from a theoretical perspective as it is consistent, it is seldom used in practice as it rarely leads to better accuracy and is more expensive to compute. If ``""crammer_singer""`` is chosen, the options loss, penalty and dual will be ignored.",'ovr'
,"fit_intercept  fit_intercept: bool, default=True Whether or not to fit an intercept. If set to True, the feature vector is extended to include an intercept term: `[x_1, ..., x_n, 1]`, where 1 corresponds to the intercept. If set to False, no intercept will be used in calculations (i.e. data is expected to be already centered).",True
,"intercept_scaling  intercept_scaling: float, default=1.0 When `fit_intercept` is True, the instance vector x becomes ``[x_1, ..., x_n, intercept_scaling]``, i.e. a ""synthetic"" feature with a constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes intercept_scaling * synthetic feature weight. Note that liblinear internally penalizes the intercept, treating it like any other term in the feature vector. To reduce the impact of the regularization on the intercept, the `intercept_scaling` parameter can be set to a value greater than 1; the higher the value of `intercept_scaling`, the lower the impact of regularization on it. Then, the weights become `[w_x_1, ..., w_x_n, w_intercept*intercept_scaling]`, where `w_x_1, ..., w_x_n` represent the feature weights and the intercept weight is scaled by `intercept_scaling`. This scaling allows the intercept term to have a different regularization behavior compared to the other features.",1
,"class_weight  class_weight: dict or 'balanced', default=None Set the parameter C of class i to ``class_weight[i]*C`` for SVC. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``.",
,"verbose  verbose: int, default=0 Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in liblinear that, if enabled, may not work properly in a multithreaded context.",0


In [17]:
clf_linearSVC_predictions = clf_linearSVC.predict(test_set_vectors)

show_performance_header_and_accuracy(clf_linearSVC_predictions)

print(classification_report(test_set_labels, clf_linearSVC_predictions))

c_matrix = confusion_matrix(test_set_labels, clf_linearSVC_predictions)
show_confusion_matrix(c_matrix)

			PERFORMANCE

Accuracy: 0.94 

              precision    recall  f1-score   support

          CG       0.94      0.96      0.95       118
         SMG       0.95      0.92      0.93        93

    accuracy                           0.94       211
   macro avg       0.94      0.94      0.94       211
weighted avg       0.94      0.94      0.94       211

	         Predicted
	        CG       SMG
	     -------- --------
	CG  |  113   |   5   
Actual	     -------- --------
	SMG |   7    |   86  


In [18]:
show_most_informative_features(count_vect, clf_linearSVC, n=20)

		    CG						    SMG

	-0.3556	char_trigram(ŒµŒΩ_)			0.1549	  char_bigram(ŒªœÖ)
	-0.3121	         word(ŒµŒΩ)			0.1480	        word(Œ∫Œ±Œπ)
	-0.2558	char_trigram(_œÑŒ∂)			0.1461	          char(Œ¥)
	-0.2447	  char_bigram(œÑŒ∂)			0.1360	char_trigram(Œ¥ŒµŒΩ)
	-0.2216	  char_bigram(Œ±Œº)			0.1360	        word(Œ¥ŒµŒΩ)
	-0.2088	  char_bigram(_Œµ)			0.1356	  char_bigram(_Œ≤)
	-0.1928	char_trigram(Œ∫Œ±Œº)			0.1313	          char(ŒΩ)
	-0.1737	  char_bigram(Œ∏Œ∫)			0.1301	char_trigram(ŒπŒΩŒ±)
	-0.1652	char_trigram(Œ∏Œ∫Œπ)			0.1250	  char_bigram(œÑŒπ)
	-0.1554	char_trigram(_ŒøœÖ)			0.1239	char_trigram(Œ∏ŒµŒª)
	-0.1510	char_trigram(_ŒµŒΩ)			0.1181	char_trigram(œÑŒ±œÉ)
	-0.1423	char_trigram(ŒªŒ±Œª)			0.1178	  char_bigram(Œ∑_)
	-0.1398	          char(œÄ)			0.1122	          char(Œª)
	-0.1397	char_trigram(œÉŒπŒµ)			0.1118	          char(œá)
	-0.1375	char_trigram(œÑŒπ_)			0.1089	  char_bigram(Œ±Œ∂)
	-0.1355	  char_bigram(Œ∫Œµ)			0.1088	          char(Œπ)
	-0.1353	  char_bigram(ŒΩŒΩ)			0.1075	char_trigr

### 6.3 Logistic Regression classifier

We will also try regression analysis, specifically **logistic regression** since our dependent variable is categorical.

The logistic regression classifier also comes with several hyperparameters:

We will leave `C`, `dual`, `tol`, `fit_intercept`, `intercept_scaling`, and `class_weight` at default values as with the previous classifier. We will set the solver to `'liblinear'`, which is recommended for small datasets. We will leave `l1_ratio` at its default value of `0.0` because we expect the number of irrelevant features to be relatively low due to the small dataset size. We will keep `max_iter` at the default value of `100` due to the small dataset size implying that more than 100 iterations will not be required for the solver to converge.

Just like before, we will provide a seed to the `random_state` parameter. However,  Finally, the `warm_start` parameter is irrelevant when the `liblinear` solver is used.

In [19]:
clf_logisticRegression = LogisticRegression(solver='liblinear', random_state=RNG_SEED)
clf_logisticRegression.fit(train_set_vectors, train_set_labels)

0,1,2
,"penalty  penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add a L2 penalty term and it is the default choice; - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. versionadded:: 0.19  l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'deprecated'
,"C  C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.",1.0
,"l1_ratio  l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning::  Certain values of `l1_ratio`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. versionchanged:: 1.8  Default value changed from None to 0.0. .. deprecated:: 1.8  `None` is deprecated and will be removed in version 1.10. Always use  `l1_ratio` to specify the penalty type.",0.0
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.",False
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"intercept_scaling  intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a ""synthetic"" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note::  The synthetic feature weight is subject to L1 or L2  regularization as all other features.  To lessen the effect of regularization on synthetic feature weight  (and therefore on the intercept) `intercept_scaling` has to be increased.",1
,"class_weight  class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17  *class_weight='balanced'*",
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details.",42
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`  for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. seealso::  Refer to the :ref:`User Guide ` for more  information regarding :class:`LogisticRegression` and more specifically the  :ref:`Table `  summarizing solver/penalty supports. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionchanged:: 0.22  The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'liblinear'


In [20]:
clf_logisticRegression_predictions = clf_logisticRegression.predict(test_set_vectors)

show_performance_header_and_accuracy(clf_logisticRegression_predictions)

print(classification_report(test_set_labels, clf_logisticRegression_predictions))

c_matrix = confusion_matrix(test_set_labels, clf_logisticRegression_predictions)
show_confusion_matrix(c_matrix)

			PERFORMANCE

Accuracy: 0.94 

              precision    recall  f1-score   support

          CG       0.95      0.95      0.95       118
         SMG       0.94      0.94      0.94        93

    accuracy                           0.94       211
   macro avg       0.94      0.94      0.94       211
weighted avg       0.94      0.94      0.94       211

	         Predicted
	        CG       SMG
	     -------- --------
	CG  |  112   |   6   
Actual	     -------- --------
	SMG |   6    |   87  


In [21]:
show_most_informative_features(count_vect, clf_logisticRegression, n=20)

		    CG						    SMG

	-1.2569	char_trigram(ŒµŒΩ_)			0.6251	        word(Œ∫Œ±Œπ)
	-1.1476	         word(ŒµŒΩ)			0.5161	  char_bigram(ŒªœÖ)
	-0.9768	char_trigram(_œÑŒ∂)			0.5102	          char(Œ¥)
	-0.9598	  char_bigram(œÑŒ∂)			0.4621	char_trigram(ŒπŒΩŒ±)
	-0.8303	  char_bigram(Œ±Œº)			0.4541	char_trigram(Œ¥ŒµŒΩ)
	-0.7260	  char_bigram(_Œµ)			0.4487	        word(Œ¥ŒµŒΩ)
	-0.6599	char_trigram(_ŒµŒΩ)			0.4417	  char_bigram(_Œ≤)
	-0.6499	char_trigram(Œ∫Œ±Œº)			0.4401	char_trigram(Œ∫Œ±Œπ)
	-0.6232	  char_bigram(Œ∏Œ∫)			0.4285	          char(Œª)
	-0.5460	char_trigram(Œ∏Œ∫Œπ)			0.3955	          char(œá)
	-0.4906	char_trigram(œÑŒ∂Œπ)			0.3871	  char_bigram(Œ∑_)
	-0.4812	  char_bigram(œÜŒ∫)			0.3870	char_trigram(œÄŒø_)
	-0.4758	  char_bigram(Œ∂Œπ)			0.3860	  char_bigram(œÑŒπ)
	-0.4733	          char(œÄ)			0.3732	        word(Œ±œÄŒø)
	-0.4644	char_trigram(_ŒªŒ±)			0.3647	  char_bigram(Œ¥Œµ)
	-0.4618	  char_bigram(Œ∫Œµ)			0.3642	char_trigram(œÑŒ±œÉ)
	-0.4575	char_trigram(_ŒøœÖ)			0.3641	  char_b

**The classification algorithm with the best performance is *Multinomial NB***.

## 7. Analyzing misclassifications made by the Multinomial NB classifier

In [22]:
print('MISCLASSIFICATIONS\n')

misclassificationCount = 0

for i, sent in enumerate(test_set_sents):
    if test_set_labels[i] != clf_multinomialNB_predictions[i]:
        misclassificationCount += 1
        print(
            f'{misclassificationCount}.',
            sent,
            f'(CORRECT = {test_set_labels[i]},',
            f'PREDICTED = {clf_multinomialNB_predictions[i]})\n',
        )

MISCLASSIFICATIONS

1. ŒºŒπŒªŒ±ŒºŒµ ŒøŒªŒ± œÑŒ± ŒªŒµœÜœÑŒ± (CORRECT = SMG, PREDICTED = CG)

2. œÄœÅŒøœÉœÄŒ±Œ∏ŒøœÖœÉŒ± ŒΩŒ± Œ∫Œ±Œ∏Œ±œÅŒπœÉœâ œÑŒø ŒªŒ±ŒπŒºŒø ŒºŒøœÖ Œ∫Œ±Œπ ŒΩŒøŒºŒπŒ∂œâ ŒµŒπœÄŒ± Œ∫Œ±œÑŒπ œÉœÑŒ± ŒµŒ≤œÅŒ±ŒπŒ∫Œ± (CORRECT = SMG, PREDICTED = CG)

3. ŒµœÄŒ±œÅœáŒπŒ± ŒªŒµŒºŒµœÉŒøœÖ Œ∫ ŒµœáœÑŒµœÉ Œ∫Œ±Œπ œÉŒ∑ŒºŒµœÅŒ± Œ∑Œ≤œÅŒ±ŒºŒµ (CORRECT = CG, PREDICTED = SMG)

4. œÉœÑŒ∑ Œ∂œâŒ∑ ŒºŒøœÖ ŒµŒπœÉŒ±Œπ Œ≥ŒøœÖœÅŒπ œÉœÑŒø œÑŒ∂Œ±œÑŒ∂ŒπŒ∫Œπ œÑŒø Œ±Œ≥Œ≥ŒøœÖœÅŒπ (CORRECT = SMG, PREDICTED = CG)

5. œÑœâœÅŒ± ŒºŒøœÖ Œ∑œÅŒ∏Œµ Œ∫Œ±Œπ Œ∑ Œ≥ŒΩœâœÉœÑŒ∑ œÅŒ∑œÉŒ∑ ŒΩŒ± ŒµŒπœÉŒ±Œπ Œ±œÅŒπœÉœÑŒµœÅŒ± Œ±ŒªŒªŒ± ŒΩŒ± œÉŒ∫ŒµœÜœÑŒµœÉŒ±Œπ (CORRECT = SMG, PREDICTED = CG)

6. œÑŒøœÖœÑŒ∑ œÑŒ∑ŒΩ œÄŒµœÅŒπŒøŒ¥Œø ŒøŒºœâœÉ ŒµŒΩ Œ∏Œ±ŒΩŒ±œÑŒøœÉ (CORRECT = CG, PREDICTED = SMG)

7. œÑŒµŒªŒπŒ∫Œ± ŒøŒπ œÄŒøœÅŒΩŒµœÉ Œ¥ŒµŒΩ Œ∑œÑŒ±ŒΩ Œ∫Œ±Œπ œÑŒøœÉŒø Œ±ŒªŒªŒøŒ¥Œ±œÄŒµœÉ Œ±ŒªŒªŒ± œÑŒπ œÉŒ∑ŒºŒ±œÉŒπŒ± ŒµœáŒµŒπ Œ∑ Œ¥ŒøœÖŒªŒµŒπŒ± ŒµŒ≥ŒπŒΩŒµ (CORRECT = SMG, PREDICTED = CG)

8. œÑŒø œÄœÅŒøŒ≤ŒªŒ∑ŒºŒ±ŒΩ ŒµŒπŒΩŒ±Œπ ŒøœÑŒπ œÄœÅŒµœÄŒµŒπ ŒΩŒ± Œ±œ

## 8. Trying the Multinomial Naive Bayes classifier with custom input

First, we build a more powerful version of the classifier using all the available data:

In [23]:
full_set_sents = [sent[0] for sent in all_sents_labeled]
full_set_labels = [sent[1] for sent in all_sents_labeled]
full_set_vectors = count_vect.fit_transform(full_set_sents)

clf_super_multinomialNB = MultinomialNB()
clf_super_multinomialNB.fit(full_set_vectors, full_set_labels)

0,1,2
,"alpha  alpha: float or array-like of shape (n_features,), default=1.0 Additive (Laplace/Lidstone) smoothing parameter (set alpha=0 and force_alpha=True, for no smoothing).",1.0
,"force_alpha  force_alpha: bool, default=True If False and alpha is less than 1e-10, it will set alpha to 1e-10. If True, alpha will remain unchanged. This may cause numerical errors if alpha is too close to 0. .. versionadded:: 1.2 .. versionchanged:: 1.4  The default value of `force_alpha` changed to `True`.",True
,"fit_prior  fit_prior: bool, default=True Whether to learn class prior probabilities or not. If false, a uniform prior will be used.",True
,"class_prior  class_prior: array-like of shape (n_classes,), default=None Prior probabilities of the classes. If specified, the priors are not adjusted according to the data.",


Trying two custom sentences:

In [24]:
cgSent = 'Œó ŒöœçœÄœÅŒøœÇ ŒµŒΩ œÄŒøœÖ œÑŒµœÇ œÄŒπŒø œåŒºŒøœÅœÜŒµœÇ œáœéœÅŒµœÇ.'
smgSent = 'Œó ŒöœçœÄœÅŒøœÇ ŒµŒØŒΩŒ±Œπ Œ±œÄœå œÑŒπœÇ œÄŒπŒø œåŒºŒøœÅœÜŒµœÇ œáœéœÅŒµœÇ.'

demoSentences = [cgSent, smgSent]

cgSent = get_clean_sent_el(cgSent)
smgSent = get_clean_sent_el(smgSent)

test_vec = count_vect.transform([cgSent, smgSent])

for sentenceNumber, predictionArr in enumerate(clf_super_multinomialNB.predict_proba(test_vec)):
    print(f'SENTENCE {sentenceNumber + 1}: ‚Äú{demoSentences[sentenceNumber]}‚Äù')
    if predictionArr[0] > predictionArr[1]:
        print(f'PREDICTION: Cypriot Greek (Confidence: {predictionArr[0]:.2f})\n')
    else:
        print(f'PREDICTION: Standard Modern Greek (Confidence: {predictionArr[1]:.2f})\n')

SENTENCE 1: ‚ÄúŒó ŒöœçœÄœÅŒøœÇ ŒµŒΩ œÄŒøœÖ œÑŒµœÇ œÄŒπŒø œåŒºŒøœÅœÜŒµœÇ œáœéœÅŒµœÇ.‚Äù
PREDICTION: Cypriot Greek (Confidence: 1.00)

SENTENCE 2: ‚ÄúŒó ŒöœçœÄœÅŒøœÇ ŒµŒØŒΩŒ±Œπ Œ±œÄœå œÑŒπœÇ œÄŒπŒø œåŒºŒøœÅœÜŒµœÇ œáœéœÅŒµœÇ.‚Äù
PREDICTION: Standard Modern Greek (Confidence: 1.00)

