# Building the Classifier

## 1. Loading the corpus

In [87]:
import re
from nltk import sent_tokenize

cg_sents = []
smg_sents = []

def remove_duplicate_punctuation(s):
    return(re.sub(r'(\.|\?|!|;)\1+', r'\1 ', s))

with open('./Data/cg_twitter.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        cg_sents += sent_tokenize(line)
    
with open('./Data/cg_fb.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        cg_sents += sent_tokenize(line)
    
with open('./Data/cg_other.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        cg_sents += sent_tokenize(line)

with open('./Data/smg_twitter.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        smg_sents += sent_tokenize(line)
    
with open('./Data/smg_fb.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        smg_sents += sent_tokenize(line)
    
with open('./Data/smg_other.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        smg_sents += sent_tokenize(line)

cg_sents[:3]

['Œ†œÅŒ±œÉŒπŒΩŒø Œ±œÖŒ∫ŒøœÖŒπ ŒºŒµœÇ œÑŒø œÄŒ±œÉœáŒ±ŒªŒπŒΩŒø œÄŒøœÑŒÆœÅŒπ œÄŒøœÖ Œ≠œÄŒπŒ±œÉŒµ Œø ŒºŒπœÑœÉŒ∑œÇ #Œ±ŒπœÉœáŒøœÇ ü§£ü§£ü§£   @HARRIS_APOEL https://t.co/y9X7CmBEa5',
 '@HARRIS_APOEL @pirpoitis @vassrules ŒöŒ±ŒºŒΩŒøœÖŒΩ œÄŒøŒªŒªŒ± Œ±ŒΩŒ±Œ∫Œ±ŒπŒΩŒπœÉŒ∑ œÉœÑŒ± Œ†ŒµœÅŒ≤ŒøŒªŒπŒ± .',
 '@MUFCChristian ŒïŒªŒ± œÉœÖŒ≥Œ≥ŒµŒΩŒ∑ œÑŒ∂ŒπŒ±Œπ ŒµœáŒøœÖŒºŒµ ŒΩŒµŒøœÑŒµœÅŒ± œÄ œÑŒø ŒùŒπŒ∫ŒøŒªŒ∑.']

## 2. Cleaning the text

In [113]:
import unicodedata
from string import punctuation
from nltk.tokenize import WhitespaceTokenizer

punctuation += 'ŒÑ‚Ä¶‚Äú‚Äù‚Äì‚Äî‚Äï'

def strip_accents(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')

def get_clean_sent_el(sentence):
    sentence = re.sub(r'^RT', '', sentence)
    sentence = re.sub(r'\&\w*;', '', sentence)
    sentence = re.sub(r'\@\w*', '', sentence)
    sentence = re.sub(r'\$\w*', '', sentence)
    sentence = re.sub(r'https?:\/\/.*\/\w*', '', sentence)
    sentence = ''.join(c for c in sentence if c <= '\uFFFF')
    sentence = strip_accents(sentence)
    sentence = re.sub(r'#\w*', '', sentence)
    sentence = sentence.lower()
    tokens = WhitespaceTokenizer().tokenize(sentence)
    new_tokens = []
    for token in tokens:
        if token == 'Œø,œÑŒπ' or token == 'œå,œÑŒπ' or token == 'o,ti' or token == '√≥,ti':
            new_tokens.append(token.replace(' ', ''))
        else:
            token = re.sub(r'(?<=[.,!\?;\'ŒÑ])(?=[^\s])', r' ', token)
            new_token = token.translate(str.maketrans({key: None for key in punctuation}))
            if (new_token != ''):
                new_tokens.append(new_token)
    sentence =' '.join(new_tokens)
    sentence = sentence.strip(' ')
    return sentence

cg_sents_clean = []
smg_sents_clean = []

for sent in cg_sents:
    cg_sents_clean.append(get_clean_sent_el(sent))
for sent in smg_sents:
    smg_sents_clean.append(get_clean_sent_el(sent))

cg_sents_clean = list(filter(None, cg_sents_clean))
smg_sents_clean = list(filter(None, smg_sents_clean))
cg_sents_clean[:3]

['œÄœÅŒ±œÉŒπŒΩŒø Œ±œÖŒ∫ŒøœÖŒπ ŒºŒµœÇ œÑŒø œÄŒ±œÉœáŒ±ŒªŒπŒΩŒø œÄŒøœÑŒ∑œÅŒπ œÄŒøœÖ ŒµœÄŒπŒ±œÉŒµ Œø ŒºŒπœÑœÉŒ∑œÇ',
 'Œ∫Œ±ŒºŒΩŒøœÖŒΩ œÄŒøŒªŒªŒ± Œ±ŒΩŒ±Œ∫Œ±ŒπŒΩŒπœÉŒ∑ œÉœÑŒ± œÄŒµœÅŒ≤ŒøŒªŒπŒ±',
 'ŒµŒªŒ± œÉœÖŒ≥Œ≥ŒµŒΩŒ∑ œÑŒ∂ŒπŒ±Œπ ŒµœáŒøœÖŒºŒµ ŒΩŒµŒøœÑŒµœÅŒ± œÄ œÑŒø ŒΩŒπŒ∫ŒøŒªŒ∑']

## 3. Building the feature extractor

In [90]:
from nltk import ngrams

def get_word_ngrams(tokens, n):
    ngrams_list = []
    ngrams_list.append(list(ngrams(tokens, n)))
    ngrams_flat_tuples = [ngram for ngram_list in ngrams_list for ngram in ngram_list]
    format_string = '%s'
    for i in range(1, n):
        format_string += (' %s')
    ngrams_list_flat = [format_string % ngram_tuple for ngram_tuple in ngrams_flat_tuples]
    return ngrams_list_flat

def get_char_ngrams(word, n):
    ngrams_list = []
    word = re.sub(r'œÇ', 'œÉ', word)
    ngrams_list.append(list(ngrams(word, n, pad_left=True, pad_right=True, left_pad_symbol='_', right_pad_symbol='_')))
    
    # Removing redundant ngrams:
    if (n > 2):
        redundant_combinations = n - 2
        ngrams_list = [ngram_list[redundant_combinations : -redundant_combinations] for ngram_list in ngrams_list]
    
    ngrams_flat_tuples = [ngram for ngram_list in ngrams_list for ngram in ngram_list]
    format_string = ''
    for i in range(0, n):
        format_string += ('%s')
    ngrams_list_flat = [format_string % ngram_tuple for ngram_tuple in ngrams_flat_tuples]
    return ngrams_list_flat

In [91]:
# Feature extractor
def get_ngram_features(sent): # The reason I do not use NLTK's everygrams to extract the features quickly is because the behavior of my n-gram extractor is modified to remove redundant n-grams. Also, I need to label word and char n-grams to avoid ambiguity
    sentence_tokens = WhitespaceTokenizer().tokenize(sent)
    
    features = {}
    
    # Word unigrams
    ngrams = get_word_ngrams(sentence_tokens, 1)
    for ngram in ngrams:
        features[f'word({ngram})'] = features.get(f'word({ngram})', 0) + 1 # The second parameter to .get() is a default value if the key doesn't exist.
    
    # Word bigrams
    ngrams = get_word_ngrams(sentence_tokens, 2)
    for ngram in ngrams:
        features[f'word_bigram({ngram})'] = features.get(f'word_bigram({ngram})', 0) + 1
    
    # Char unigrams
    for word in sentence_tokens:
        ngrams = get_char_ngrams(word, 1)
        for ngram in ngrams:
            features[f'char({ngram})'] = features.get(f'char({ngram})', 0) + 1
    
    # Char bigrams
    for word in sentence_tokens:
        ngrams = get_char_ngrams(word, 2)
        for ngram in ngrams:
            features[f'char_bigram({ngram})'] = features.get(f'char_bigram({ngram})', 0) + 1
    
    # Char trigrams
    for word in sentence_tokens:
        ngrams = get_char_ngrams(word, 3)
        for ngram in ngrams:
            features[f'char_trigram({ngram})'] = features.get(f'char_trigram({ngram})', 0) + 1
    
    return features

get_ngram_features('Œ±œÖœÑŒ∑ ŒµŒπŒΩŒ±Œπ Œ∑ œÉœÄŒ±œÅœÑŒ∑')

{'char(Œ±)': 3,
 'char(Œµ)': 1,
 'char(Œ∑)': 3,
 'char(Œπ)': 2,
 'char(ŒΩ)': 1,
 'char(œÄ)': 1,
 'char(œÅ)': 1,
 'char(œÉ)': 1,
 'char(œÑ)': 2,
 'char(œÖ)': 1,
 'char_bigram(_Œ±)': 1,
 'char_bigram(_Œµ)': 1,
 'char_bigram(_Œ∑)': 1,
 'char_bigram(_œÉ)': 1,
 'char_bigram(Œ±Œπ)': 1,
 'char_bigram(Œ±œÅ)': 1,
 'char_bigram(Œ±œÖ)': 1,
 'char_bigram(ŒµŒπ)': 1,
 'char_bigram(Œ∑_)': 3,
 'char_bigram(Œπ_)': 1,
 'char_bigram(ŒπŒΩ)': 1,
 'char_bigram(ŒΩŒ±)': 1,
 'char_bigram(œÄŒ±)': 1,
 'char_bigram(œÅœÑ)': 1,
 'char_bigram(œÉœÄ)': 1,
 'char_bigram(œÑŒ∑)': 2,
 'char_bigram(œÖœÑ)': 1,
 'char_trigram(_Œ±œÖ)': 1,
 'char_trigram(_ŒµŒπ)': 1,
 'char_trigram(_Œ∑_)': 1,
 'char_trigram(_œÉœÄ)': 1,
 'char_trigram(Œ±Œπ_)': 1,
 'char_trigram(Œ±œÅœÑ)': 1,
 'char_trigram(Œ±œÖœÑ)': 1,
 'char_trigram(ŒµŒπŒΩ)': 1,
 'char_trigram(ŒπŒΩŒ±)': 1,
 'char_trigram(ŒΩŒ±Œπ)': 1,
 'char_trigram(œÄŒ±œÅ)': 1,
 'char_trigram(œÅœÑŒ∑)': 1,
 'char_trigram(œÉœÄŒ±)': 1,
 'char_trigram(œÑŒ∑_)': 2,
 'char_trigram(œÖœÑŒ∑)': 1,
 'word(Œ

In [92]:
# from nltk import everygrams

# def sent_process(sent):
#     return [''.join(ng) for ng in everygrams(sent.replace(' ', '_ _'), 1, 4) 
#             if ' ' not in ng and '\n' not in ng and ng != ('_',)]

# sent_process('Œ±œÖœÑŒ∑ ŒµŒπŒΩŒ±Œπ Œ∑ œÉœÄŒ±œÅœÑŒ∑')

## 4. Creating the training and test sets

In [93]:
import random

all_sents_labeled = ([(sentence, 'CG') for sentence in cg_sents_clean] + [(sentence, 'SMG') for sentence in smg_sents_clean])
random.shuffle(all_sents_labeled)
all_sents_labeled[0]

('Œ∫Œ±Œπ Œ¥ŒµŒΩ Œ∏ŒµœÇ ŒΩŒ± œÑŒø œáŒ±œÉŒµŒπœÇ ŒºŒµ œÑŒπœÄŒøœÑŒ±', 'SMG')

In [94]:
NO_ALL_SENTENCES = len(all_sents_labeled)
NO_TRAIN_SENTENCES = round(NO_ALL_SENTENCES * .8)

train_set = all_sents_labeled[:NO_TRAIN_SENTENCES]
test_set = all_sents_labeled[NO_TRAIN_SENTENCES:]

train_set_sents = [sent[0] for sent in train_set]
train_set_labels = [sent[1] for sent in train_set]
test_set_sents = [sent[0] for sent in test_set]
test_set_labels = [sent[1] for sent in test_set]

print(train_set_sents[0], train_set_labels[0])

Œ∫Œ±Œπ Œ¥ŒµŒΩ Œ∏ŒµœÇ ŒΩŒ± œÑŒø œáŒ±œÉŒµŒπœÇ ŒºŒµ œÑŒπœÄŒøœÑŒ± SMG


In [95]:
print('DATASET\t', 'SENTENCES')
print('All\t', NO_ALL_SENTENCES)
print('Training', NO_TRAIN_SENTENCES)
print('Testing\t', NO_ALL_SENTENCES - NO_TRAIN_SENTENCES)

DATASET	 SENTENCES
All	 1605
Training 1284
Testing	 321


## 5. Vectorization

In [96]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(analyzer=get_ngram_features)

train_set_vectors = count_vect.fit_transform(train_set_sents)
test_set_vectors = count_vect.transform(test_set_sents) # Unlike fit_transform(), transform() does not change the count vectorizer's vocabulary so it should be used for the test set.
train_set_vectors

<1284x22024 sparse matrix of type '<class 'numpy.int64'>'
	with 178594 stored elements in Compressed Sparse Row format>

In [97]:
from numpy import set_printoptions, nan
set_printoptions(threshold=nan) # Prints whole array. Required because by default an array with thousands of elements wouldn't be printed in full.

train_set_vectors.toarray()[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [98]:
count_vect.vocabulary_ # The numbers are not counts but indices.

{'word(Œ∫Œ±Œπ)': 7913,
 'word(Œ¥ŒµŒΩ)': 6828,
 'word(Œ∏ŒµœÇ)': 7784,
 'word(ŒΩŒ±)': 8834,
 'word(œÑŒø)': 10265,
 'word(œáŒ±œÉŒµŒπœÇ)': 10639,
 'word(ŒºŒµ)': 8584,
 'word(œÑŒπœÄŒøœÑŒ±)': 10256,
 'word_bigram(Œ∫Œ±Œπ Œ¥ŒµŒΩ)': 14696,
 'word_bigram(Œ¥ŒµŒΩ Œ∏ŒµœÇ)': 12507,
 'word_bigram(Œ∏ŒµœÇ ŒΩŒ±)': 14479,
 'word_bigram(ŒΩŒ± œÑŒø)': 16765,
 'word_bigram(œÑŒø œáŒ±œÉŒµŒπœÇ)': 20953,
 'word_bigram(œáŒ±œÉŒµŒπœÇ ŒºŒµ)': 21827,
 'word_bigram(ŒºŒµ œÑŒπœÄŒøœÑŒ±)': 15973,
 'char(Œ∫)': 50,
 'char(Œ±)': 41,
 'char(Œπ)': 49,
 'char(Œ¥)': 44,
 'char(Œµ)': 45,
 'char(ŒΩ)': 53,
 'char(Œ∏)': 48,
 'char(œÉ)': 58,
 'char(œÑ)': 59,
 'char(Œø)': 55,
 'char(œá)': 62,
 'char(Œº)': 52,
 'char(œÄ)': 56,
 'char_bigram(_Œ∫)': 189,
 'char_bigram(Œ∫Œ±)': 711,
 'char_bigram(Œ±Œπ)': 544,
 'char_bigram(Œπ_)': 681,
 'char_bigram(_Œ¥)': 183,
 'char_bigram(Œ¥Œµ)': 593,
 'char_bigram(ŒµŒΩ)': 618,
 'char_bigram(ŒΩ_)': 767,
 'char_bigram(_Œ∏)': 187,
 'char_bigram(Œ∏Œµ)': 669,
 'char_bigram(ŒµœÉ)': 623,
 'char_bigram(œÉ_)': 8

In [99]:
len(count_vect.vocabulary_) # This is the same as the length of each vector.

22024

## 6. Building the classifiers

In [100]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def show_confusion_matrix(cm):
    print('\t         Predicted')
    print('\t        CG       SMG')
    print('\t     -------- --------')
    print('\tCG  | {:^6} | {:^6}'.format(cm[0][0], cm[1][0]))
    print('Actual\t     -------- --------')
    print('\tSMG | {:^6} | {:^6}'.format(cm[1][0], cm[1][1]))

def show_most_informative_features(vectorizer, clf, n=10):
    print("\t\t    CG\t\t\t\t\t\t    SMG\n")
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%17s\t\t\t%.4f\t%17s" % (coef_1, fn_1, coef_2, fn_2))

### 6.1 Multinomial Naive Bayes classifier

In [101]:
clf_multinomialNB = MultinomialNB() # There are no params for MultinomialDB that prevent overfitting, so any overfitting is caused by the small dataset size.
clf_multinomialNB.fit(train_set_vectors, train_set_labels) 

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [102]:
clf_multinomialNB_predictions = clf_multinomialNB.predict(test_set_vectors)

print('\t\t\tPERFORMANCE\n')
print('Accuracy:', round(accuracy_score(test_set_labels, clf_multinomialNB_predictions), 2), '\n')

print(classification_report(test_set_labels, clf_multinomialNB_predictions))

cmatrix = confusion_matrix(test_set_labels, clf_multinomialNB_predictions)
show_confusion_matrix(cmatrix)

			PERFORMANCE

Accuracy: 0.94 

             precision    recall  f1-score   support

         CG       0.94      0.90      0.92       121
        SMG       0.94      0.96      0.95       200

avg / total       0.94      0.94      0.94       321

	         Predicted
	        CG       SMG
	     -------- --------
	CG  |  109   |   7   
Actual	     -------- --------
	SMG |   7    |  193  


In [103]:
show_most_informative_features(count_vect, clf_multinomialNB, n=20)

		    CG						    SMG

	-11.8585	          char(ŒÑ)			-5.1726	          char(Œ±)
	-11.8585	  char_bigram(eŒº)			-5.1928	          char(Œø)
	-11.8585	  char_bigram(eŒæ)			-5.1954	          char(Œπ)
	-11.8585	  char_bigram(yr)			-5.1954	          char(Œµ)
	-11.8585	  char_bigram(zœâ)			-5.2018	          char(œÑ)
	-11.8585	  char_bigram(ŒÑ_)			-5.2304	          char(œÉ)
	-11.8585	  char_bigram(ŒÑŒø)			-5.2545	          char(ŒΩ)
	-11.8585	  char_bigram(Œ¥Œ∫)			-5.2848	  char_bigram(_œÑ)
	-11.8585	  char_bigram(Œ∂ŒÑ)			-5.3160	          char(œÅ)
	-11.8585	  char_bigram(Œ∑Œ±)			-5.3262	  char_bigram(Œø_)
	-11.8585	  char_bigram(Œ∑Œ≤)			-5.3349	  char_bigram(œÑŒø)
	-11.8585	  char_bigram(Œ∑Œ∂)			-5.3379	          char(Œ∫)
	-11.8585	  char_bigram(Œ∑Œ∑)			-5.3662	          char(œÄ)
	-11.8585	  char_bigram(Œ∑Œø)			-5.3769	          char(œÖ)
	-11.8585	  char_bigram(Œ∏Œ∏)			-5.3815	          char(Œº)
	-11.8585	  char_bigram(Œ∏Œ∫)			-5.3831	          char(Œ∑)
	-11.8585	  char_bigram(ŒπŒÑ)			-5.3892

### 6.2 Linear Support Vector classifier

In [17]:
clf_linearSVC = LinearSVC(max_iter=1500) # n_samples < n_features in training set so the dual param is kept at its default value of True. Default max_iter = 1000
clf_linearSVC.fit(train_set_vectors, train_set_labels) 

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1500,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [18]:
clf_linearSVC_predictions = clf_linearSVC.predict(test_set_vectors)

print('\t\t\tPERFORMANCE\n')
print('Accuracy:', round(accuracy_score(test_set_labels, clf_linearSVC_predictions), 2), '\n')

print(classification_report(test_set_labels, clf_linearSVC_predictions))

cmatrix = confusion_matrix(test_set_labels, clf_linearSVC_predictions)
show_confusion_matrix(cmatrix)

			PERFORMANCE

Accuracy: 0.92 

             precision    recall  f1-score   support

         CG       0.95      0.96      0.96       122
        SMG       0.69      0.65      0.67        17

avg / total       0.92      0.92      0.92       139

	         Predicted
	        CG       SMG
	     -------- --------
	CG  |  117   |   6   
Actual	     -------- --------
	SMG |   6    |   11  


In [19]:
show_most_informative_features(count_vect, clf_linearSVC, n=20)

		    CG						    SMG

	-0.2330	char_trigram(ŒµŒΩ_)			0.1491	  char_bigram(_Œº)
	-0.1910	         word(ŒµŒΩ)			0.1263	char_trigram(_œÉœÑ)
	-0.1718	  char_bigram(_Œµ)			0.1258	  char_bigram(_Œ≤)
	-0.1675	  char_bigram(Œ±Œº)			0.1208	  char_bigram(œÉœÑ)
	-0.1668	  char_bigram(œÉŒπ)			0.1061	  char_bigram(ŒªœÖ)
	-0.1394	  char_bigram(œÑŒ∂)			0.1051	        word(ŒøŒªŒ±)
	-0.1351	char_trigram(_œÑŒ∂)			0.1050	char_trigram(œÉŒ±Œπ)
	-0.1336	          char(Œø)			0.1027	char_trigram(_ŒªŒµ)
	-0.1258	char_trigram(Œ∫Œ±Œº)			0.0978	          char(Œ¥)
	-0.1165	          char(Œµ)			0.0962	char_trigram(ŒπœÉœÑ)
	-0.1148	  char_bigram(œâ_)			0.0936	char_trigram(ŒµœÑŒµ)
	-0.1127	  char_bigram(Œ∏Œ∫)			0.0934	  char_bigram(_Œ±)
	-0.1101	char_trigram(_ŒªŒ±)			0.0934	char_trigram(_Œ≤œÅ)
	-0.1073	char_trigram(_ŒµŒΩ)			0.0908	  char_bigram(ŒπœÉ)
	-0.1033	  char_bigram(ŒµŒΩ)			0.0895	char_trigram(œÑŒµ_)
	-0.0992	char_trigram(_œÑŒµ)			0.0889	char_trigram(ŒπŒΩŒ±)
	-0.0947	          char(œÑ)			0.0883	char_trigram(_

### 6.3 Logistic Regression classifier

In [20]:
clf_logisticRegression = LogisticRegression() # Again, dual = True. Default solver = 'liblinear'. It's recommended for smaller databases. For bigger databases, 'saga' could be used.
clf_logisticRegression.fit(train_set_vectors, train_set_labels) 

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [21]:
clf_logisticRegression_predictions = clf_logisticRegression.predict(test_set_vectors)

print('\t\t\tPERFORMANCE\n')
print('Accuracy:', round(accuracy_score(test_set_labels, clf_logisticRegression_predictions), 2), '\n')

print(classification_report(test_set_labels, clf_logisticRegression_predictions))

cmatrix = confusion_matrix(test_set_labels, clf_logisticRegression_predictions)
show_confusion_matrix(cmatrix)

			PERFORMANCE

Accuracy: 0.94 

             precision    recall  f1-score   support

         CG       0.95      0.98      0.96       122
        SMG       0.79      0.65      0.71        17

avg / total       0.93      0.94      0.93       139

	         Predicted
	        CG       SMG
	     -------- --------
	CG  |  119   |   6   
Actual	     -------- --------
	SMG |   6    |   11  


**It seems that the classification algorithm with the best performance is *Linear Support Vector Machines***.

## 7. Analyzing misclassifications made by the Linear Support Vector classifier

In [22]:
print('MISCLASSIFICATIONS\n')

misclassificationCount = 0

for i, sent in enumerate(test_set_sents):
    if test_set_labels[i] != clf_linearSVC_predictions[i]:
        misclassificationCount += 1
        print(f'{misclassificationCount}.', sent, f'(CORRECT = {test_set_labels[i]},', f'PREDICTED = {clf_linearSVC_predictions[i]})\n')

MISCLASSIFICATIONS

1. Œ¥Œ∑ŒªŒ±Œ¥Œ∑ ŒµœÉŒ±œÇ œÑŒ∂ŒπŒ±Œπ Œ≤ŒπŒªŒªŒø ŒΩŒ± œÉŒ±œÇ œÄœÅŒøœÉœÜŒµœÅŒøœÖŒΩ Œ≥ŒπŒ± ŒªœÖœÉŒ∑ œÄŒ±ŒªŒπ ŒΩŒ±Œπ Œ∏Œ± œÄŒµŒπœÑŒµ Œ±œÑŒµ (CORRECT = CG, PREDICTED = SMG)

2. ŒµŒ∫Œ±œÑœÉŒµ Œø œÖœÄŒøœÖœÅŒ≥ŒøœÇ ŒºŒµ œÑŒµœÉœÉŒµœÅŒøŒπœÇ Œ≤ŒªŒ±Œ∫ŒµœÇ œÄŒøœÖ Œ∫Œ±ŒºŒΩŒøœÖŒΩ œÄœâœÇ Œ∫Œ±œÑŒ±ŒªŒ±Œ≤ŒøœÖŒΩ œÄŒøœÖ Œ∫œÖŒΩŒ∑Œ≥Œ∑ Œ∫Œ±Œπ Œø œÖœÄŒøœÖœÅŒ≥ŒøœÇ ŒµŒ∫Œ±ŒºŒΩŒµ œÄœâœÇ Œ∑ŒæŒµœÅŒµ œÄŒøœÖ Œ∫œÖŒΩŒ∑Œ≥Œ∑ œÄœâœÇ Œ∫Œ±œÑŒ±ŒªŒ±Œ≤ŒµŒπ œÑŒøœÖœÑŒøŒπ ŒøœÖŒªŒøŒπ œÄŒªŒ∑œÅœâŒΩŒøŒΩœÑŒµ ŒºŒµ ŒµŒΩŒ± œÉŒøœÅŒø ŒªŒµœÜœÑŒ± œÄŒøœÖ ŒºœÄŒøœÅŒøœÖŒΩ ŒΩŒ± Œ∂Œ∑œÉŒøœÖŒΩ œÄŒøŒªŒªŒµœÇ ŒøŒπŒ∫ŒøŒ≥ŒµŒΩŒµŒπœÇ œÑŒ∂Œµ Œ∑Œ≤œÅŒ±ŒΩ œÑŒ∑ŒΩ ŒªœÖœÉŒ∑ Œ≥ŒπŒ± ŒΩŒ± œÉœâœÉŒøœÖŒΩ œÑŒ∑ŒΩ Œ∫Œ±œÑŒ±œÉœÑŒ±œÉŒ∑ ŒΩŒ± Œ∫ŒøœàŒøœÖŒºŒµ œÑŒµœÉœÉŒµœÅŒπœÇ ŒµŒæŒøœÅŒºŒ∑œÉŒµŒπœÇ Œ∫Œ±Œπ ŒµŒªœÖŒ∏Œ∑Œ∫Œµ œÑŒø œÄœÅŒøŒ≤ŒªŒ∑ŒºŒ± (CORRECT = CG, PREDICTED = SMG)

3. ŒµŒΩŒ±œÇ Œ¥ŒπŒ∫ŒøœÇ ŒºŒøœÖ Œ∫œÖœÄœÅŒ±ŒπŒøœÇ œÄœÅŒµœÄŒµŒπ ŒΩŒ± œÑŒø Œ¥ŒπŒ±Œ≤Œ±œÉŒµŒπ ŒøœÄœâœÉŒ¥Œ∑œÄŒøœÑŒµ Œ∏Œ± œÑœÅŒµŒªŒ±Œ∏ŒµŒπ (CORRECT = SMG, PREDICTED = CG)

4. Œ±ŒπœÉŒ∏Œ∑œÑŒ∑ Œ∑ Œ≤ŒµŒªœÑŒπ

## 8. Trying the Linear Support Vector classifier with custom input

In [24]:
cgSent = 'Œó ŒöœçœÄœÅŒøœÇ ŒµŒΩ œÄŒøœÖ œÑŒµœÇ œÄŒπŒø œåŒºŒøœÅœÜŒµœÇ œáœéœÅŒµœÇ.'
smgSent = 'Œó ŒöœçœÄœÅŒøœÇ ŒµŒØŒΩŒ±Œπ Œ±œÄœå œÑŒπœÇ œÄŒπŒø œåŒºŒøœÅœÜŒµœÇ œáœéœÅŒµœÇ.'

cgSent = get_clean_sent_el(cgSent)
smgSent = get_clean_sent_el(smgSent)

test_vec = count_vect.transform([cgSent, smgSent])
clf_linearSVC.predict(test_vec)

array(['CG', 'SMG'], dtype='<U3')