# I) General

## 1. Importations

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import codecs
import re
import os.path
import random
import string
import re
import unicodedata
from time import time
from tqdm import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import sklearn.naive_bayes as nb
from sklearnex import patch_sklearn
patch_sklearn() # Should speed up learning somehow 

from wordcloud import WordCloud

import warnings
warnings.filterwarnings("ignore")

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


# II) First data set french presidents

In [2]:
def load_pres(fname):
    alltxts = []
    alllabs = []
    s=codecs.open(fname, 'r','utf-8')
    while True:
        txt = s.readline()
        if(len(txt))<5:
            break
        lab = re.sub(r"<[0-9]*:[0-9]*:(.)>.*","\\1",txt)
        txt = re.sub(r"<[0-9]*:[0-9]*:.>(.*)","\\1",txt)
        if lab.count('M') >0:
            alllabs.append(-1)
        else: 
            alllabs.append(1)
        alltxts.append(txt)
    return alltxts,alllabs

## 2. Features and model selection

In [3]:
def naive_balancing(X, Y):
    label, count = np.unique(Y, return_counts=True)
    idx_pos = np.where(Y == 1, True, False)
    Y_pos = Y[idx_pos]
    Y_pos = Y_pos[:count.min()]
    idx_neg = np.where(Y == -1, True, False)
    Y_neg = Y[idx_neg]
    new_Y = np.concatenate((Y_pos, Y_neg))
    new_X = np.concatenate((X[:count.min()], X[idx_neg]))    
    tmp = list(zip(new_X, new_Y))
    random.shuffle(tmp)
    new_X, new_Y = zip(*tmp)  
    label, count = np.unique(new_Y, return_counts=True)
    print(label, count)
    return new_X, new_Y

def get_all_data_vectorized(X, X_test, Y, Y_test, vectorizer, transformer=None):
    X_vector = vectorizer.fit_transform(X)
    if transformer is not None:
        transformer = transformer.fit(X_vector)
        X_final = transformer.transform(X_vector)
    else:
        X_final = X_vector
    X_test_vector = vectorizer.transform(X_test)
    return X_final, X_test_vector, Y, Y_test

def get_inf_acc(predictions, Y_test):
    idx_inf = np.where(predictions==-1, True, False)
    tmp_pred = predictions[idx_inf]
    tmp_real = Y_test[idx_inf]
    cpt = 0
    for i in range(len(tmp_pred)):
        if tmp_pred[i] == tmp_real[i]:
            cpt += 1
    return np.round((cpt/len(tmp_pred))*100, 2)

def get_sup_acc(predictions, Y_test):
    idx_inf = np.where(predictions==1, True, False)
    tmp_pred = predictions[idx_inf]
    tmp_real = Y_test[idx_inf]
    cpt = 0
    for i in range(len(tmp_pred)):
        if tmp_pred[i] == tmp_real[i]:
            cpt += 1
    return np.round((cpt/len(tmp_pred))*100, 2)

def display_infos(clf, X_test, Y_test):
    # Check le nombre de predictions pour chaque label
    predictions = clf.predict(X_test)
    unique, counts = np.unique(predictions, return_counts=True)
    print("/!\ Prediction counts for label ", unique, " --> ", counts)
    # Check le nombre de predictions pour chaque label
    unique, counts = np.unique(Y_test, return_counts=True)
    print("/!\ Ground truth counts for label ", unique, " --> ", counts)
    # Check la precision du label en inferiorite
    acc = get_inf_acc(predictions, Y_test)
    print("/!\ Accuracy of inferior label :", acc, "%")
    # Check la precision du label en superiorite
    acc = get_sup_acc(predictions, Y_test)
    print("/!\ Accuracy of superior label :", acc, "%\n")
    
def display_model_scores(model, X_test, Y_test, search=True):
    if search:
        best_parameters = model.best_estimator_.get_params()
        for param_name in sorted(best_parameters.keys()):
            print("--->%s: %r" % (param_name, best_parameters[param_name]))
        # print(model.best_score_)
    grid_predictions = model.predict(X_test)
    print(confusion_matrix(Y_test, grid_predictions))
    print(classification_report(Y_test, grid_predictions))
    
def stemm(X):
    stemmer = SnowballStemmer("french")
    stem = stemmer.stem(X)
    return X

### --> Loading and splitting data

In [87]:
fname = "./data/corpus.tache1.learn.utf8"
alltxts, alllabs = load_pres(fname)
X = np.array(alltxts)
Y = np.array(alllabs)
# new_X, new_Y = naive_balancing(X,Y) # not a good idea in this project
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, shuffle=False)
train_size = len(X_train)
test_size = len(X_test)
print(train_size)
print(test_size)

34447
22966


### --> Stemming

In [88]:
# print(len(X_train))
# stemmer = snowballstemmer.stemmer('french');
# for i in range(len(X_train)):
#     stemmer.stemWords(X_train[i].split()[j] for j in range(len(X_train[i].split())))
# print(np.array(X_train).shape)
# for i in range(len(X_test)):
#     stemmer.stemWords(X_test[i].split()[j] for j in range(len(X_test[i].split())))

### --> Feature selection depending the model selected

In [89]:
search = False
sw = stopwords.words('french')

if search :
    
    pipeline_svc = Pipeline(
        [
            ("vect", TfidfVectorizer(max_features=100_000)), # avoids overfit on train set (having more features than individuals is bad most of the time)
            ("clf", LinearSVC(class_weight="balanced",       # balanced class weight parameter is mandatory in our case
                              max_iter=1000,                 # also avoids overfit
                              C=100))                        # regularization so the model doesn't predict only one class  
        ]
    )
    # pipeline_nb = Pipeline(
    #     [
    #         ("vect", TfidfVectorizer(max_features=80_000)),
    #         ("clf", MultinomialNB())      
    #     ]
    # )
    # pipeline_lr = Pipeline(
    #     [
    #         ("vect", TfidfVectorizer(max_features=50_000)),
    #         ("clf", LogisticRegression(class_weight="balanced",
    #                                    max_iter=1000,
    #                                    C=100))        
    #     ]
    # )
    parameters_vectorizer = {
        "vect__lowercase": (False,),
        "vect__stop_words": (None,),
        "vect__strip_accents": ("ascii",),
        "vect__use_idf": (True,),
        "vect__smooth_idf": (False,),
        "vect__sublinear_tf": (True,),
        
        "vect__min_df": (1,),
        "vect__max_df": np.arange(0.4,0.6,0.05),
        "vect__ngram_range": [(1, 2),],
        }
    
    strat_kfold = StratifiedKFold(n_splits=3, shuffle=True)

    grid_search_parameters_svc = GridSearchCV(pipeline_svc, 
                                              parameters_vectorizer, 
                                              scoring="f1_micro", # test "rog_auc", "f1_micro"
                                              n_jobs=8, 
                                              verbose=3, 
                                              cv=strat_kfold,
                                              refit=True
                                              )
    # grid_search_parameters_nb = GridSearchCV(pipeline_nb, 
    #                                           parameters_vectorizer, 
    #                                           scoring="f1",
    #                                           n_jobs=8, 
    #                                           verbose=3, 
    #                                           cv=strat_kfold,
    #                                           refit=True
    #                                           )
    # grid_search_parameters_lr = GridSearchCV(pipeline_lr, 
    #                                           parameters_vectorizer, 
    #                                           scoring="f1", 
    #                                           n_jobs=8, 
    #                                           verbose=3, 
    #                                           cv=strat_kfold,
    #                                           refit=True
    #                                           )
    
    print("============================================= SVC + VECTORIZER =============================================")
    t0 = time()
    grid_search_parameters_svc.fit(X_train, Y_train)
    print("done in %0.3fs" % (time() - t0))
    display_model_scores(grid_search_parameters_svc, X_train, Y_train) # check overfit X_train
    display_model_scores(grid_search_parameters_svc, X_test, Y_test)
    
#     print("============================================= NB + VECTORIZER =============================================")
#     t0 = time()
#     grid_search_parameters_nb.fit(X_train, Y_train)
#     print("done in %0.3fs" % (time() - t0))
#     display_model_scores(grid_search_parameters_nb, X_train, Y_train)
#     display_model_scores(grid_search_parameters_nb, X_test, Y_test) 
    
    # print("============================================= LR + VECTORIZER =============================================")
    # t0 = time()
    # grid_search_parameters_lr.fit(X_train, Y_train)
    # print("done in %0.3fs" % (time() - t0))
    # display_model_scores(grid_search_parameters_lr, X_train, Y_train)
    # display_model_scores(grid_search_parameters_lr, X_test, Y_test) 

In [90]:
# display_model_scores(grid_search_parameters_svc, X_test, Y_test)
# display_model_scores(grid_search_parameters_nb, X_test, Y_test)
# display_model_scores(grid_search_parameters_lr, X_test, Y_test)

### --> Building optimal vectorizer for each model

In [91]:
vectorizer_svc = TfidfVectorizer(lowercase=False,       # Almost false everytime in our case
                                 stop_words=None,       # Never remove stopwords
                                 strip_accents="ascii", # Sometimes none sometimes ascii, doesn't really matter
                                 use_idf=True,          # Always true
                                 smooth_idf=False,      # Sometimes true sometimes false, doesn't really matter
                                 sublinear_tf=True,     # Sometimes true sometimes false, doesn't really matter
                             
                                 max_features=100_000,  # Avoids overfit and reduce noise
                                 min_df=1,              # Always 1, meaning we don't remove anything
                                 max_df=0.55,           # One of the most meaningful parameters, can be 0.05, 0.9 ...
                                 ngram_range=(1,2),     # Always unigram + bigram
                                 )

vectorizer_nb = TfidfVectorizer(lowercase=False,
                                stop_words=None,
                                strip_accents=None,
                                use_idf=True,
                                smooth_idf=True,
                                sublinear_tf=False,
                             
                                max_features=100_000,
                                min_df=15,
                                max_df=0.05,
                                ngram_range=(1,2),
                                )

vectorizer_lr = TfidfVectorizer(lowercase=False,
                                stop_words=None,
                                strip_accents="ascii",
                                use_idf=True,
                                smooth_idf=False,
                                sublinear_tf=False,
                             
                                max_features=100_000,
                                min_df=1,
                                max_df=0.5,
                                ngram_range=(1,2),
                                )

# Remove comment for stemming
# print(len(X_train))
# stemmer = snowballstemmer.stemmer('french');
# for i in range(len(X_train)):
#     stemmer.stemWords(X_train[i].split()[j] for j in range(len(X_train[i].split())))
# print(np.array(X_train).shape)

X_train_vector_svc, X_test_vector_svc, Y_train_svc, Y_test_svc = get_all_data_vectorized(X_train, X_test, Y_train, Y_test, vectorizer_svc)
print(X_train_vector_svc.shape)
X_train_vector_nb, X_test_vector_nb, Y_train_nb, Y_test_nb = get_all_data_vectorized(X_train, X_test, Y_train, Y_test, vectorizer_nb)
print(X_train_vector_nb.shape)
X_train_vector_lr, X_test_vector_lr, Y_train_lr, Y_test_lr = get_all_data_vectorized(X_train, X_test, Y_train, Y_test, vectorizer_lr)
print(X_train_vector_lr.shape)

(34447, 100000)
(34447, 9927)
(34447, 100000)


### --> Model selection SVC

In [92]:
search = False
if search:
    parameters = {
        'C': np.arange(3.2,4,0.05)  
    }
    
    strat_kfold = StratifiedKFold(n_splits=5, shuffle=True)

    optimal_svc = GridSearchCV(LinearSVC(class_weight="balanced",
                                         max_iter=2000),
                               parameters, 
                               scoring="f1",
                               n_jobs=8, 
                               verbose=3, 
                               cv=strat_kfold,
                               refit=True
                               )

    t0 = time()
    optimal_svc.fit(X_train_vector_svc, Y_train_svc)
    print("done in %0.3fs" % (time() - t0))
    
    display_model_scores(optimal_svc, X_train_vector_svc, Y_train_svc)
    display_model_scores(optimal_svc, X_test_vector_svc, Y_test_svc)

### --> Model selection NB

In [93]:
search = False
if search:
    parameters = {
        'alpha': np.arange(0, 1, 0.05),
    }
    
    strat_kfold = StratifiedKFold(n_splits=5, shuffle=True)

    optimal_nb = GridSearchCV(MultinomialNB(),
                              parameters, 
                              scoring="f1",
                              n_jobs=8, 
                              verbose=3, 
                              cv=strat_kfold,
                              refit=True
                              )

    t0 = time()
    optimal_nb.fit(X_train_vector_nb, Y_train_nb)
    print("done in %0.3fs" % (time() - t0))
    
    display_model_scores(optimal_nb, X_train_vector_nb, Y_train_nb)
    display_model_scores(optimal_nb, X_test_vector_nb, Y_test_nb)

### --> Model selection LR

In [94]:
search = False
if search:
    parameters = {
        'C': (40,44,46) 
    }
    
    strat_kfold = StratifiedKFold(n_splits=5, shuffle=True)

    optimal_lr = GridSearchCV(LogisticRegression(class_weight="balanced",
                                                      max_iter=2000),
                               parameters, 
                               scoring="f1",
                               n_jobs=8, 
                               verbose=3, 
                               cv=strat_kfold,
                               refit=True
                               )

    t0 = time()
    optimal_lr.fit(X_train_vector_lr, Y_train_lr)
    print("done in %0.3fs" % (time() - t0))
    
    display_model_scores(optimal_lr, X_train_vector_lr, Y_train_lr)
    display_model_scores(optimal_lr, X_test_vector_lr, Y_test_lr)

### --> Test 3 optimal models without max_iter

In [95]:
# final_svc = LinearSVC(class_weight="balanced",
#                       max_iter=10_000,
#                       C=1.15)

final_svc = LinearSVC(class_weight="balanced",
                      max_iter=10_000,
                      C=3.95)

t0 = time()
final_svc.fit(X_train_vector_svc, Y_train_svc)
print("done in %0.3fs" % (time() - t0))
    
display_model_scores(final_svc, X_train_vector_svc, Y_train_svc, search=False)
display_model_scores(final_svc, X_test_vector_svc, Y_test_svc, search=False)

done in 1.165s
[[ 5264     2]
 [    8 29173]]
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00      5266
           1       1.00      1.00      1.00     29181

    accuracy                           1.00     34447
   macro avg       1.00      1.00      1.00     34447
weighted avg       1.00      1.00      1.00     34447

[[ 1325   932]
 [ 1328 19381]]
              precision    recall  f1-score   support

          -1       0.50      0.59      0.54      2257
           1       0.95      0.94      0.94     20709

    accuracy                           0.90     22966
   macro avg       0.73      0.76      0.74     22966
weighted avg       0.91      0.90      0.91     22966



In [96]:
final_nb = MultinomialNB(alpha=0.1)

t0 = time()
final_nb.fit(X_train_vector_nb, Y_train_nb)
print("done in %0.3fs" % (time() - t0))
    
display_model_scores(final_nb, X_train_vector_nb, Y_train_nb, search=False)
display_model_scores(final_nb, X_test_vector_nb, Y_test_nb, search=False)

done in 0.008s
[[ 2436  2830]
 [  382 28799]]
              precision    recall  f1-score   support

          -1       0.86      0.46      0.60      5266
           1       0.91      0.99      0.95     29181

    accuracy                           0.91     34447
   macro avg       0.89      0.72      0.77     34447
weighted avg       0.90      0.91      0.89     34447

[[  764  1493]
 [  414 20295]]
              precision    recall  f1-score   support

          -1       0.65      0.34      0.44      2257
           1       0.93      0.98      0.96     20709

    accuracy                           0.92     22966
   macro avg       0.79      0.66      0.70     22966
weighted avg       0.90      0.92      0.90     22966



In [97]:
final_lr = LogisticRegression(class_weight="balanced",
                              max_iter=10_000,
                              C=44)

t0 = time()
final_lr.fit(X_train_vector_lr, Y_train_lr)
print("done in %0.3fs" % (time() - t0))
    
display_model_scores(final_lr, X_train_vector_lr, Y_train_lr, search=False)
display_model_scores(final_lr, X_test_vector_lr, Y_test_lr, search=False)

done in 4.627s
[[ 5264     2]
 [   12 29169]]
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00      5266
           1       1.00      1.00      1.00     29181

    accuracy                           1.00     34447
   macro avg       1.00      1.00      1.00     34447
weighted avg       1.00      1.00      1.00     34447

[[ 1408   849]
 [ 1476 19233]]
              precision    recall  f1-score   support

          -1       0.49      0.62      0.55      2257
           1       0.96      0.93      0.94     20709

    accuracy                           0.90     22966
   macro avg       0.72      0.78      0.75     22966
weighted avg       0.91      0.90      0.90     22966



### --> Testing voting classifier test = False

In [98]:
test = False

In [99]:
svc_for_vote = SVC(kernel='linear', # VotingClassifier doesn't work with linearSVC() module if voting="soft"
                   probability=True,
                   class_weight="balanced",
                   max_iter=10_000,
                   C=3.95,
                   verbose=3)

voting_clf_soft = VotingClassifier(estimators=[
                                            ('lr', final_lr), 
                                            # ('nb', final_nb),
                                            ('svc', svc_for_vote)
                                            ],
                                  voting='soft',
                                  n_jobs=1,
                                  verbose=3)

if test:
    t0 = time()
    voting_clf_soft.fit(X_train_vector_svc, Y_train_svc)
    print("done in %0.3fs" % (time() - t0))

    display_model_scores(voting_clf_soft, X_train_vector_svc, Y_train_svc, search=False)
    display_model_scores(voting_clf_soft, X_test_vector_svc, Y_test_svc, search=False)

In [100]:
voting_clf_hard = VotingClassifier(estimators=[
                                            ('lr', final_lr), 
                                            # ('nb', final_nb),
                                            ('svc', final_svc)
                                            ],
                                  voting='hard',
                                  n_jobs=1)

if test:
    t0 = time()
    voting_clf_hard.fit(X_train_vector_svc, Y_train_svc)
    print("done in %0.3fs" % (time() - t0))

    display_model_scores(voting_clf_hard, X_train_vector_svc, Y_train_svc, search=False)
    display_model_scores(voting_clf_hard, X_test_vector_svc, Y_test_svc, search=False)

## 3. Final training on whole train dataset

In [101]:
if test:
    fname = "./data/corpus.tache1.learn.utf8"
    alltxts_train_final, alllabs_train_final = load_pres(fname)
    X_train_final = np.array(alltxts_train_final)
    Y_train_final = np.array(alllabs_train_final)

In [102]:
if test:
    X_final_vector_svc = vectorizer_svc.fit_transform(X_train_final)
    X_final_vector_nb = vectorizer_nb.fit_transform(X_train_final)
    X_final_vector_lr = vectorizer_lr.fit_transform(X_train_final)

    t0 = time()
    final_svc.fit(X_final_vector_svc,  Y_train_final)
    print(" done in %0.3fs" % (time() - t0))

    t0 = time()
    final_nb.fit(X_final_vector_nb,  Y_train_final)
    print(" done in %0.3fs" % (time() - t0))

    t0 = time()
    final_lr.fit(X_final_vector_lr,  Y_train_final)
    print(" done in %0.3fs" % (time() - t0))

    t0 = time()
    voting_clf_soft.fit(X_final_vector_svc,  Y_train_final)
    print(" done in %0.3fs" % (time() - t0))

    t0 = time()
    voting_clf_hard.fit(X_final_vector_svc,  Y_train_final)
    print(" done in %0.3fs" % (time() - t0))

## 4. Computing final test predicitons

In [103]:
test = False

In [104]:
if test:
    fname ="./data/corpus.tache1.test.utf8"
    alltxts_test_final, _ = load_pres(fname)
    X_test_final = np.array(alltxts_test_final)

In [105]:
if test:
    X_test_vector_svc = vectorizer_svc.transform(X_test_final)
    final_pred_svc = final_svc.predict(X_test_vector_svc)

    X_test_vector_nb = vectorizer_nb.transform(X_test_final)
    final_pred_nb = final_nb.predict(X_test_vector_nb)

    X_test_vector_lr = vectorizer_lr.transform(X_test_final)
    final_pred_lr = final_lr.predict(X_test_vector_lr)

    X_test_vector_soft = vectorizer_svc.transform(X_test_final)
    final_pred_soft = voting_clf_soft.predict(X_test_vector_soft)

    X_test_vector_hard = vectorizer_svc.transform(X_test_final)
    final_pred_hard = voting_clf_hard.predict(X_test_vector_hard)

In [106]:
if test:
    print(final_pred_svc.shape)
    print(final_pred_nb.shape)
    print(final_pred_lr.shape)
    print(final_pred_soft.shape)
    print(final_pred_hard.shape)

## 5. Post processing

### --> Train our best model with our best vectorizer and check our f1 score before postprocessing

In [107]:
vectorizer_svc = TfidfVectorizer(lowercase=False,
                                 stop_words=None,
                                 strip_accents="ascii",
                                 use_idf=True,
                                 smooth_idf=False,
                                 sublinear_tf=True,
                             
                                 max_features=100_000,
                                 min_df=1,
                                 max_df=0.55,
                                 ngram_range=(1,2),
                                 )

X_train_vector_svc, X_test_vector_svc, Y_train_svc, Y_test_svc = get_all_data_vectorized(X_train, X_test, Y_train, Y_test, vectorizer_svc)
print(X_train_vector_svc.shape)
X_train_vector_lr, X_test_vector_lr, Y_train_lr, Y_test_lr = get_all_data_vectorized(X_train, X_test, Y_train, Y_test, vectorizer_lr)
print(X_train_vector_lr.shape)

final_svc = LinearSVC(class_weight="balanced",
                      max_iter=10_000,
                      C=3.95)

final_lr = LogisticRegression(class_weight="balanced",
                              max_iter=10_000,
                              C=44)

voting_clf_soft = VotingClassifier(estimators=[
                                            ('lr', final_lr), 
                                            # ('nb', final_nb),
                                            ('svc', svc_for_vote)
                                            ],
                                  voting='soft',
                                  n_jobs=1,
                                  verbose=3)

t0 = time()
final_svc.fit(X_train_vector_svc, Y_train_svc)
print("done in %0.3fs" % (time() - t0))

t0 = time()
final_lr.fit(X_train_vector_lr, Y_train_lr)
print("done in %0.3fs" % (time() - t0))

# t0 = time()
# voting_clf_soft.fit(X_train_vector_svc, Y_train_svc)
# print("done in %0.3fs" % (time() - t0))
    
display_model_scores(final_svc, X_test_vector_svc, Y_test_svc, search=False)
display_model_scores(final_lr, X_test_vector_lr, Y_test_lr, search=False)
# display_model_scores(voting_clf_soft, X_test_vector_svc, Y_test_svc, search=False)

(34447, 100000)
(34447, 100000)
done in 1.285s
done in 4.525s
[[ 1325   932]
 [ 1328 19381]]
              precision    recall  f1-score   support

          -1       0.50      0.59      0.54      2257
           1       0.95      0.94      0.94     20709

    accuracy                           0.90     22966
   macro avg       0.73      0.76      0.74     22966
weighted avg       0.91      0.90      0.91     22966

[[ 1408   849]
 [ 1476 19233]]
              precision    recall  f1-score   support

          -1       0.49      0.62      0.55      2257
           1       0.96      0.93      0.94     20709

    accuracy                           0.90     22966
   macro avg       0.72      0.78      0.75     22966
weighted avg       0.91      0.90      0.90     22966



### --> Preprocessing optimization functions

In [108]:
def postprocessing(pred, window_size, Y_test, threshold):
    uni, cpt = np.unique(pred, return_counts=True)
    prob_mit = cpt[0]/len(pred)
    prob_chi = cpt[1]/len(pred)
    new_final_pred = pred.copy()
    # Hard smoothing until it converges
    while True: 
        new_final_pred_tmp = hard_smoothing(pred, new_final_pred, window_size, threshold)
        if np.array_equiv(new_final_pred_tmp, new_final_pred):
            break
    new_final_pred = new_final_pred_tmp
    # Making sure to delete all solo values
    while True: 
        new_final_pred_tmp = soft_smoothing(new_final_pred)
        if np.array_equiv(new_final_pred_tmp, new_final_pred):
            break
    # Re balance minority class
    # new_final_pred = cheat_smoothing(new_final_pred_tmp, add)
    # return new_final_pred, float(classification_report(new_final_pred, Y_test)[90:100]) # Trick to get the minority class f1 score
    return new_final_pred, f1_score(new_final_pred, Y_test, average="binary")

# Smoothing values with a rolling window
def hard_smoothing(pred, new_final_pred, window_size, threshold):
    for i in range(window_size, len(new_final_pred) - window_size):
        neighbors = pred[int(i-window_size/2):int(i+window_size/2)] # Here we need to use pred and now new_final_pred_svc !!!
        unique, counts = np.unique(neighbors, return_counts=True)

        if len(unique) == 1:
            if unique[0] == 1:
                unique = np.append(unique,-1)
                counts = np.append(counts,0)
            else:
                unique = np.append(unique,1)
                counts = np.append(counts,0)
                
        # Probability smoothing learning
        if counts[0]/(counts[0]+counts[1]) > threshold: # This one seems useless after testing results
            new_final_pred[i] = int(unique[0])
        else:
            new_final_pred[i] = int(unique[1])
        if counts[1]/(counts[0]+counts[1]) > threshold: # Learning a threshold there instead of using prob_chi might be an error
            new_final_pred[i] = int(unique[1])
        else:
            new_final_pred[i] = int(unique[0])
        
    return new_final_pred

# Smoothing solo values
def soft_smoothing(pred):
    for i in range(1, len(pred)-1):
        if (pred[i-1] == -1 and pred[i+1] == -1):
            pred[i] = -1
        elif (pred[i-1] == 1 and pred[i+1] == 1):
            pred[i] = 1
    return pred

# Smoothing tri values
def soft_smoothing_3(pred):
    for i in range(3, len(pred)-3):
        if (pred[i-3:i-2] and pred[i+2:i+3] == -1):
            pred[i-1:i+1] = -1
        elif (pred[i-3:i-2] == 1 and pred[i+2:i+3] == 1):
            pred[i-1:i+1] = 1
    return pred

# Rebalance our minority class
# Seems like this one is useless because the hard_smoothing function does kind of the same job
def cheat_smoothing(pred, add):
    for _ in range(add):
        for i in range(add, len(pred)-add):
            if pred[i] == 1 and pred[i+1] == -1:
                pred[i] = -1
            elif pred[i] == -1 and pred[i+1] == 1:
                pred[i+1] = -1
    return pred

def optimize_postprocessing(preds, Y_test):
    maxi = 0
    for window_size in tqdm(range(8,13,1)):
        for threshold in np.arange(0,1,0.1):
            new_preds, score = postprocessing(preds, window_size, Y_test, threshold)
            if score > maxi:
                print("Update score :", score, "for ws:", window_size, "and threshold:", threshold)
                maxi = score
                best_ws = window_size
                best_thresh = threshold
                best_preds = new_preds
    return maxi, best_ws, best_thresh, best_preds

def final_postprocessing_test(pred, window_size, threshold):
    uni, cpt = np.unique(pred, return_counts=True)
    prob_mit = cpt[0]/len(pred)
    prob_chi = cpt[1]/len(pred)
    new_final_pred = pred.copy()
    new_final_pred = soft_smoothing(new_final_pred)
    new_final_pred = hard_smoothing(pred, new_final_pred, window_size, threshold)
    new_final_pred = soft_smoothing(new_final_pred)
    return new_final_pred

def final_postprocessing(pred, window_size, threshold):
    new_final_pred = pred.copy()
    # new_final_pred = soft_smoothing(new_final_pred) # No idea if it's a good idea
    # Hard smoothing until it converges
    while True: 
        new_final_pred_tmp = hard_smoothing(pred, new_final_pred, window_size, threshold)
        if np.array_equiv(new_final_pred_tmp, new_final_pred):
            break
    new_final_pred = new_final_pred_tmp
    # Making sure to delete all solo values
    while True: 
        new_final_pred_tmp = soft_smoothing(new_final_pred)
        if np.array_equiv(new_final_pred_tmp, new_final_pred):
            break
    new_final_pred = new_final_pred_tmp
    return new_final_pred

## 6. Computing final test predictions after postprocessing

### --> Find our best model for the smoothing

In [None]:
# Make crossvalidation to get better parameter estimation
# The window should slide on both side and compare the values
# Test before hard smoothing :
# while True: 
#         new_final_pred_tmp = soft_smoothing(new_final_pred)
#         if np.array_equiv(new_final_pred_tmp, new_final_pred):
#             break
#     new_final_pred = new_final_pred_tmp

In [109]:
X_test_vector_final = X_test_vector_svc
Y_test_final = Y_test_svc

maxi_svc, best_ws_svc, best_thresh_svc, best_preds_svc = optimize_postprocessing(final_svc.predict(X_test_vector_final), Y_test_final)

# print("===========================================================================================")

# maxi_soft, best_ws_soft, best_thresh_soft, best_preds_soft = optimize_postprocessing(voting_clf_soft.predict(X_test_vector_final), Y_test_final) 

  0%|          | 0/5 [00:00<?, ?it/s]

Update score : 0.9492574257425742 for ws: 8 and threshold: 0.0
Update score : 0.9531239212978944 for ws: 8 and threshold: 0.2
Update score : 0.9609153785452352 for ws: 8 and threshold: 0.30000000000000004
Update score : 0.9703969763128859 for ws: 8 and threshold: 0.4
Update score : 0.9762970014278914 for ws: 8 and threshold: 0.5


 60%|██████    | 3/5 [00:17<00:11,  5.80s/it]

Update score : 0.9768339768339769 for ws: 11 and threshold: 0.6000000000000001


 80%|████████  | 4/5 [00:23<00:05,  5.70s/it]

Update score : 0.9776117616071642 for ws: 12 and threshold: 0.6000000000000001


100%|██████████| 5/5 [00:28<00:00,  5.75s/it]


### --> Displaying our f1 score after postprocessing

In [27]:
print("best parameters:", maxi_svc, best_ws_svc, best_thresh_svc, best_preds_svc)
print("------------------------------before------------------------------")
print(classification_report(final_svc.predict(X_test_vector_final), Y_test_final))
print("------------------------------after------------------------------")
print(classification_report(best_preds_svc, Y_test_final))

# print("===========================================================================================")

# print("best parameters:", maxi_soft, best_ws_soft, best_thresh_soft, best_preds_soft)
# print("------------------------------before------------------------------")
# print(classification_report(voting_clf_soft.predict(X_test_vector_final), Y_test_final))
# print("------------------------------after------------------------------")
# print(classification_report(best_preds_soft, Y_test_final))

best parameters: 0.976384755687818 12 0.6000000000000001 [1 1 1 ... 1 1 1]
------------------------------before------------------------------
              precision    recall  f1-score   support

          -1       0.59      0.51      0.55      1364
           1       0.93      0.95      0.94     10119

    accuracy                           0.90     11483
   macro avg       0.76      0.73      0.75     11483
weighted avg       0.89      0.90      0.90     11483

------------------------------after------------------------------
              precision    recall  f1-score   support

          -1       0.71      0.85      0.77       969
           1       0.99      0.97      0.98     10514

    accuracy                           0.96     11483
   macro avg       0.85      0.91      0.87     11483
weighted avg       0.96      0.96      0.96     11483

best parameters: 0.9761402836335761 12 0.6000000000000001 [1 1 1 ... 1 1 1]
------------------------------before--------------------------

In [28]:
# with svc : 0.77, 0.98, 0.96 on test
# with soft : 0.75, 0.98, 0.96 on test

### --> Results after postprocessing (svc exemple)

In [29]:
print("------------------------------before------------------------------")
print(final_svc.predict(X_test_vector_final)[:500])
print("------------------------------after------------------------------")
print(final_postprocessing(final_svc.predict(X_test_vector_final)[:500], best_ws_svc, best_thresh_svc))

------------------------------before------------------------------
[ 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 -1  1
  1  1  1  1 -1  1  1  1 -1  1  1  1  1  1  1  1 -1  1  1  1  1  1 -1  1
  1  1  1  1  1  1  1  1 -1  1 -1  1  1 -1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1 -1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1 -1  1  1 -1  1  1 -1  1 -1  1  1 -1  1
 -1  1 -1  1 -1  1 -1  1 -1 -1 -1 -1  1  1  1 -1 -1 -1  1 -1  1 -1  1  1
  1  1  1  1  1 -1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1 -1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 -1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 -1  1  1  1
  1 -1  1  1  1  1  1  1 -1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1 -1  1  1  1  1  1  1  1

### --> Training on whole dataset now that we learnt the best postprocessing parameters

In [30]:
fname = "./data/corpus.tache1.learn.utf8"
alltxts_train_final, alllabs_train_final = load_pres(fname)
X_train_final = np.array(alltxts_train_final)
Y_train_final = np.array(alllabs_train_final)

In [31]:
X_final_vector_svc = vectorizer_svc.fit_transform(X_train_final)

t0 = time()
final_svc.fit(X_final_vector_svc,  Y_train_final)
print(" done in %0.3fs" % (time() - t0))

# t0 = time()
# voting_clf_soft.fit(X_final_vector_svc,  Y_train_final)
# print(" done in %0.3fs" % (time() - t0))

 done in 1.992s
[Voting] ....................... (1 of 2) Processing lr, total=   7.5s
[LibSVM][Voting] ...................... (2 of 2) Processing svc, total=25.9min
 done in 1562.248s


### --> Computing final predictions

In [32]:
fname = "./data/corpus.tache1.test.utf8"
alltxts_test_final, _ = load_pres(fname)
X_test_final = np.array(alltxts_test_final)

In [33]:
X_test_vector_svc = vectorizer_svc.transform(X_test_final)
final_pred_svc = final_svc.predict(X_test_vector_svc)

# X_test_vector_svc = vectorizer_svc.transform(X_test_final)
# final_pred_soft = voting_clf_soft.predict(X_test_vector_svc)

In [34]:
new_final_pred_svc = final_postprocessing(final_pred_svc, best_ws_svc, best_thresh_svc)

# new_final_pred_soft = final_postprocessing(final_pred_soft, best_ws_soft, best_thresh_soft)

In [35]:
print(new_final_pred_svc.shape)

# print(new_final_pred_soft.shape)

(27162,)
(27162,)


In [42]:
print("BEFORE:")
print(final_pred_svc[500:1000])
print("AFTER:")
print(new_final_pred_svc[500:1000])

BEFORE:
[ 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 -1 -1  1 -1  1  1
 -1 -1 -1  1  1 -1  1 -1 -1  1  1 -1  1 -1 -1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1 -1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1 -1 -1  1  1  1  1 -1  1  1 -1  1  1  1 -1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1 -1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1 -1  1  1  1  1  1  1  1  1  1  1  1  1 -1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1 -1 -1  1  1  1  1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 -1  1  1  1  1  1  1
  1  1  1  1  1  1  1 -1  1  1  1  1  1  1  1  1 -1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1 

In [37]:
# print(final_pred_soft[:500])
# print(new_final_pred_soft[:500])

[ 1  1  1 -1 -1 -1 -1  1  1 -1  1  1 -1  1 -1  1 -1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1 -1 -1 -1  1  1  1 -1 -1  1 -1  1
  1  1  1  1  1 -1  1  1 -1  1  1 -1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1 -1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1 -1  1  1  1  1 -1  1  1  1 -1  1 -1  1 -1 -1  1 -1 -1 -1 -1 -1 -1
  1 -1  1 -1 -1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 -1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1 -1  1 -1 -1 -1  1  1  1  1 -1 -1
  1 -1 -1 -1 -1 -1 -1 -1  1 -1  1 -1  1  1  1  1  1  1  1 -1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1 -1  1  1  1  1  1  1

In [40]:
save=False
if save :
    f = open("../TME1/output/president_pred_svc.txt", "w")
    preds = ""
    for i in range(len(new_final_pred_svc)):
        preds += str(new_final_pred_svc[i]) + "\n"
    f.write(preds)
    f.close()

    f = open("../TME1/output/president_pred_soft.txt", "w")
    preds = ""
    for i in range(len(new_final_pred_soft)):
        preds += str(new_final_pred_soft[i]) + "\n"
    f.write(preds)
    f.close()
    
    f = open("../TME1/output/president_pred_soft_3.txt", "w")
    preds = ""
    for i in range(len(new_final_pred_soft_)):
        preds += str(new_final_pred_soft_[i]) + "\n"
    f.write(preds)
    f.close()

## 7. Results

In [41]:
# BEST PERFORMANCE SVC : 79% ACC ON TEST F1_MIT 
# BEST PERFORMANCE SOFT : 78% ACC ON TEST F1_MIT