In [None]:
import sys
sys.path.append('../src')
from data import get_documents_list, get_cleaned_dataframe_with_topics, get_data_and_labels_lists, get_sbert_models_path, get_languages
from models import classification
from sentence_transformers import SentenceTransformer
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import pickle

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Choose Parameters
- `text_type`: one of "cleaned_texts", "cleaned_sentences", "cleaned_paragraphs"
    -> the kind of text the classification is applied to
- `model`: name of the pretrained Sentence-BERT model used to embed the texts

e.g. text_type = "cleaned_paragraphs", model = "distiluse-base-multilingual-cased-v1"

In [None]:
# Text type passed to get_data_and_labels_lists(); it is also part of the embedding cache file name.
# NOTE(review): the markdown cell above lists "cleaned_texts", "cleaned_sentences" and
# "cleaned_paragraphs" as options, but "sentences" is used here -- confirm which values are accepted.
text_type = "sentences"
# Name of the pretrained model handed to SentenceTransformer(); also part of the cache file name.
model = "distiluse-base-multilingual-cased-v1"

Available multilingual pretrained models: https://www.sbert.net/docs/pretrained_models.html#multi-lingual-models

In [None]:
# Load the texts and their labels for the chosen text type.
# NOTE(review): the embedding cell further down calls get_data_and_labels_lists(text_type) again,
# so this load looks redundant -- confirm before removing either call.
docs, targets = get_data_and_labels_lists(text_type)

In [None]:
# The cache file name encodes the languages, the text type and the model name,
# so every combination of these settings gets its own embedding cache.
languages_string = "_".join(get_languages())
cache_file_name = f'st_{languages_string}_{text_type}_{model}.pkl'
model_path = get_sbert_models_path().joinpath(cache_file_name)

In [None]:
# Build `df` with one row per text: the text ('data'), its embedding
# ('data_embeddings') and its label ('label'). Embeddings are read from the
# pickle cache at `model_path` when a usable cache exists, otherwise they are
# computed with SentenceTransformer and written to the cache.
docs, targets = get_data_and_labels_lists(text_type)

# zip() further down silently drops trailing items, so fail loudly on a mismatch.
if len(docs) != len(targets):
    raise ValueError(
        f"Got {len(docs)} texts but {len(targets)} labels for text_type={text_type!r}"
    )

try:
    # Load sentences & embeddings from disk.
    # NOTE: pickle.load can execute arbitrary code -- only load cache files
    # that were written by this notebook.
    with open(model_path, "rb") as fIn:
        stored_data = pickle.load(fIn)
except FileNotFoundError:
    # Cache miss: the embeddings are computed below.
    stored_data = None

embeddings = None
if stored_data is not None:
    data = stored_data['data']
    cached_embeddings = stored_data['data_embeddings']
    # A cache written for a different data set would be paired with the wrong
    # texts/labels (or truncated by zip), so only accept it if the sizes agree.
    if len(data) == len(docs) and len(cached_embeddings) == len(docs):
        embeddings = cached_embeddings
    else:
        print(
            f"Ignoring stale embedding cache {model_path}: "
            f"{len(cached_embeddings)} cached embeddings for {len(docs)} texts"
        )

if embeddings is None:
    embedder = SentenceTransformer(model)
    # Each entry of `docs` is encoded on its own, exactly as before.
    embeddings = [embedder.encode(doc) for doc in docs]
    # Store sentences & embeddings on disk; create the cache directory first,
    # otherwise open(..., "wb") fails when it does not exist yet.
    # Assumes model_path is a pathlib.Path (it is built with .joinpath()).
    model_path.parent.mkdir(parents=True, exist_ok=True)
    with open(model_path, "wb") as fOut:
        pickle.dump({'data': docs, 'data_embeddings': embeddings}, fOut, protocol=pickle.HIGHEST_PROTOCOL)

# Same column layout regardless of whether the cache was hit.
df = pd.DataFrame(list(zip(docs, embeddings, targets)), columns=['data', 'data_embeddings', 'label'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The seed is fixed so that the split,
# and therefore every classification report below, is reproducible across
# kernel restarts.
# NOTE(review): the classes appear to be imbalanced (resampling is applied later);
# consider stratify=df['label'] if every class has at least two samples.
X_train, X_test, y_train, y_test = train_test_split(
    df['data_embeddings'], df['label'], test_size=0.2, random_state=0
)

In [None]:
# Run both splits through classification.average_text_vector, train split first.
X_train_vect_avg, X_test_vect_avg = (
    classification.average_text_vector(split) for split in (X_train, X_test)
)

In [None]:
# Switch for the classification cell below: True trains on over- and undersampled
# versions of the training split, False trains on the training split as it is.
resampling = True

In [None]:
# Train and evaluate the classifiers on the averaged training vectors and print
# a classification report against the held-out test split. Depending on
# `resampling`, the training data is used as is, or once oversampled and once
# undersampled.

# Classifier names understood by classification.classify_predict.
CLASSIFIER_NAMES = (
    # "Random Forest",  # currently disabled
    "Multinomial Naive Bayes",
    "SVM",
)
# One seed for both samplers so that the resampled sets are reproducible.
SAMPLER_SEED = 0

if not resampling:
    print("CLASSIFICATION WITHOUT SAMPLING:")
    for classifier_name in CLASSIFIER_NAMES:
        y_pred = classification.classify_predict(X_train_vect_avg, X_test_vect_avg, y_train, classifier_name)
        classification.print_classification_report(y_test, y_pred)

    print(y_train.value_counts())
    y_train.value_counts().plot.pie(autopct='%0.2f')

else:
    ros = RandomOverSampler(random_state=SAMPLER_SEED, sampling_strategy="not majority")
    rus = RandomUnderSampler(random_state=SAMPLER_SEED, sampling_strategy="majority")

    # The samplers expect a 2-D input, so each embedding is put into its own row.
    # This does not depend on the sampler, so it is done once.
    X_train_2d = X_train.values.reshape(-1, 1)

    for sampler, method in [(ros, "OVERSAMPLING"), (rus, "UNDERSAMPLING")]:
        print(f"CLASSIFICATION WITH:{method}")

        X_res, y_res = sampler.fit_resample(X_train_2d, y_train)

        label_counts = y_res.value_counts()
        print(label_counts)
        # Series.plot reuses the current axes, which would draw the second pie
        # on top of the first; DataFrame.plot opens a new figure for every call.
        label_counts.to_frame(name=method).plot.pie(y=method, autopct='%0.2f', legend=False)

        # Undo the reshape: back to a 1-D array of embeddings.
        X_res_1d = X_res.flatten()

        # Classification with the resampled vectors.
        X_res_vect_avg = classification.average_text_vector(X_res_1d)

        for classifier_name in CLASSIFIER_NAMES:
            y_res_pred = classification.classify_predict(X_res_vect_avg, X_test_vect_avg, y_res, classifier_name)
            classification.print_classification_report(y_test, y_res_pred)