# Imports, setup and data loading

In [None]:
! pip install -q transformers
! pip install -q textacy
! pip install -q PyDrive
! pip install -q wordcloud

In [None]:
import pandas as pd
import numpy as np
import datetime
import pickle
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import gensim
import string
import re
import unicodedata
import textacy
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment.util import mark_negation
from nltk.stem import PorterStemmer
from transformers import pipeline, AutoTokenizer, TFAutoModelForSequenceClassification, TextClassificationPipeline
from tensorflow.keras.optimizers import Adam
import zipfile

from google.colab import auth
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from oauth2client.client import GoogleCredentials

# NLTK resources used by the helpers below.
nltk.download('stopwords')
nltk.download('vader_lexicon')
nltk.download('punkt')
# Recent NLTK releases make `word_tokenize` load `punkt_tab` instead of the
# pickled `punkt` models. On older releases this call just reports that the
# package is unknown and returns False, so it is safe to request it always.
nltk.download('punkt_tab')

rs = 27 # random_state - when/if needed
stemmer = PorterStemmer()  # shared stemmer instance used by my_preprocess_string

In [None]:
def my_df_to_gDrive(df:pd.DataFrame, file_name:str, folder_id:str, idx:bool=False):
    """Write ``df`` to a local CSV and upload it to a Google Drive folder.

    The frame is first saved locally as ``file_name``. If a non-trashed file
    with that title already exists in ``folder_id`` its content is replaced,
    otherwise a new Drive file is created in that folder.

    Args:
        df: frame to export.
        file_name: local file name, also used as the Drive title.
        folder_id: id of the target Drive folder.
        idx: whether the frame index is written to the CSV.

    Relies on the module-level ``drive`` client.
    """
    df.to_csv(file_name, index=idx)

    query = "'{}' in parents and trashed=false and title='{}'".format(folder_id, file_name)
    matches = drive.ListFile({'q': query}).GetList()
    if matches:
        # Reuse the first existing file so the upload overwrites it.
        remote = matches[0]
    else:
        remote = drive.CreateFile({
            'title': file_name,
            'parents': [{'kind': 'drive#fileLink', 'id': folder_id}],
            'overwrite': True
        })

    with open(file_name, 'r') as local_csv:
        remote.SetContentString(local_csv.read())
    remote.Upload()

def my_gDrive_to_colab(file_name:str, folder_id:str):
    """Download ``file_name`` from a Drive folder into the working directory.

    Args:
        file_name: title of the Drive file; also the local file name written.
        folder_id: id of the Drive folder to search.

    Returns:
        True when a non-trashed file with that title was found and downloaded
        (the first match is used), False otherwise.

    Relies on the module-level ``drive`` client.
    """
    query = "'{}' in parents and trashed=false and title='{}'".format(folder_id, file_name)
    matches = drive.ListFile({'q': query}).GetList()
    if len(matches) == 0:
        print(f'File {file_name} does not exists in this Drive')
        return False

    matches[0].GetContentFile(file_name)
    return True

def my_register_scores(df_from:pd.DataFrame, df_to:pd.DataFrame, method_name:str):
    """Store classification metrics for one method as a row of ``df_to``.

    Args:
        df_from: frame with the columns ``y_true`` and ``y_pred``.
        df_to: results frame, modified in place; the row labelled
            ``method_name`` is created or overwritten.
        method_name: row label for the scores.
    """
    truth, predicted = df_from['y_true'], df_from['y_pred']

    # Column name -> sklearn metric, in the order the columns are written.
    metrics = (
        ('accuracy_score', accuracy_score),
        ('precision_score', precision_score),
        ('recall_score', recall_score),
        ('f1_score', f1_score),
    )
    for column, metric in metrics:
        df_to.loc[method_name, column] = metric(truth, predicted)

def my_preprocess_string(
    x:str,
    all_lower:bool=True,
    remove_numbers:bool=True,
    replace_accents:bool=True,
    handle_negation:bool=True,
    remove_ponctuation:bool=True,
    remove_stop_words:bool=True,
    apply_stemming:bool=True,
    remove_white_spaces:bool=True
):
    """Normalise one text; every step can be switched off by its flag.

    Steps run in this fixed order:
      1. ``all_lower``: lower-case the text.
      2. ``remove_numbers``: delete runs of digits.
      3. ``replace_accents``: strip combining accents (``é`` -> ``e``).
      4. ``remove_white_spaces``: collapse repeated spaces and trim.
      5. ``handle_negation``: tokenise and tag tokens with NLTK's
         ``mark_negation`` (tagged tokens get a ``_NEG`` suffix).
      6. ``remove_ponctuation``: replace every character that is neither a
         word character nor whitespace with a space (``_`` is a word
         character, so ``_NEG`` survives).
      7. ``remove_stop_words``: drop English stop words, ignoring ``_NEG``.
      8. ``apply_stemming``: Porter-stem each token, keeping ``_NEG``.
      9. ``remove_white_spaces``: collapse repeated spaces and trim again.

    Args:
        x: text to clean.

    Returns:
        The cleaned string.
    """
    if all_lower:
        x = x.lower()
    if remove_numbers:
        x = re.sub(r'\d+', '', x)
    if replace_accents:
        # NFD splits an accented letter into base letter + combining mark
        # (category 'Mn'); dropping the marks keeps only the base letter.
        x = ''.join(c for c in unicodedata.normalize('NFD', x) if unicodedata.category(c) != 'Mn')
    if remove_white_spaces:
        x = re.sub(' +', ' ', x).strip()
    if handle_negation:
        # All punctuation except apostrophes becomes '.', presumably so that
        # mark_negation treats any punctuation as the end of a negated span.
        x = ' '.join(mark_negation(word_tokenize(re.sub(r"[^\w\s']", '.', x))))
    if remove_ponctuation:
        x = re.sub(r"[^\w\s]", ' ', x)
    if remove_stop_words:
        # Build the stop-word set once per call instead of once per token.
        stop_words = set(stopwords.words('english'))
        x = ' '.join(xi for xi in x.split() if _my_split_negation(xi)[0] not in stop_words)
    if apply_stemming:
        x = ' '.join(
            stemmer.stem(base) + suffix
            for base, suffix in map(_my_split_negation, x.split())
        )
    if remove_white_spaces:
        x = re.sub(' +', ' ', x).strip()

    return x

def _my_split_negation(token:str):
    """Split ``token`` into ``(base, suffix)``; suffix is ``'_NEG'`` or ``''``."""
    if token.endswith('_NEG'):
        return token[:-4], '_NEG'
    return token, ''

def my_preprocessor(
    df:pd.DataFrame,
    all_lower:bool=True,
    remove_numbers:bool=True,
    replace_accents:bool=True,
    handle_negation:bool=True,
    remove_ponctuation:bool=True,
    remove_stop_words:bool=True,
    apply_stemming:bool=True,
    remove_white_spaces:bool=True
):
    """Return a copy of ``df`` whose ``text`` column has been preprocessed.

    Every flag is forwarded unchanged to ``my_preprocess_string``, which is
    applied to each value of ``df['text']``. The input frame is not modified.
    """
    options = dict(
        all_lower=all_lower,
        remove_numbers=remove_numbers,
        replace_accents=replace_accents,
        handle_negation=handle_negation,
        remove_ponctuation=remove_ponctuation,
        remove_stop_words=remove_stop_words,
        apply_stemming=apply_stemming,
        remove_white_spaces=remove_white_spaces,
    )
    cleaned = df['text'].map(lambda raw: my_preprocess_string(raw, **options))
    return df.assign(text=cleaned)

def my_explode_text(df:pd.DataFrame):
    """Return one row per whitespace-separated word of ``df['text']``.

    The result keeps the original columns and adds ``word`` (a single token)
    and ``original_index`` (the index label of the source row). The input
    frame is left untouched.
    """
    return (
        df
        .assign(word=df['text'].str.split(), original_index=df.index)
        .explode('word')
    )

def my_invert_sl(df):
    """Extend a sentiment lexicon with negated (``_NEG``) entries.

    For each row a second row is appended whose ``English`` term carries the
    ``_NEG`` suffix and whose ``Positive`` / ``Negative`` flags are each
    replaced by ``1 - value``. The original rows come first; index labels are
    repeated in the appended half.
    """
    negated = df.assign(
        English=df['English'] + '_NEG',
        Positive=1 - df['Positive'],
        Negative=1 - df['Negative'],
    )
    return pd.concat([df, negated])

def my_bag_of_words(df_train, df_test, model):
    """Fit a text vectorizer on the train texts and transform both splits.

    Args:
        df_train: frame whose ``text`` column is used to fit the vectorizer.
        df_test: frame whose ``text`` column is only transformed.
        model: Python expression, as a string, that builds the vectorizer
            (e.g. ``"TfidfVectorizer()"``). It is evaluated with ``eval``,
            so it must only ever come from trusted, in-notebook literals.

    Returns:
        ``(train_matrix, test_matrix, fitted_vectorizer)``.
    """
    bow = eval(model)
    train_matrix = bow.fit_transform(df_train['text'])
    test_matrix = bow.transform(df_test['text'])
    return train_matrix, test_matrix, bow

def my_str2vec(df_train, df_test, vs, w, mc, workers=None):
    """Embed each text as the mean of its Word2Vec word vectors.

    A Word2Vec model is trained on the whitespace-tokenised train texts and
    then used to embed both splits. Words the model has no vector for (unseen
    in training, or dropped by ``min_count``) contribute a zero vector to the
    mean. A text with no tokens is embedded as the zero vector.

    Args:
        df_train: frame whose ``text`` column trains the model.
        df_test: frame whose ``text`` column is only embedded.
        vs: embedding dimensionality.
        w: context window size.
        mc: minimum word count for a word to be kept in the vocabulary.
        workers: number of training threads. Defaults to the CPU count.
            The previous hard-coded ``-1`` is not "all cores" in gensim: no
            training thread is started and the vectors keep their random
            initialisation.

    Returns:
        ``(s2v_train, s2v_test, model)``: two 2-D arrays with one row per
        text and the trained Word2Vec model.
    """
    import os

    if workers is None:
        workers = os.cpu_count() or 1

    tokenized_sentences_train = [sentence.split() for sentence in df_train['text']]
    tokenized_sentences_test = [sentence.split() for sentence in df_test['text']]

    try:
        # gensim >= 4 calls the dimensionality `vector_size` ...
        model = gensim.models.Word2Vec(
            tokenized_sentences_train, vector_size=vs, window=w, min_count=mc, workers=workers
        )
    except TypeError:
        # ... while gensim < 4 calls it `size`.
        model = gensim.models.Word2Vec(
            tokenized_sentences_train, size=vs, window=w, min_count=mc, workers=workers
        )

    word_vectors = model.wv
    dim = word_vectors.vector_size

    def sentence_vector(tokens):
        # Unknown words count as zero vectors, as the test branch always did.
        vectors = [
            word_vectors[word] if word in word_vectors else np.zeros(dim)
            for word in tokens
        ]
        if not vectors:
            # np.mean of an empty list would be NaN and break the 2-D array.
            return np.zeros(dim)
        return np.mean(vectors, axis=0)

    s2v_train = np.array([sentence_vector(tokens) for tokens in tokenized_sentences_train])
    s2v_test = np.array([sentence_vector(tokens) for tokens in tokenized_sentences_test])

    return (
        s2v_train,
        s2v_test,
        model
    )

def my_baseline(df:pd.DataFrame):
    """Score texts with TextBlob polarity and derive binary labels.

    Returns a copy of ``df`` with three added columns: ``y`` (TextBlob
    polarity of ``text``), ``y_true`` (1 when ``label`` is ``'pos'``, else 0)
    and ``y_pred`` (1 when the polarity is >= 0, else 0).
    """
    polarity = df['text'].map(lambda text: TextBlob(text).sentiment.polarity)
    return df.assign(
        y=polarity,
        y_true=np.where(df['label'] == 'pos', 1, 0),
        y_pred=np.where(polarity >= 0, 1, 0),
    )

def my_sentiment_lexicon(df:pd.DataFrame, df_sl:pd.DataFrame):
    """Classify texts by counting lexicon hits.

    Each word of ``df['text']`` is matched against ``df_sl['English']``. The
    ``Positive`` and ``Negative`` flags of the matches are summed per text,
    and the text is labelled ``'pos'`` when the positive sum is at least the
    negative sum, otherwise ``'neg'``.

    Returns a copy of ``df`` with the columns ``pred_label``, ``y_pred`` and
    ``y_true`` added.

    NOTE: a text with no lexicon match at all is absent from the aggregated
    table, so its ``pred_label`` is NaN and its ``y_pred`` is 0.
    """
    per_text = (
        my_explode_text(df)
        .merge(df_sl, left_on='word', right_on='English')
        .groupby('original_index')
        .agg({'Positive': 'sum', 'Negative': 'sum'})
    )
    per_text['pred_label'] = np.where(per_text['Positive'] >= per_text['Negative'], 'pos', 'neg')

    scored = df.copy()
    # Assignment aligns on the index: per_text is keyed by the original labels.
    scored['pred_label'] = per_text['pred_label']
    scored['y_pred'] = np.where(scored['pred_label'] == 'pos', 1, 0)
    scored['y_true'] = np.where(scored['label'] == 'pos', 1, 0)

    return scored

def my_ML(X_key, embeddings_key, models_key, X, embeddings, models):
    """Train and score one (preprocessing, embedding, classifier) combination.

    Args:
        X_key: key into ``X`` selecting a ``(train_df, test_df)`` pair.
        embeddings_key: key into ``embeddings``. Keys starting with ``'BOW'``
            use ``my_bag_of_words``; any other key uses ``my_str2vec``.
        models_key: key into ``models``.
        X, embeddings, models: the lookup dicts. ``models`` values are
            Python expressions (strings) evaluated with ``eval``.

    Returns:
        ``(fitted_classifier, vectorizer_or_word2vec_model)``.

    Reads the globals ``y_train`` / ``y_test`` and writes the train and test
    scores into the global ``df_results``.
    """
    splits = X[X_key]
    if embeddings_key.startswith('BOW'):
        X_train, X_test, vectorizer = my_bag_of_words(*splits, embeddings[embeddings_key])
    else:
        X_train, X_test, vectorizer = my_str2vec(*splits, *embeddings[embeddings_key])

    model = eval(models[models_key]).fit(X_train, y_train)

    run_name = f'ML - {X_key} - {embeddings_key} - {models_key}'
    for split, features, target in (('train', X_train, y_train), ('test', X_test, y_test)):
        my_register_scores(
            pd.DataFrame({'y_true': target, 'y_pred': model.predict(features)}),
            df_results,
            f'{run_name} - {split}'
        )
    return model, vectorizer

def my_predict(string_to_sa, prep_version, vec, clf):
    """Preprocess, vectorize and score a single text.

    Args:
        string_to_sa: raw text.
        prep_version: keyword options passed to ``my_preprocess_string``.
        vec: fitted vectorizer exposing ``transform``.
        clf: fitted classifier exposing ``predict_proba``.

    Returns:
        ``(probability, cleaned_text, features)`` where ``probability`` is
        the ``predict_proba`` value at column index 1 for the text.
    """
    cleaned = my_preprocess_string(string_to_sa, **prep_version)
    features = vec.transform([cleaned])
    probability = clf.predict_proba(features)[0][1]
    return probability, cleaned, features

def my_sa(string_to_sa, prep_version, vec, clf):
    """Score a text and list each feature's contribution to the score.

    The contribution of a feature is its value in the vectorized text times
    the matching entry of ``clf.coef_[0]``.

    Returns:
        ``(probability, cleaned_text, contributions)`` where
        ``contributions`` has the columns ``val`` and ``name`` and only
        contains features with a non-zero contribution.
    """
    probability, cleaned, features = my_predict(string_to_sa, prep_version, vec, clf)

    feature_values = pd.Series(np.array(features.todense())[0])
    weights = pd.Series(clf.coef_[0])
    contributions = pd.DataFrame({
        'val': feature_values * weights,
        'name': vec.get_feature_names_out()
    })

    is_active = contributions['val'] != 0
    return probability, cleaned, contributions[is_active]

def my_wc(v):
    """Draw a word cloud of feature contributions.

    Args:
        v: frame with the columns ``name`` (word) and ``val`` (signed
            contribution), e.g. the third value returned by ``my_sa``.

    Word size follows ``abs(val)``; words with ``val > 0`` are drawn green
    and all others red. The figure is shown with ``plt.show()``.
    """
    # Imported here so the function does not depend on a later cell having
    # imported pyplot already.
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud

    # Built once; the colour callback is invoked for every word drawn.
    colors = dict(zip(v['name'], np.where(v['val'] > 0, '#00FF00', '#FF0000')))
    sizes = dict(zip(v['name'], v['val'].abs()))

    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        # The word being coloured arrives as the first positional argument.
        color_func=lambda *args, **kwargs: colors.get(args[0])
    ).generate_from_frequencies(sizes)

    plt.figure(figsize = (8, 8), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad = 0)

    plt.show()

In [None]:
# Authenticate the Colab user and build the PyDrive client (`drive`) that the
# my_df_to_gDrive / my_gDrive_to_colab helpers use as a global.
auth.authenticate_user()

gauth = GoogleAuth()

gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Drive folder holding the input data and the cached intermediate files.
folder_id = '1-UkmMimz0POOr4LXOv1XDVrc8Ktnxk2k'
download_list = ['imdb_reviews_test.csv', 'imdb_reviews_train.csv', 'NCR-lexicon.csv']

# Copy each input file into the working directory; a missing file only
# prints a message here (the return value is ignored).
for i in download_list:
    my_gDrive_to_colab(i, folder_id)

In [None]:
# Review data: other cells read the columns `text` and `label` ('pos' marks a positive review).
df_test = pd.read_csv('imdb_reviews_test.csv')
df_train = pd.read_csv('imdb_reviews_train.csv')

# Sentiment lexicon: keep only the words flagged as positive and/or negative.
df_sl_raw = pd.read_csv('NCR-lexicon.csv')
df_sl_raw = df_sl_raw[(df_sl_raw['Positive']==1) | (df_sl_raw['Negative']==1)].reset_index(drop=True)

# Lexicon extended with `_NEG` entries for words inside a negated span.
df_sl = my_invert_sl(df_sl_raw)

# One row per evaluated method, filled in by my_register_scores.
df_results = pd.DataFrame()

In [None]:
# Split sizes.
print(len(df_train), '\n')
print(len(df_test), '\n')
# Distribution of review length, in characters.
print(df_train['text'].map(len).describe(), '\n')
print(df_test['text'].map(len).describe(), '\n')
# Class balance: the mean is the share of positive reviews.
print((df_train['label'] == 'pos').astype(int).describe(), '\n')
print((df_test['label'] == 'pos').astype(int).describe(), '\n')

# Pre-processed Texts

In [None]:
# Preprocessing variants, passed as keyword arguments to my_preprocessor /
# my_preprocess_string. v1 enables every step; v2-v4 are v1 with the listed
# steps switched off.
prep_v1 = {
    'all_lower': True,
    'remove_numbers': True,
    'replace_accents': True,
    'handle_negation': True,
    'remove_ponctuation': True,
    'remove_stop_words': True,
    'apply_stemming': True,
    'remove_white_spaces': True
}

# v2: keep stop words, no stemming.
prep_v2 = {**prep_v1, 'remove_stop_words': False, 'apply_stemming': False}

# v3: no stemming.
prep_v3 = {**prep_v1, 'apply_stemming': False}

# v4: no negation marking, no stemming.
prep_v4 = {**prep_v1, 'handle_negation': False, 'apply_stemming': False}

In [None]:
# v1 texts: a copy cached on Drive wins; otherwise preprocess with prep_v1
# and upload the result so later runs can reuse it.
if not my_gDrive_to_colab('df_test_v1.csv', folder_id):
    df_test_v1 = my_preprocessor(df_test, **prep_v1)
    my_df_to_gDrive(df_test_v1, 'df_test_v1.csv', folder_id)
else:
    df_test_v1 = pd.read_csv('df_test_v1.csv')

if not my_gDrive_to_colab('df_train_v1.csv', folder_id):
    df_train_v1 = my_preprocessor(df_train, **prep_v1)
    my_df_to_gDrive(df_train_v1, 'df_train_v1.csv', folder_id)
else:
    df_train_v1 = pd.read_csv('df_train_v1.csv')

In [None]:
# v2 texts: a copy cached on Drive wins; otherwise preprocess with prep_v2
# and upload the result so later runs can reuse it.
if not my_gDrive_to_colab('df_test_v2.csv', folder_id):
    df_test_v2 = my_preprocessor(df_test, **prep_v2)
    my_df_to_gDrive(df_test_v2, 'df_test_v2.csv', folder_id)
else:
    df_test_v2 = pd.read_csv('df_test_v2.csv')

if not my_gDrive_to_colab('df_train_v2.csv', folder_id):
    df_train_v2 = my_preprocessor(df_train, **prep_v2)
    my_df_to_gDrive(df_train_v2, 'df_train_v2.csv', folder_id)
else:
    df_train_v2 = pd.read_csv('df_train_v2.csv')

In [None]:
# v3 texts: a copy cached on Drive wins; otherwise preprocess with prep_v3
# and upload the result so later runs can reuse it.
if not my_gDrive_to_colab('df_test_v3.csv', folder_id):
    df_test_v3 = my_preprocessor(df_test, **prep_v3)
    my_df_to_gDrive(df_test_v3, 'df_test_v3.csv', folder_id)
else:
    df_test_v3 = pd.read_csv('df_test_v3.csv')

if not my_gDrive_to_colab('df_train_v3.csv', folder_id):
    df_train_v3 = my_preprocessor(df_train, **prep_v3)
    my_df_to_gDrive(df_train_v3, 'df_train_v3.csv', folder_id)
else:
    df_train_v3 = pd.read_csv('df_train_v3.csv')

In [None]:
# v4 texts: reuse the copy cached on Drive, otherwise preprocess with prep_v4
# and upload the result so later runs can reuse it.
if my_gDrive_to_colab('df_test_v4.csv', folder_id):
    df_test_v4 = pd.read_csv('df_test_v4.csv')
else:
    df_test_v4 = my_preprocessor(df_test, **prep_v4)
    my_df_to_gDrive(df_test_v4, 'df_test_v4.csv', folder_id)

if my_gDrive_to_colab('df_train_v4.csv', folder_id):
    df_train_v4 = pd.read_csv('df_train_v4.csv')
else:
    # Was `**prep_v3`, so the v4 train split kept negation marking while the
    # v4 test split did not. A df_train_v4.csv already cached on Drive with
    # the old settings must be deleted there for this branch to rebuild it.
    df_train_v4 = my_preprocessor(df_train, **prep_v4)
    my_df_to_gDrive(df_train_v4, 'df_train_v4.csv', folder_id)

# Baseline

In [None]:
# Baseline: TextBlob polarity on the raw test texts, scored into df_results.
df_baseline = my_baseline(df_test)
my_register_scores(df_baseline, df_results, 'TextBlob')

# Display the results table collected so far.
df_results

# Using Sentiment Lexicon

In [None]:
# Raw test texts against the raw lexicon (with its `_NEG` extension).
df_sentiment_lexicon_raw = my_sentiment_lexicon(df_test, df_sl)
my_register_scores(df_sentiment_lexicon_raw, df_results, 'Sentiment Lexicon - Raw')

# v1 texts against the raw lexicon.
df_sentiment_lexicon_v1 = my_sentiment_lexicon(df_test_v1, df_sl)
my_register_scores(df_sentiment_lexicon_v1, df_results, 'Sentiment Lexicon - v1_raw')

# Lexicon run through the v1 preprocessing so its terms match the v1 texts.
# Terms that collapse to the same string keep a flag if any of them had it.
df_sl_v1 = my_invert_sl(
    my_preprocessor(df_sl_raw.rename({'English': 'text'}, axis='columns'), **prep_v1). \
    rename({'text': 'English'}, axis='columns'). \
    groupby('English', as_index=False). \
    agg({'Positive': 'max', 'Negative': 'max'})
)

df_sentiment_lexicon_v1_1 = my_sentiment_lexicon(df_test_v1, df_sl_v1)
my_register_scores(df_sentiment_lexicon_v1_1, df_results, 'Sentiment Lexicon - v1_1')

# v2 texts against the raw lexicon. This frame used to be stored as
# `df_sentiment_lexicon_v2_2` and was overwritten further down; it now has its
# own name, mirroring `df_sentiment_lexicon_v1`.
df_sentiment_lexicon_v2 = my_sentiment_lexicon(df_test_v2, df_sl)
my_register_scores(df_sentiment_lexicon_v2, df_results, 'Sentiment Lexicon - v2_raw')

# Lexicon run through the v2 preprocessing so its terms match the v2 texts.
df_sl_v2 = my_invert_sl(
    my_preprocessor(
        df_sl_raw.rename({'English': 'text'}, axis='columns'),
        **prep_v2
    ). \
    rename({'text': 'English'}, axis='columns'). \
    groupby('English', as_index=False). \
    agg({'Positive': 'max', 'Negative': 'max'})
)

df_sentiment_lexicon_v2_2 = my_sentiment_lexicon(df_test_v2, df_sl_v2)
my_register_scores(df_sentiment_lexicon_v2_2, df_results, 'Sentiment Lexicon - v2_2')

df_results

# Using Machine Learning

In [None]:
# Binary targets (1 = 'pos', 0 = anything else); my_ML reads these as globals.
y_train = np.where(df_train['label']=='pos', 1, 0)
y_test = np.where(df_test['label']=='pos', 1, 0)

In [None]:
# Grid search over every preprocessing variant x embedding x classifier.
# Each combination is trained and scored by my_ML (scores go to df_results).

# Preprocessing variant -> (train frame, test frame).
X = {
    'raw': (df_train, df_test),
    'v1': (df_train_v1, df_test_v1),
    'v2': (df_train_v2, df_test_v2),
    'v3': (df_train_v3, df_test_v3),
    'v4': (df_train_v4, df_test_v4),
}

# 'BOW*' entries are vectorizer expressions evaluated by my_bag_of_words;
# the other entries are (vs, w, mc) tuples unpacked into my_str2vec.
embeddings = {
    'BOW_default': "TfidfVectorizer()",
    'BOW_sa_sw': "TfidfVectorizer(strip_accents='ascii', stop_words='english')",
    's2v_100_5_1': (100, 5, 1),
    's2v_200_5_1': (200, 5, 1),
}

# Classifier expressions, evaluated with eval() inside my_ML.
models = {
    'bnb': "BernoulliNB()",
    'lr': "LogisticRegression(max_iter=1000, random_state=rs)",
    'svm': "svm.LinearSVC(random_state=rs)",
    'dtc_10': "DecisionTreeClassifier(max_depth=10, random_state=rs)",
    'rfc_10': "RandomForestClassifier(max_depth=10, random_state=rs)",
    'dtc_20': "DecisionTreeClassifier(max_depth=20, random_state=rs)",
    'rfc_20': "RandomForestClassifier(max_depth=20, random_state=rs)",
}

# NOTE(review): this relative path presumably needs Google Drive mounted at
# ./drive and the folder to exist; no mount call is visible in this notebook.
save_to = 'drive/MyDrive/ML_models/'

for i in X:
    for j in embeddings:
        for k in models:
            m_v = my_ML(i, j, k, X, embeddings, models)
            # Persist the (classifier, vectorizer) pair for later inspection.
            with open(f'{save_to}{i}-{j}-{k}.pickle', 'wb') as f:
                pickle.dump(m_v, f)

# Thirty best test-set rows by accuracy.
df_results.loc[df_results.index.str.endswith('- test')].sort_values('accuracy_score', ascending=False).iloc[:30]

In [None]:
# Second, narrower grid: regularisation variants of logistic regression and
# LinearSVC on the v2 and v3 texts with the default TF-IDF vectorizer.
X_ft = {
    'v2': (df_train_v2, df_test_v2),
    'v3': (df_train_v3, df_test_v3)
}

embeddings_ft = {
    'BOW_default': "TfidfVectorizer()"
}

# Classifier expressions, evaluated with eval() inside my_ML.
models_ft = {
    'lr-elasticnet0.5': "LogisticRegression(penalty='elasticnet', l1_ratio=0.5, solver='saga', max_iter=1000, random_state=rs)",
    'lr-elasticnet0.25': "LogisticRegression(penalty='elasticnet', l1_ratio=0.25, solver='saga', max_iter=1000, random_state=rs)",
    'lr-elasticnet0.75': "LogisticRegression(penalty='elasticnet', l1_ratio=0.75, solver='saga', max_iter=1000, random_state=rs)",
    'svm-10': "svm.LinearSVC(C=10, random_state=rs)",
    'svm-2': "svm.LinearSVC(C=2, random_state=rs)",
    'svm-0.5': "svm.LinearSVC(C=0.5, random_state=rs)",
    'svm-0.05': "svm.LinearSVC(C=0.05, random_state=rs)",
}

# NOTE(review): this relative path presumably needs Google Drive mounted at
# ./drive and the folder to exist; no mount call is visible in this notebook.
save_to = 'drive/MyDrive/ML_models/'

for i in X_ft:
    for j in embeddings_ft:
        for k in models_ft:
            m_v = my_ML(i, j, k, X_ft, embeddings_ft, models_ft)
            # Persist the (classifier, vectorizer) pair for later inspection.
            with open(f'{save_to}{i}-{j}-{k}.pickle', 'wb') as f:
                pickle.dump(m_v, f)

# Upload every 'ML - ' row of df_results to Drive, keeping the row labels.
my_df_to_gDrive(df_results.loc[df_results.index.str.startswith('ML - ')], "df_final_ML_results.csv", folder_id, True)

In [None]:
# Reload the saved ML scores from Drive.
my_gDrive_to_colab('df_final_ML_results.csv', folder_id)

# The file is written with its index (the run names), so read the first
# column back as the index, as the Results section does.
df_ML_results = pd.read_csv('df_final_ML_results.csv', index_col=0)

# Using Transformers

In [None]:
def my_load_model_from_name(model_name):
    """Load a tokenizer and a compiled TF sequence-classification model.

    The model is first loaded directly. If that raises, the error is printed
    and the load is retried with ``from_pt=True``. The model is compiled with
    an Adam optimizer at learning rate 3e-5.

    Returns:
        ``(tokenizer, model)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    try:
        model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
    except Exception as load_error:
        print(load_error)
        model = TFAutoModelForSequenceClassification.from_pretrained(model_name, from_pt=True)
    model.compile(optimizer=Adam(3e-5))
    return (tokenizer, model)

def my_fine_tune(texts, labels, md, tk):
    """Run one ``fit`` call of ``md`` on a batch of texts.

    Args:
        texts: texts handed to the tokenizer ``tk``.
        labels: targets passed to ``md.fit``.
        md: model exposing ``fit(inputs, labels)``.
        tk: tokenizer, called with ``return_tensors="np"`` and padding on.

    Returns:
        The same ``md`` object after fitting.
    """
    encoded = tk(texts, return_tensors="np", padding=True)
    md.fit(dict(encoded), labels)
    return md

def my_fine_tune_pipeline(df, model_name, batch_size=16, epochs=2):
    """Fine-tune a pretrained model on ``df`` in consecutive row chunks.

    ``df`` is cut into chunks of ``batch_size`` rows. For each epoch, every
    chunk is passed to ``my_fine_tune`` in order, with ``text`` as input and
    ``label == 'pos'`` as the positive target.

    Returns:
        ``(tokenizer, model)`` after training.
    """
    chunks = [
        df.iloc[start:start+batch_size].copy()
        for start in range(0, len(df), batch_size)
    ]
    tk, md = my_load_model_from_name(model_name)
    for _ in range(epochs):
        for chunk in chunks:
            targets = np.where(chunk['label']=='pos', 1, 0)
            md = my_fine_tune(list(chunk['text']), targets, md, tk)
    return (tk, md)

# Applying the transformers to the test set (takes about 3 hours per transformer)

"""

df_transformers = df_test.copy() # .sample(frac=0.5, random_state=rs).copy()

models = [
    "distilbert-base-uncased-finetuned-sst-2-english",
    "lvwerra/distilbert-imdb",
]

df_transformers['y_true'] = np.where(df_transformers['label'] == 'pos', 1, 0)
transformers_y_true = df_transformers['y_true']
transformers_cm = []

df_transformers['y_true'] = np.where(df_transformers['label'] == 'pos', 1, 0)
transformers_y_true = df_transformers['y_true']

classifier = pipeline("sentiment-analysis", model=models[0]) #, top_k=2)
df_transformers[f'pred_label_{models[0][:3]}'] = df_transformers['text'].map(lambda x: classifier(x))


my_df_to_gDrive(
    df_transformers,
    f"df_transformers_{str(datetime.datetime.now())[:19].replace(' ', '_').replace(':', '')}.csv",
    folder_id,
    True
)

df_transformers['y_true'] = np.where(df_transformers['label'] == 'pos', 1, 0)
transformers_y_true = df_transformers['y_true']

classifier = pipeline("sentiment-analysis", model=models[1]) #, top_k=2)
df_transformers[f'pred_label_{models[1][:3]}'] = df_transformers['text'].map(lambda x: classifier(x))

my_df_to_gDrive(
    df_transformers,
    f"df_transformers_{str(datetime.datetime.now())[:19].replace(' ', '_').replace(':', '')}.csv",
    folder_id,
    True
)
"""

# Fine-tuning the transformers

"""
model_name = "lvwerra/distilbert-imdb"

df_shuffled = df_train.copy().sample(frac=1, random_state=rs)
batch_size = 5000

for dff_shuffled in [df_shuffled.iloc[i:i+batch_size].copy() for i in range(0, len(df_shuffled), batch_size)]:
    ft_name = model_name

    tk, md = my_fine_tune_pipeline(dff_shuffled, ft_name, 20, 1)

    curr_timestamp = str(datetime.datetime.now())[:19].replace(' ', '_').replace(':', '')
    ft_name = f"drive/MyDrive/TMCD/my_fine_tune_5000_{model_name.replace('/', '-')}_{curr_timestamp}"

    md.save_pretrained(ft_name)
    tk.save_pretrained(ft_name)

    dff = df_test.copy().sample(n=20)
    classifier = pipeline("sentiment-analysis", model=ft_name) #, top_k=2)
    dff[f'mft_{curr_timestamp}'] = dff['text'].map(lambda x: classifier(x))

    display(dff)
"""

"""
model_name = 'drive/MyDrive/TMCD/my_fine_tune_5000_lvwerra-distilbert-imdb_2023-03-24_174929'
model_name_base = 'lvwerra-distilbert-imdb'

df_shuffled = df_train.copy().sample(frac=1, random_state=rs)

ft_name = model_name

for i in range(6):

    tk, md = my_fine_tune_pipeline(df_shuffled, ft_name, 20, 1)

    curr_timestamp = str(datetime.datetime.now())[:19].replace(' ', '_').replace(':', '')
    ft_name = f"drive/MyDrive/TMCD/my_fine_tune_5000_{i}_{model_name_base.replace('/', '-')}_{curr_timestamp}"

    md.save_pretrained(ft_name)
    tk.save_pretrained(ft_name)

    dff = df_test.copy().sample(n=20)
    classifier = pipeline("sentiment-analysis", model=ft_name) #, top_k=2)
    dff[f'mft_{curr_timestamp}'] = dff['text'].map(lambda x: classifier(x))

    display(dff)
"""

# Applying the fine-tuned transformers to the test set (takes about 3 hours per transformer)

"""
my_fine_tuned_models = [
    'my_fine_tune_3_lvwerra-distilbert-imdb',
    'my_fine_tune_1_epoch_lvwerra-distilbert-imdb',
    'my_fine_tune_5_lvwerra-distilbert-imdb',
    'my_fine_tune_2_epoch_lvwerra-distilbert-imdb',
    'my_fine_tune_4_lvwerra-distilbert-imdb'
]

name_order = [3,1,5,2,4]

for i in my_fine_tuned_models:
    with zipfile.ZipFile(i+'.zip', 'r') as zip_ref:
        zip_ref.extractall()

df_transformers = df_test.copy() # .sample(frac=0.5, random_state=rs).copy()


for j, mftm in enumerate(my_fine_tuned_models):
    classifier = pipeline("sentiment-analysis", model=mftm) #, top_k=2)
    df_transformers[f'pred_label_mft_{name_order[j]}'] = df_transformers['text'].map(lambda x: classifier(x))

    tstamp = str(datetime.datetime.now())[:19].replace(' ', '_').replace(':', '')
    df_transformers.to_csv(f"df_mft_{tstamp}.csv", index=True)
"""

In [None]:
# Load the stored predictions of the two pretrained transformers.
my_gDrive_to_colab('df_transformers_2023-03-21_190316.csv', folder_id)

df_transformers = pd.read_csv('df_transformers_2023-03-21_190316.csv', index_col=0)

# Each pred_label_* cell holds a stringified Python value; its first element's
# 'label' is compared with 'POSITIVE' to get a 0/1 prediction.
# NOTE(review): eval() runs arbitrary code from the CSV; ast.literal_eval
# would parse the same literals without that risk.
df_transformers['y_dis'] = df_transformers['pred_label_dis'].map(lambda x: 1 if eval(x)[0]['label'] == 'POSITIVE' else 0)
df_transformers['y_lvw'] = df_transformers['pred_label_lvw'].map(lambda x: 1 if eval(x)[0]['label'] == 'POSITIVE' else 0)

my_register_scores(
    pd.DataFrame({'y_true': df_transformers['y_true'], 'y_pred': df_transformers['y_dis']}),
    df_results,
    "Transformers - distilbert-sst2"
)

my_register_scores(
    pd.DataFrame({'y_true': df_transformers['y_true'], 'y_pred': df_transformers['y_lvw']}),
    df_results,
    "Transformers - distilbert-imdb"
)

df_results

In [None]:
# Load the stored predictions of the fine-tuned models (columns
# pred_label_mft_1 ... pred_label_mft_5).
my_gDrive_to_colab('df_mft_2023-03-25_140738.csv', folder_id)

df_mft = pd.read_csv('df_mft_2023-03-25_140738.csv', index_col=0)

df_mft['y_true'] = df_mft['label'].map(lambda x: 1 if x == 'pos' else 0)

# Turn every stored prediction into a 0/1 column first ...
for n_epochs in range(1, 6):
    df_mft[f'y_mft_{n_epochs}'] = df_mft[f'pred_label_mft_{n_epochs}'].map(
        lambda x: 1 if eval(x)[0]['label'] == 'POSITIVE' else 0
    )

# ... then register the scores, one df_results row per model, in order 1-5.
for n_epochs in range(1, 6):
    my_register_scores(
        pd.DataFrame({'y_true': df_mft['y_true'], 'y_pred': df_mft[f'y_mft_{n_epochs}']}),
        df_results,
        f"Transformers - Fine Tuned {n_epochs} Epoch"
    )

df_results

# Results

In [None]:
import matplotlib.pyplot as plt

# Reload the ML scores; rows are labelled 'ML - <prep> - <embedding> - <model> - train|test'.
df_name = 'df_final_ML_results.csv'
my_gDrive_to_colab(df_name, folder_id)
df_ML_results = pd.read_csv(df_name, index_col=0)
ini_index = list(df_ML_results.index)
# Fold each train/test pair into one new row labelled without the suffix,
# with separate *_train / *_test columns.
for i in ini_index:
    if i.endswith('train'):
        df_ML_results.loc[i.replace(' - train', ''), 'accuracy_train'] = df_ML_results.loc[i, 'accuracy_score']
        df_ML_results.loc[i.replace(' - train', ''), 'f1_train'] = df_ML_results.loc[i, 'f1_score']
    elif i.endswith('test'):
        df_ML_results.loc[i.replace(' - test', ''), 'accuracy_test'] = df_ML_results.loc[i, 'accuracy_score']
        df_ML_results.loc[i.replace(' - test', ''), 'f1_test'] = df_ML_results.loc[i, 'f1_score']
# Keep only the folded rows and the four folded columns.
df_ML_results = df_ML_results.loc[~df_ML_results.index.isin(ini_index), ['accuracy_train', 'f1_train', 'accuracy_test', 'f1_test']].copy()
# Split the row label into its three parts.
df_ML_results['model'] = df_ML_results.index
df_ML_results[['preprocessing', 'embedding', 'model']] = df_ML_results['model'].str.replace('ML - ', '').str.split(' - ', expand=True)

# Best run (by test accuracy) per preprocessing variant, per embedding and per model.
df_ML_results_preprocessing = df_ML_results.sort_values('accuracy_test', ascending=False).groupby('preprocessing').first()[['accuracy_train', 'accuracy_test', 'embedding' ,'model']].sort_values('accuracy_test', ascending=False)
df_ML_results_embedding = df_ML_results.sort_values('accuracy_test', ascending=False).groupby('embedding').first()[['accuracy_train', 'accuracy_test', 'preprocessing', 'model']].sort_values('accuracy_test', ascending=False)
df_ML_results_model = df_ML_results.sort_values('accuracy_test', ascending=False).groupby('model').first()[['accuracy_train', 'accuracy_test', 'preprocessing', 'embedding']].sort_values('accuracy_test', ascending=False)

display(df_ML_results_preprocessing)
display(df_ML_results_embedding)
display(df_ML_results_model)

# Train vs. test accuracy of the best run of every model.
plt.figure(figsize=(21,5))
model_names = df_ML_results_model.index + '\n(' + df_ML_results_model['preprocessing'] + ')\n(' + df_ML_results_model['embedding'] + ')'

# Red segment joining the train and test accuracy of each model.
for i, j in enumerate(zip(df_ML_results_model['accuracy_train'], df_ML_results_model['accuracy_test'])):
    plt.plot([i, i], [j[0], j[1]], '-', color='red', alpha=0.5, linewidth=2)

plt.plot(model_names, df_ML_results_model['accuracy_train'], 'o', color='blue', label='Train Accuracy')
plt.plot(model_names, df_ML_results_model['accuracy_test'], 'o', color='green', label='Test Accuracy')

plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.title('Train and Test Accuracy for Different Models (w/ Best Preprocessing and Embedding)')
plt.legend()

plt.show()

# good_models = [
#     'drive/MyDrive/ML_models/v3-BOW_default-lr-elasticnet0.75.pickle',
#     'drive/MyDrive/ML_models/v3-BOW_default-lr-elasticnet0.25.pickle',
#     'drive/MyDrive/ML_models/v3-BOW_default-lr.pickle'
# ]
# 
# with open(good_models[1], 'rb') as f:
#     clf, vec = pickle.load(f)
# 
# string = df_train.iloc[0,0] 
# p, s, v = my_sa(string, prep_v3, vec, clf)
# my_wc(v)

In [None]:
# Same plot restricted to the logistic-regression and LinearSVC variants, in a
# fixed display order. `.loc` with a list raises KeyError if a name is missing.
df_ML_results_model_lr_svm = df_ML_results_model.loc[['lr', 'lr-elasticnet0.25', 'lr-elasticnet0.5', 'lr-elasticnet0.75', 'svm-0.05', 'svm-0.5', 'svm', 'svm-2', 'svm-10'], :]

plt.figure(figsize=(13,4))
model_names = df_ML_results_model_lr_svm.index + '\n(' + df_ML_results_model_lr_svm['preprocessing'] + ')\n(' + df_ML_results_model_lr_svm['embedding'] + ')'

# Red segment joining the train and test accuracy of each model.
for i, j in enumerate(zip(df_ML_results_model_lr_svm['accuracy_train'], df_ML_results_model_lr_svm['accuracy_test'])):
    plt.plot([i, i], [j[0], j[1]], '-', color='red', alpha=0.5, linewidth=2)

plt.plot(model_names, df_ML_results_model_lr_svm['accuracy_train'], 'o', color='blue', label='Train Accuracy')
plt.plot(model_names, df_ML_results_model_lr_svm['accuracy_test'], 'o', color='green', label='Test Accuracy')

plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.title('Train and Test Accuracy for Different Models (w/ Best Preprocessing and Embedding)')
plt.legend()

In [None]:
# Rows of the fine-tuned transformers, in the order they were registered.
df_results_ft = df_results[df_results.index.str.contains('Fine Tuned', regex=False)]

plt.figure(figsize=(8,4))

# The line needs a label: plt.legend() otherwise has nothing to show and warns.
plt.plot(["1", "2", "3", "4", "5"], df_results_ft['accuracy_score'], '-o', label='Accuracy')

plt.xlabel('# of Epochs')
plt.ylabel('Accuracy')
plt.title('Accuracy Over Epochs in Fine Tuning')
plt.legend()