In [15]:
!pip install -U sentence-transformers



## Importando bibliotecas e bases

In [16]:
# importando bibliotecas
import pandas as pd
import numpy as np
import math
# import torch
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification

In [17]:
from google.colab import drive

# Mount Google Drive and read the Reddit dataset from it.
drive.mount('/content/drive/')
df = pd.read_csv('/content/drive/MyDrive/USP/10º semestre/TCC/Reddit_Data.csv')
# Disabled alternative: a 10% random sample of the whole file instead of a per-category sample.
# df = df.sample(frac=0.1, random_state=42).reset_index(drop=True)  # fixed seed for reproducibility

# Draw the same number of rows from every category, each draw using the same fixed seed.
numero_de_instancias_por_categoria = 1000
categorias = df['category'].unique()
sampled_dfs = []
for categoria in categorias:
    linhas_da_categoria = df.loc[df['category'] == categoria]
    sampled_dfs.append(
        linhas_da_categoria.sample(n=numero_de_instancias_por_categoria, random_state=42)
    )

# Stack the per-category samples into one DataFrame with a fresh 0..n-1 index.
df = pd.concat(sampled_dfs).reset_index(drop=True)
df

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


Unnamed: 0,clean_comment,category
0,has ended but wait there see you next time do...,1
1,dammit good and got good speechwriters let see...,1
2,all aspiring music producer who mainly does h...,1
3,funnily enough sex education was banned congre...,1
4,the real reason rjd don want support him that ...,1
...,...,...
2995,call lalit modi,0
2996,why not delhi that way you save the money too,0
2997,when america going get one these anti corrupti...,0
2998,kejriwal leading with 3500 votes over sheila,0


In [18]:
import re

def preprocess_text(text):
    """Return ``text`` with URL-like tokens stripped out.

    Every match of ``http``/``www``/``https`` followed by at least one
    non-whitespace character is deleted, up to the next whitespace.
    Whitespace around the removed token is left untouched.

    Parameters
    ----------
    text : str
        The raw comment text.

    Returns
    -------
    str
        The text without the matched URL-like substrings.
    """
    url_pattern = r"http\S+|www\S+|https\S+"
    return re.sub(url_pattern, "", text)

# Build the cleaned-text column: string comments go through preprocess_text,
# while non-string entries (presumably missing values) become an empty string.
NovaColuna = [
    preprocess_text(content) if isinstance(content, str) else ''
    for content in df['clean_comment']
]

df['text_cleaned'] = NovaColuna


In [19]:
from sentence_transformers import SentenceTransformer
# Load the 'all-mpnet-base-v2' SentenceTransformer; `model` is used further down
# to encode the train and test sentences into embeddings.
model = SentenceTransformer('all-mpnet-base-v2')

In [20]:
# Translate the numeric sentiment codes into readable labels.
category_mapping = {1: 'positive', 0: 'neutral', -1: 'negative'}
# Only map while the column still contains numeric codes: re-running this cell on
# already-mapped labels would otherwise turn every label into NaN.
if df['category'].isin(list(category_mapping)).any():
    df['category'] = df['category'].map(category_mapping)
# Enable the next line to drop the neutral category.
# df = df[df['category']!='neutral'].reset_index(drop=True)
df

Unnamed: 0,clean_comment,category,text_cleaned
0,has ended but wait there see you next time do...,positive,has ended but wait there see you next time do...
1,dammit good and got good speechwriters let see...,positive,dammit good and got good speechwriters let see...
2,all aspiring music producer who mainly does h...,positive,all aspiring music producer who mainly does h...
3,funnily enough sex education was banned congre...,positive,funnily enough sex education was banned congre...
4,the real reason rjd don want support him that ...,positive,the real reason rjd don want support him that ...
...,...,...,...
2995,call lalit modi,neutral,call lalit modi
2996,why not delhi that way you save the money too,neutral,why not delhi that way you save the money too
2997,when america going get one these anti corrupti...,neutral,when america going get one these anti corrupti...
2998,kejriwal leading with 3500 votes over sheila,neutral,kejriwal leading with 3500 votes over sheila


## Definição do treino e do teste (fold 0 do KFold usado como teste)

In [21]:
# Give every row a sequential integer id (0..len(df)-1) so it can be traced after the split.
df['example_id'] = np.arange(len(df), dtype='int64')
df.head()

Unnamed: 0,clean_comment,category,text_cleaned,example_id
0,has ended but wait there see you next time do...,positive,has ended but wait there see you next time do...,0
1,dammit good and got good speechwriters let see...,positive,dammit good and got good speechwriters let see...,1
2,all aspiring music producer who mainly does h...,positive,all aspiring music producer who mainly does h...,2
3,funnily enough sex education was banned congre...,positive,funnily enough sex education was banned congre...,3
4,the real reason rjd don want support him that ...,positive,the real reason rjd don want support him that ...,4


In [22]:
from sklearn.model_selection import train_test_split, KFold
# Split the rows into 4 shuffled folds (fixed seed for reproducibility).
kf = KFold(n_splits=4, shuffle=True, random_state=42)

# kf.split yields *positional* row indices, so the fold ids are collected in a
# position-aligned array and attached as a column afterwards. Writing them through
# label-based df.loc is only correct when the index happens to be 0..n-1.
fold_ids = np.full(len(df), -1, dtype='int64')
for fold, (_train_indices, test_indices) in enumerate(kf.split(df)):
    fold_ids[test_indices] = fold
df['fold'] = fold_ids

# Fold 0 is held out as the test set; the remaining folds form the training set.
# reset_index returns new frames with a fresh 0..n-1 index (no in-place mutation of a slice).
train_df = df[df['fold'] != 0].reset_index(drop=True)
test_df = df[df['fold'] == 0].reset_index(drop=True)

In [23]:
# Display the held-out test split (the rows assigned to fold 0).
test_df

Unnamed: 0,clean_comment,category,text_cleaned,example_id,fold
0,has ended but wait there see you next time do...,positive,has ended but wait there see you next time do...,0,0
1,this point even with the risk sounding like s...,positive,this point even with the risk sounding like s...,7,0
2,this will also hit the sector which has been e...,positive,this will also hit the sector which has been e...,12,0
3,how many cbi raids can anticipate now the next...,positive,how many cbi raids can anticipate now the next...,14,0
4,dat intense celebration,positive,dat intense celebration,29,0
...,...,...,...,...,...
745,mujhe sax milega,neutral,mujhe sax milega,2990,0
746,they clapping,neutral,they clapping,2991,0
747,welovemodi,neutral,welovemodi,2994,0
748,why not delhi that way you save the money too,neutral,why not delhi that way you save the money too,2996,0


In [24]:
# !pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer

# Disabled alternative model:
# model = SentenceTransformer('bhadresh-savani/bert-base-go-emotion')
train_sentences = train_df['text_cleaned']
test_sentences = test_df['text_cleaned']

# Pass plain lists of strings to encode(): handing it a pandas Series only works
# while the Series' integer labels coincide with positions. Row i of each embedding
# matrix corresponds to row i of train_df / test_df.
train_embeddings = model.encode(train_sentences.tolist())
test_embeddings = model.encode(test_sentences.tolist())

## Geração dos prompts

In [25]:
from sklearn.neighbors import NearestNeighbors
# Number of nearest training examples retrieved for each test sentence.
k = 1
# Cosine-distance nearest-neighbour index over the training embeddings; fit() returns
# the estimator itself, which is shown as the cell output.
neighbors_model = NearestNeighbors(n_neighbors=k, metric='cosine').fit(train_embeddings)
neighbors_model

In [26]:
# Build one few-shot prompt per test sentence: the k most similar *training*
# examples (text + label) are shown as demonstrations, followed by the test
# sentence with an open "Output: @" slot.
prompts = []

# Query all test embeddings in a single call. Row i holds the positions, within
# train_embeddings (and therefore train_df), of the k nearest neighbours of test row i.
neighbor_indices = neighbors_model.kneighbors(
    test_embeddings, n_neighbors=k, return_distance=False
)

for i, train_positions in enumerate(neighbor_indices):
    # The neighbour index was fitted on train_embeddings, so the positions must be
    # resolved against train_df. Looking them up in the full `df` would select
    # unrelated rows and could leak test rows into the demonstrations.
    input_output_pairs = [
        (train_df['text_cleaned'].iloc[pos], train_df['category'].iloc[pos])
        for pos in train_positions
    ]

    # test_embeddings row i was computed from test_df row i.
    test_sentence = test_df['text_cleaned'].iloc[i]

    # One "Input/Output" demonstration per neighbour, separated by a blank line.
    input_output_section = "\n\n".join(
        "Input: {}\nOutput: @{}".format(similar_text, similar_sentiment)
        for similar_text, similar_sentiment in input_output_pairs
    )

    prompt = '''
Análise de sentimentos.

{}

Input: {}
Output: @
'''.format(input_output_section, test_sentence)

    prompts.append(prompt)


In [27]:
# Save the prompts to a file inside the Google Drive folder.
from google.colab import files
file_path = '/content/drive/MyDrive/USP/10º semestre/TCC/Prompts/prompts_reddit_k=1_nn.txt'
# Explicit UTF-8: the prompts contain non-ASCII text (e.g. "Análise"), so the file
# must not depend on the platform's default encoding.
with open(file_path, 'w', encoding='utf-8') as file:
    for prompt in prompts:
        file.write(prompt + '\n')

# Download the file to the local machine.
files.download(file_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [28]:
# Write the labels of the test split (the 'category' column of test_df) to a CSV
# file on Google Drive, without the index column.
file_csv_test = '/content/drive/MyDrive/USP/10º semestre/TCC/Resultados/Dell/opt-iml-max-1.3b/test_responses_reddit_nn.csv'
test_df['category'].to_csv(file_csv_test, index=False)