In [1]:
import pandas as pd
import os, ast, re, torch
import numpy as np
from datasets import Dataset
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

In [2]:
# Directory holding the processed CSV exports; only files whose name starts
# with "FULL" are loaded.
dataset_path = "../../data/processed/"

# os.listdir() returns entries in arbitrary, filesystem-dependent order, so the
# names are sorted: the row order of the concatenated frame (and every
# positional index computed from it) is then identical on every run.
full_files = sorted(name for name in os.listdir(dataset_path) if name.startswith("FULL"))
if not full_files:
    # Fail with an explicit message instead of pd.concat's generic
    # "No objects to concatenate".
    raise FileNotFoundError(f"no 'FULL*' files found in {dataset_path!r}")

# Undecodable bytes are replaced rather than aborting the read.
dfs = [
    pd.read_csv(os.path.join(dataset_path, name), encoding="utf-8", encoding_errors="replace")
    for name in full_files
]
df = pd.concat(dfs)
df.head()

                        Date                      AAL  \
0  2012-01-03 00:00:00+00:00  {'val_adj_close': 4.83}   
1  2012-01-04 00:00:00+00:00  {'val_adj_close': 4.74}   
2  2012-01-05 00:00:00+00:00  {'val_adj_close': 5.16}   
3  2012-01-06 00:00:00+00:00  {'val_adj_close': 5.28}   
4  2012-01-09 00:00:00+00:00  {'val_adj_close': 5.39}   

                      AAME                                               AAON  \
0  {'val_adj_close': 1.83}                             {'val_adj_close': 5.4}   
1  {'val_adj_close': 1.82}                            {'val_adj_close': 5.36}   
2  {'val_adj_close': 1.88}                            {'val_adj_close': 5.33}   
3  {'val_adj_close': 1.84}                            {'val_adj_close': 5.27}   
4  {'val_adj_close': 1.87}  {'val_adj_close': 5.26, 'noticias': [{'noticia...   

                       AAPL                      AAXJ  \
0   {'val_adj_close': 12.4}   {'val_adj_close': 43.2}   
1  {'val_adj_close': 12.47}  {'val_adj_close': 42.88}   

In [3]:
# Flag every cell whose serialized dict mentions "noticias". na=False maps
# missing cells to False so the mask is strictly boolean, and regex=False makes
# the search a plain substring match.
mask = df.apply(lambda col: col.str.contains("noticias", na=False, regex=False))

# (row, column) integer positions of the flagged cells, usable with df.iloc.
# .tolist() keeps the coordinates as plain Python ints.
indices = [tuple(position) for position in np.argwhere(mask.to_numpy(dtype=bool)).tolist()]

idx = []        # ((row, column), k): k-th headline kept in that cell
titulares = []  # headline texts, aligned element-for-element with idx

for position in indices:
    # literal_eval only parses Python literals; it does not execute code.
    cell = ast.literal_eval(df.iloc[position])

    # Keep the first entry for each headline text, preserving order.
    # The list is rebuilt instead of calling list.remove(): remove() deletes
    # the *first* equal dict, which reorders the cell whenever a duplicate is
    # identical to an earlier entry and breaks the (cell, k) -> headline
    # mapping recorded in idx.
    kept = []
    noticias = []
    for noticia in cell["noticias"]:
        x = noticia["noticia"]
        if x not in noticias:
            noticias.append(x)
            kept.append(noticia)
    cell["noticias"] = kept

    # Write the de-duplicated dict back in its serialized (string) form.
    df.iloc[position] = str(cell)
    idx.extend((position, k) for k in range(len(kept)))
    titulares.extend(noticias)

df.to_csv("full_dataset_removed_duplicates.csv", encoding="utf-8")

In [4]:
# Dimensions (rows, columns) of `df` at this point in the notebook.
df.shape

(2260, 1150)

In [56]:
# One row per headline: "idx" holds the lookup key collected in `idx`,
# "text" the matching entry of `titulares`. The table is also saved to disk.
headline_columns = {"idx": idx, "text": titulares}
X = pd.DataFrame(headline_columns)
X.to_csv("dataset_to_test.csv", encoding="utf-8")

In [47]:
def generate_test_prompt(data_point):
    """Build the sentiment-classification prompt for a single headline.

    Args:
        data_point: Mapping (e.g. a DataFrame row) whose "text" entry is the
            headline to classify.

    Returns:
        The instruction text followed by the headline in square brackets and a
        trailing "=", with leading and trailing whitespace stripped.
    """
    headline = data_point["text"]
    # The indentation inside the literal is part of the prompt text itself,
    # so it is kept exactly as written.
    prompt = f"""
            Analyze the sentiment of the news headline enclosed in square brackets, 
            determine if it is positive, neutral, or negative, and return the answer as 
            the corresponding sentiment label "positive" or "neutral" or "negative".

            [{headline}] = """
    return prompt.strip()

# Render one prompt per row of X, then wrap the result in a single-column
# frame ("text") and in a Hugging Face Dataset built from that frame.
prompts = X.apply(generate_test_prompt, axis=1)
X_eval = pd.DataFrame(prompts, columns=["text"])

eval_data = Dataset.from_pandas(X_eval)

In [45]:
def predict(X_eval, model, tokenizer):
    """Label every prompt in ``X_eval`` as positive, negative or neutral.

    Args:
        X_eval: DataFrame with a "text" column holding one prompt per row.
        model: Causal language model handed to the text-generation pipeline.
        tokenizer: Tokenizer handed to the same pipeline.

    Returns:
        A list with one label per row, in row order: "positive", "negative",
        "neutral", or "none" when the generated text contains none of them.
    """
    # The pipeline does not depend on the prompt, so it is built once instead
    # of once per row. do_sample=False requests greedy decoding explicitly;
    # temperature=0.0 is not a valid way to ask for it (it is ignored when
    # sampling is off and rejected/ill-defined when sampling is on).
    pipe = pipeline(task="text-generation",
                    model=model,
                    tokenizer=tokenizer,
                    max_new_tokens=1,
                    do_sample=False,
                    )

    y_pred = []
    for prompt in tqdm(X_eval["text"].tolist()):
        result = pipe(prompt)
        # Keep only what follows the last "="; the prompts built in this
        # notebook end with "=", so this is presumably the model's continuation.
        answer = result[0]['generated_text'].split("=")[-1]
        # First matching label wins, in the same priority order as before.
        for label in ("positive", "negative", "neutral"):
            if label in answer:
                y_pred.append(label)
                break
        else:
            y_pred.append("none")
    return y_pred

In [52]:
model_path = "model/"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Use half precision only on GPU. NOTE(review): float16 on CPU is presumably
# why the CPU run recorded in this notebook stalled; float32 is the safe CPU
# default but doubles the memory needed for the weights -- confirm it fits.
compute_dtype = torch.float16 if device.type == "cuda" else torch.float32
print(f"working on {device}")

# Tokenizer and model are both read from the local checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=compute_dtype,
    return_dict=False,
    low_cpu_mem_usage=True,
    device_map=device,
)

working on cpu


Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 7/7 [01:47<00:00, 15.34s/it]


In [54]:
# Run the classifier on the first ten prompts only.
y_pred = predict(X_eval.iloc[:10], model, tokenizer)

  0%|                                                                                           | 0/10 [01:38<?, ?it/s]


KeyboardInterrupt: 