In [6]:
from transformers import pipeline
import torch
import json
import os
import pandas as pd
from typing import List
import plotly.express as px

In [None]:
# Top-level tweet keys copied verbatim into the output rows.
RELEVANT_FIELDS = ['text','createdAt']
# Keys of the nested `author` object that are kept (stored with an `author_` prefix).
RELEVANT_AUTHOR_FIELDS = ['userName','description']

def extract_df_from_json_tweets_data(path_tweets):
    """Load every ``.json`` file in a directory into a single DataFrame.

    Each file is expected to contain a JSON list of tweet objects (dicts).
    For every tweet, the keys listed in ``RELEVANT_FIELDS`` are kept as-is,
    the keys of its nested ``author`` object listed in
    ``RELEVANT_AUTHOR_FIELDS`` are kept under an ``author_`` prefix, and
    ``src_file`` records the name of the file the tweet came from.

    Args:
        path_tweets: Path of the directory that holds the JSON files.

    Returns:
        A ``pandas.DataFrame`` with one row per tweet. Tweets that lack a
        field simply get a missing value in that column.
    """
    dict_list = []

    # Sorting the listing gives a reproducible row order; os.listdir() order
    # is arbitrary.
    for filename in sorted(os.listdir(path_tweets)):
        if not filename.endswith('.json'):
            continue

        # JSON is UTF-8 by specification; do not depend on the platform's
        # default text encoding.
        with open(os.path.join(path_tweets, filename), 'rt', encoding='utf-8') as f:
            curr_json_list = json.load(f)

        # For each single tweet in a JSON
        for curr_json in curr_json_list:
            # Extract only relevant top-level fields from the tweet
            relevant_json = {k: v for k, v in curr_json.items() if k in RELEVANT_FIELDS}

            # The author object must be read from the raw tweet: 'author' is
            # not part of RELEVANT_FIELDS, so the filtered dict never holds it.
            author = curr_json.get('author') or {}
            relevant_json_author = {
                f'author_{k}': v for k, v in author.items() if k in RELEVANT_AUTHOR_FIELDS
            }

            # Merge tweet and author fields and record the original file
            new_dict = {**relevant_json, **relevant_json_author}
            new_dict['src_file'] = filename
            dict_list.append(new_dict)

    df = pd.DataFrame(dict_list)
    return df

# Build the tweets DataFrame from the JSON files found under PATH_TWEETS.
# NOTE(review): PATH_TWEETS is not defined in the visible cells — confirm it is
# set (e.g. in a configuration cell) before this cell runs on a fresh kernel.
df = extract_df_from_json_tweets_data(PATH_TWEETS)

In [5]:

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)
# get_device_name() raises on a machine without CUDA, so only query the GPU
# name on the GPU path; the CPU fallback above must not crash the cell.
if device.type == "cuda":
    print(torch.cuda.get_device_name())

cuda
NVIDIA GeForce RTX 3060 Laptop GPU


In [None]:
# Multilingual NLI model used for zero-shot classification.
# Pass the device selected earlier explicitly so inference runs on the GPU
# when one is available instead of relying on the library's default placement.
classifier = pipeline(
    "zero-shot-classification",
    model="MoritzLaurer/mDeBERTa-v3-base-mnli-xnli",
    device=device,
)

In [None]:
def text_classification(classifier, texts, candidate_labels: List[str] = None):
    """Assign to each text the label the classifier ranks first.

    Args:
        classifier: Callable invoked as
            ``classifier(text, labels, multi_label=False)`` that returns a
            mapping whose ``"labels"`` entry is ordered best-first.
        texts: Iterable of texts to classify.
        candidate_labels: Labels to choose from. When omitted, defaults to
            ``["desvalorização", "valorização", "neutro"]``.

    Returns:
        A list with the top-ranked label for each text, in input order.
        An empty ``texts`` yields an empty list.
    """
    # A None sentinel replaces the mutable list default so that the default
    # labels cannot be altered across calls.
    if candidate_labels is None:
        candidate_labels = ("desvalorização", "valorização", "neutro")

    outputs = []

    for text in texts:
        # Hand the classifier its own copy of the labels: whatever it does
        # with the list cannot leak into later calls or the caller's list.
        output = classifier(text, list(candidate_labels), multi_label=False)
        outputs.append(output["labels"][0])

    return outputs

In [None]:
# Classify every tweet. The tweet body is stored in the "text" column (the
# DataFrame is built from RELEVANT_FIELDS); there is no "texts" column.
lista_opinioes = text_classification(classifier, df["text"], ["desvalorização", "valorização", "neutro"])

In [None]:
# Store the predicted label of each row in a new "opiniao" column
# (lista_opinioes is expected to have one entry per row of df).
df["opiniao"] = lista_opinioes

In [None]:
# Number of texts assigned to each label, shown as a bar chart.
contagem = df["opiniao"].value_counts()
fig = px.bar(
    data_frame=contagem,
    x=contagem.index,
    title="número de textos que indicam cada tendência",
)
fig.show()