# Setup

In [1]:
from openai import OpenAI
import pandas as pd
from src.utils import (
    evaluate_category,
    evaluate_incident,
    evaluate_sentiment,
    load_data_preprocessed,
    get_final_sentiment
)
from src.incidents_and_categories import (
    get_category_from_gpt,
    get_incident_from_gpt
)
from src.actaware_data_preprocessing import (
    load_original_data
)
from src.config import OPEN_AI_KEY

  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\akaga\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
client = OpenAI(api_key=OPEN_AI_KEY)

In [3]:
chosen_dataset="processed_human_data" #"not_preprocessed_data", "processed_4o_data", "processed_regex_data", "processed_human_data"

In [2]:
number_of_raw_data=10
number_of_preprocessed_data=10

# Load data

## Load data - preselected and preprocessed

In [5]:
data_gt, data_df = load_data_preprocessed(chosen_dataset)
data_df_small = data_df[:number_of_preprocessed_data]
data_gt_small = data_gt[:number_of_preprocessed_data]

### Category and incident recognition

In [6]:
category_for_chosen=get_category_from_gpt(data_df_small, client=client)
incident_for_chosen=get_incident_from_gpt(data_df_small, client=client)
incident_score_list, incident_score_mean=evaluate_incident(data_df_small, incident_for_chosen[0])

In [7]:
print(f"Score for category: {evaluate_category(data_gt_small, category_for_chosen[0])}, score for incident: {incident_score_mean}")

Score for category: 0.75, score for incident: 0.5084992170333862


In [8]:
incident_for_chosen

(['Workplace bullying',
  'Labor dispute incident',
  'Worker exploitation allegations',
  'Workplace tensions',
  'Worker protests Amazon',
  'Workplace racism',
  'BBC diversity spending',
  'Sexism in finance',
  'Environmental protest',
  'Carbon removal agreement'],
 [np.float64(84.4),
  np.float64(77.6),
  np.float64(82.0),
  np.float64(97.6),
  np.float64(83.4),
  np.float64(70.9),
  np.float64(43.9),
  np.float64(99.9),
  np.float64(41.1),
  np.float64(100.0)])

In [9]:
incident_score_list

[0.48982951045036316,
 0.48585546016693115,
 0.3569790720939636,
 0.4798167645931244,
 0.5508518815040588,
 0.46387800574302673,
 0.7214587926864624,
 0.44444540143013,
 0.48493921756744385,
 0.6069380640983582]

### Sentiment

In [10]:
sentiment_final=get_final_sentiment(data_df_small)

Some weights of the model checkpoint at textattack/roberta-base-SST-2 were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
print(f"Score for sentiment: {evaluate_sentiment(data_gt_small, sentiment_final)}")

Score for sentiment: 0.375


In [12]:
sentiment_final

['negative',
 'neutral',
 'neutral',
 'positive',
 'neutral',
 'negative',
 'neutral',
 'neutral',
 'neutral',
 'positive']

### Final answer of the models

In [13]:
df_answers = pd.DataFrame(columns=['Article', 'Company', 'Category', 'Incident', 'Sentiment'])
df_answers['Article']=data_df_small
df_answers['Company']=data_gt_small['Their_Matched_Companies']
df_answers['Category']=category_for_chosen[0]
df_answers['Incident']=incident_for_chosen[0]
df_answers['Sentiment']=sentiment_final
df_answers

Unnamed: 0,Article,Company,Category,Incident,Sentiment
0,People work in the Amazon Fulfillment Center i...,Amazon,human_employee_rights,Workplace bullying,negative
1,A federal agency is seeking to force Starbucks...,Starbucks,human_employee_rights,Labor dispute incident,neutral
2,You might have seen a new energy drink on Amaz...,Amazon,human_employee_rights,Worker exploitation allegations,neutral
3,The BBC's director-general has tried to calm t...,BBC,diversity_equity_inclusion,Workplace tensions,positive
4,Amazon is running a competition to give its br...,Amazon,human_employee_rights,Worker protests Amazon,neutral
5,Nihal Arthanayake says he saw 'a lack of diver...,BBC,diversity_equity_inclusion,Workplace racism,negative
6,The BBC has been slammed after it emerged it i...,BBC,diversity_equity_inclusion,BBC diversity spending,neutral
7,The boss of Aviva has revealed senior white ma...,Aviva,diversity_equity_inclusion,Sexism in finance,neutral
8,The British Museum has secured a £50m donation...,BP,environment,Environmental protest,neutral
9,Carbon removal solutions provider Carbonfuture...,Microsoft,environment,Carbon removal agreement,positive


## Load data - original

In [3]:
original_data=load_original_data(first_rows=number_of_raw_data)
original_data

Unnamed: 0,ContentRaw,MatchedCompanies
0,The BBC is being urged to drop singer Olly Ale...,[BBC]
1,"Olly Alexander, the UK's new Eurovision act, h...",[Eurovision]
2,A union representative claims they were barred...,[McDonald's]
3,The Duke invited a gun smuggler and an alleged...,[Goldman Sachs]
4,Airbnb is set to use artificial intelligence t...,[Airbnb]
5,Nihal Arthanayake says he saw 'a lack of diver...,[BBC]
6,Two women have emerged as front-runners to tak...,[BP]
7,A top Tesla executive has revealed the company...,[Tesla]
8,DETROIT – General Motors has cut the size of i...,[GM]
9,Thanks for signing up to the Morning Headline...,[BP]


### Category and incident recognition

In [15]:
category_for_original=get_category_from_gpt(original_data['ContentRaw'], client=client)
incident_for_original=get_incident_from_gpt(original_data['ContentRaw'], client=client)

In [16]:
incident_for_original

(['Eurovision controversy incident',
  'Eurovision controversy',
  'Rat infestation allegations',
  'Fraud, Smuggling, Settlement',
  'Unauthorized parties ban',
  'Workplace racism',
  'Corporate leadership change',
  'Wireless charging development',
  'Dealership buyouts',
  'Environmental protest'],
 [np.float64(40.3),
  np.float64(20.2),
  np.float64(98.8),
  np.float64(89.3),
  np.float64(86.5),
  np.float64(74.0),
  np.float64(43.7),
  np.float64(99.0),
  np.float64(99.2),
  np.float64(61.3)])

In [17]:
incident_score_list, incident_score_mean=evaluate_incident(original_data['ContentRaw'], incident_for_chosen[0])
print(f"Score for incident: {incident_score_mean}")

Score for incident: 0.28473458141088487


### Sentiment

In [18]:
sentiment_final_original=get_final_sentiment(original_data['ContentRaw'])

Some weights of the model checkpoint at textattack/roberta-base-SST-2 were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Final answer of the model

In [19]:
df_answers_original = pd.DataFrame(columns=['Article', 'Company', 'Category', 'Incident', 'Sentiment'])
df_answers_original['Article']=original_data['ContentRaw']
df_answers_original['Company']=original_data['MatchedCompanies']
df_answers_original['Category']=category_for_original[0]
df_answers_original['Incident']=incident_for_original[0]
df_answers_original['Sentiment']=sentiment_final_original
df_answers_original

Unnamed: 0,Article,Company,Category,Incident,Sentiment
0,The BBC is being urged to drop singer Olly Ale...,[BBC],political_and_religious_views,Eurovision controversy incident,negative
1,"Olly Alexander, the UK's new Eurovision act, h...",[Eurovision],political_and_religious_views,Eurovision controversy,negative
2,A union representative claims they were barred...,[McDonald's],human_employee_rights,Rat infestation allegations,negative
3,The Duke invited a gun smuggler and an alleged...,[Goldman Sachs],corporate_transparency,"Fraud, Smuggling, Settlement",negative
4,Airbnb is set to use artificial intelligence t...,[Airbnb],corporate_transparency,Unauthorized parties ban,neutral
5,Nihal Arthanayake says he saw 'a lack of diver...,[BBC],diversity_equity_inclusion,Workplace racism,negative
6,Two women have emerged as front-runners to tak...,[BP],diversity_equity_inclusion,Corporate leadership change,negative
7,A top Tesla executive has revealed the company...,[Tesla],environment,Wireless charging development,positive
8,DETROIT – General Motors has cut the size of i...,[GM],business_involvement,Dealership buyouts,neutral
9,Thanks for signing up to the Morning Headline...,[BP],environment,Environmental protest,positive
