In [2]:
import os, sys, json
import scipy.stats
import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset


In [3]:
# get the data from huggingface
dataset = load_dataset("yelp_review_full")
dataset_test = dataset["test"]



In [4]:
def test_significance(df_train, metric1, metric2):
    correlation, p_value = scipy.stats.spearmanr(df_train[metric1], df_train[metric2])
    print(f'Correlation between {metric1} and {metric2}')
    print(f"Correlation coefficient: {correlation}")
    print(f"P-value: {p_value}")
    significant = False
    if p_value < 0.05:
        significant = True 
    return {
                    "metric1": metric1,
                    "metric2": metric2,
                    "correlation": correlation, 
                    "p-value": p_value, 
                    "signficant": significant
                }

def plot(df_train, column1, column2, prefix='Raw'):
    plt.figure(figsize=(8, 6))  
    # plt.scatter(df_train[column1], df_train[column2], color='blue', alpha=0.6) 
    sns.regplot(x = df_train[column1], y = df_train[column2], color='green', scatter_kws={'alpha':0.5})
    plt.title(f'Scatter Plot of {column1} vs {column2} on {prefix}')  
    plt.xlabel(f'{column1}')  
    plt.ylabel(f'{column2}')  
    plt.grid(True) 
    plt.savefig(f'./{column1}_{column2}_{prefix}_correlation.png', dpi=300)

### Prepare ground truth

In [5]:
# convert to json
data = []
for i in range(len(dataset_test)):
    data.append({"text": dataset_test[i]["text"], "gt_label": dataset_test[i]["label"]})

In [6]:
data_df = {x['text']: x for x in data}

### Append your baseline results

In [7]:
path_predicted_labels = "/home/anmola/assignments_hw/sentiment-cartography/v2_dir_512_large/inference_results.json"
path_logit_scores = "/home/anmola/assignments_hw/sentiment-cartography/single_v2_dir_512_large/inference_results.json"

In [8]:
with open(path_predicted_labels, "r") as f:
    predicted_df = json.load(f)
with open(path_logit_scores, "r") as f:
    logit_df = json.load(f)

In [9]:
def find_matched_pairs(s1, arr_s):
    # find indexes of all elements in arr_s that s1 is a substring of
    matched_pairs = []
    for i, s in enumerate(arr_s):
        if s1 in s:
            matched_pairs.append(i)
    assert(len(matched_pairs) > 0)
    assert(len(matched_pairs)==1)
    return matched_pairs

In [10]:
all_keys = list(data_df.keys())
for elem in tqdm.tqdm(predicted_df):
    curr_review_text = elem["review"]
    if curr_review_text in data_df:
        matched_review_text = curr_review_text
    else:
        matched_key_idx = find_matched_pairs(curr_review_text, all_keys)
        matched_review_text = all_keys[matched_key_idx[0]]
    assert("multi_class_predicted_label" not in data_df[matched_review_text])
    data_df[matched_review_text]["multiclass_predicted_label"] = elem["predicted_label"]

100%|██████████| 50000/50000 [00:12<00:00, 3923.89it/s]


In [11]:
for elem in tqdm.tqdm(logit_df):
    curr_review_text = elem["review"]
    if curr_review_text in data_df:
        matched_review_text = curr_review_text
    else:
        matched_key_idx = find_matched_pairs(curr_review_text, all_keys)
        matched_review_text = all_keys[matched_key_idx[0]]
    assert("single_class_logit_score" not in data_df[matched_review_text])
    logits = elem["logit_scores"]
    softmax_scores = scipy.special.softmax(logits)
    assert(len(softmax_scores) == 2)
    data_df[matched_review_text]["single_class_logit_score"] = softmax_scores[1]
    #break

100%|██████████| 50000/50000 [00:13<00:00, 3803.53it/s]


In [12]:
### Load Diddee's api scores
api_scores_path = "/home/anmola/assignments_hw/sentiment-cartography/data/didee_google_results.jsonl"
api_scores_data = []
with open(api_scores_path, "r") as f:
    for line in f:
        api_scores_data.append(json.loads(line))

In [13]:
api_scores_data[0]

{'input': "dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank.",
 'score': 0.6520000100135803,
 'magnitude': 3.815000057220459,
 'label': 4}

In [15]:
# for elem in api_scores_data:
#     curr_review_text = elem["input"]
#     if curr_review_text in data_df:
#         matched_review_text = curr_review_text
#     else:
#         matched_key_idx = find_matched_pairs(curr_review_text, all_keys)
#         matched_review_text = all_keys[matched_key_idx[0]]
#     assert("google_api_score" not in data_df[matched_review_text])
#     data_df[matched_review_text]["google_api_score"] = elem["score"]
#     assert("google_api_magnitude" not in data_df[matched_review_text])
#     data_df[matched_review_text]["google_api_magnitude"] = elem["magnitude"]
   

## Merge Kwang's data

In [29]:
polar_coordinates_csv_path = "/home/anmola/assignments_hw/sentiment-cartography/data/kwanghee_polar_coordinates.csv"
polar_coordinates_df = pd.read_csv(polar_coordinates_csv_path)

# retain only those values in `split=test`
polar_coordinates_df = polar_coordinates_df[polar_coordinates_df["split"] == "test"]
# rename columns
rename_mapping = {'pos':"pos_loss", "neg": "neg_loss", "ori": "base_loss"}
for key in rename_mapping:
    polar_coordinates_df = polar_coordinates_df.rename(columns={key: rename_mapping[key]})

In [30]:
polar_coordinates_df.columns

Index(['pos_loss', 'neg_loss', 'base_loss', 'text', 'label', 'split'], dtype='object')

In [31]:
polar_coordinates_df

Unnamed: 0,pos_loss,neg_loss,base_loss,text,label,split
650000,3.637170,3.423840,4.001163,I got 'new' tires from them and within two wee...,0,test
650001,3.356954,3.033727,4.223260,Don't waste your time. We had two different p...,0,test
650002,3.785630,3.524564,4.136008,All I can say is the worst! We were the only 2...,0,test
650003,3.353036,2.869883,3.647495,I have been to this restaurant twice and was d...,0,test
650004,3.176172,2.873306,3.618195,Food was NOT GOOD at all! My husband & I ate h...,0,test
...,...,...,...,...,...,...
699995,3.798547,3.646442,3.949870,Just wanted to write a review to chip in with ...,0,test
699996,2.362278,3.172081,3.590026,Great ambience. Great drinks. Great food. I lo...,4,test
699997,3.373806,3.480679,4.198580,I have been to the other Monks locations so I ...,3,test
699998,3.848746,3.583591,4.204350,Don't go here. I know you might want to try i...,1,test


In [32]:
def _get_polar(pos_loss, neg_loss, base_loss):
    pos = np.exp(pos_loss - base_loss)
    neg = np.exp(neg_loss- base_loss)
    r = np.sqrt((pos ** 2 + neg ** 2))
    theta = np.arctan(pos / neg)
    theta_in_degrees = np.degrees(theta)
    return r, theta, theta_in_degrees

In [33]:
# add new columns ie polar_radius, polar_theta
for row in polar_coordinates_df.iterrows():
    r, theta, theta_in_degrees = _get_polar(row[1]["pos_loss"], row[1]["neg_loss"], row[1]["base_loss"])
    polar_coordinates_df.at[row[0], "polar_radius"] = r
    polar_coordinates_df.at[row[0], "polar_theta"] = theta
    polar_coordinates_df.at[row[0], "polar_theta_deg"] = theta_in_degrees

In [34]:
polar_coordinates_df

Unnamed: 0,pos_loss,neg_loss,base_loss,text,label,split,polar_radius,polar_theta,polar_theta_deg
650000,3.637170,3.423840,4.001163,I got 'new' tires from them and within two wee...,0,test,0.893337,0.891263,51.065621
650001,3.356954,3.033727,4.223260,Don't waste your time. We had two different p...,0,test,0.519094,0.944269,54.102620
650002,3.785630,3.524564,4.136008,All I can say is the worst! We were the only 2...,0,test,0.889150,0.914473,52.395453
650003,3.353036,2.869883,3.647495,I have been to this restaurant twice and was d...,0,test,0.875254,1.018090,58.332252
650004,3.176172,2.873306,3.618195,Food was NOT GOOD at all! My husband & I ate h...,0,test,0.799081,0.934568,53.546802
...,...,...,...,...,...,...,...,...,...
699995,3.798547,3.646442,3.949870,Just wanted to write a review to chip in with ...,0,test,1.133103,0.861159,49.340771
699996,2.362278,3.172081,3.590026,Great ambience. Great drinks. Great food. I lo...,4,test,0.720631,0.418643,23.986467
699997,3.373806,3.480679,4.198580,I have been to the other Monks locations so I ...,3,test,0.655790,0.732063,41.944125
699998,3.848746,3.583591,4.204350,Don't go here. I know you might want to try i...,1,test,0.883173,0.916449,52.508650


In [35]:
# convert to json
polar_coordinates_data = []
all_cols = list(polar_coordinates_df.columns)
for i in range(len(polar_coordinates_df)):
    polar_coordinates_data.append({x: polar_coordinates_df.iloc[i][x] for x in all_cols})

In [36]:
polar_coordinates_data[0]

{'pos_loss': 3.6371703147888175,
 'neg_loss': 3.423840284347534,
 'base_loss': 4.001162528991699,
 'text': 'I got \'new\' tires from them and within two weeks got a flat. I took my car to a local mechanic to see if i could get the hole patched, but they said the reason I had a flat was because the previous patch had blown - WAIT, WHAT? I just got the tire and never needed to have it patched? This was supposed to be a new tire. \\nI took the tire over to Flynn\'s and they told me that someone punctured my tire, then tried to patch it. So there are resentful tire slashers? I find that very unlikely. After arguing with the guy and telling him that his logic was far fetched he said he\'d give me a new tire \\"this time\\". \\nI will never go back to Flynn\'s b/c of the way this guy treated me and the simple fact that they gave me a used tire!',
 'label': 0,
 'split': 'test',
 'polar_radius': 0.8933369212390614,
 'polar_theta': 0.8912632147066474,
 'polar_theta_deg': 51.06562063795302}

In [44]:
for elem in polar_coordinates_data:
    curr_review_text = elem["text"]
    if curr_review_text in data_df:
        matched_review_text = curr_review_text
    else:
        matched_key_idx = find_matched_pairs(curr_review_text, all_keys)
        matched_review_text = all_keys[matched_key_idx[0]]
    cols_transfer = ["polar_radius", "polar_theta", "polar_theta_deg", 'pos_loss', 'neg_loss', 'base_loss']
    for col in cols_transfer:
        # assert(col not in data_df[matched_review_text])
        data_df[matched_review_text][col] = elem[col]


In [46]:
data_df[matched_review_text]

{'text': "Buffet was recently open after renovation so my husband and I are thinking it should be pretty good....wrong. We arrived at 8:30 am for what we thought was a Saturday brunch offering champs, nope. Price is $21.95 but on Monday - Friday is $18.95...same food so why the increase? I can go to the Fiesta in Henderson with champs and better food for $8.99 Sat/Sun but we wanted to try something we thought was going to be an upgrade....lol what a joke. \\n\\n\\nFood: everything I tried looked and tasted like food was leftover from the day before and reheated. \\n\\nShrimps were all water logged, the crab legs had a brownish color to them with an odd taste. \\n\\nPastries none..all offerings were donuts. They did have a fresh crepe bar, but then again looked very dry. I didn't want to stand on line for crepe paper. \\n\\nOverall I'm rating this one star because I have no choice.",
 'gt_label': 0,
 'multiclass_predicted_label': 1,
 'single_class_logit_score': 0.00017183152926799749,
 

In [43]:
elem['poss_loss']

KeyError: 'poss_loss'