In [2]:
# Load libraries
# !pip install -r /work/NLP_IMDb_Exam/requirements.txt
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import datasets
import evaluate
import seaborn as sns
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModel
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from sentence_transformers import SentenceTransformer


In [3]:
# Registry of selectable models. "name" is used to build output paths and
# "huggingface" is the checkpoint id handed to SentenceTransformer.
model_dict = {
    1: {
        "name": "Time_Vector",
        "huggingface": "sentence-transformers/all-mpnet-base-v2",
    },
}
# Key into model_dict selecting the active entry
Chosen_Model = 1

# Read the input table and keep every column except the last three
# (presumably non-embedding metadata such as 'rating' / 'average_rating' — verify against the CSV).
data_path = '../Data/MPNET_base/MPNET_base.csv'
active_dataframe = pd.read_csv(data_path)
embeddings = active_dataframe.iloc[:, :-3]

In [4]:
# Display the input path that was loaded above as a quick sanity check
data_path

'../Data/MPNET_base/MPNET_base.csv'

In [5]:
def positive_to_negative_vector(Positive, Negative):
    """
    Build the direction pointing from the negative group towards the positive group.

    Args:
        Positive: DataFrame with one sample per row.
        Negative: DataFrame with one sample per row and the same columns.

    Returns:
        Single-row DataFrame (index 0) holding, per column, the mean of
        ``Positive`` minus the mean of ``Negative``.
    """
    # Column-wise centroids of both groups, subtracted label by label
    centroid_gap = Positive.mean() - Negative.mean()
    # Series -> one-row frame so callers can treat it like any other embedding row
    return pd.DataFrame(centroid_gap.to_frame().T)

In [6]:
# Split the embeddings into clearly positive and clearly negative reviews.
embeddings = active_dataframe.iloc[:, :-3]
positive = embeddings.loc[active_dataframe['rating'].gt(8)]  # rating above 8 (i.e. 9 or 10)
negative = embeddings.loc[active_dataframe['rating'].lt(3)]  # rating below 3 (i.e. 1 or 2)

# Direction between the two group means; note it is computed on the full,
# not-yet-truncated groups.
PosNeg_vector = positive_to_negative_vector(Positive=positive, Negative=negative)

# Balance the groups: keep only the first rows of the larger one so both
# end up with the size of the smaller one.
min_length = min(len(positive), len(negative))
positive = positive.head(min_length)
negative = negative.head(min_length)

print(positive.shape, negative.shape, sep="\n")

(11733, 768)
(11733, 768)


In [7]:
# First pole of the GPT-based direction: 20 short review texts stored as a
# NumPy string array (the "GPT" in the name suggests they were machine-generated
# — TODO confirm provenance).
Positive_GPT_Reviews = np.array([
    "An innovative thriller with a fresh take on a classic trope. The visuals are stunning, and the narrative keeps you on edge.",
    "A heartfelt drama that explores the complexities of modern relationships with a touch of humor.",
    "A bold and ambitious project that pushes the boundaries of storytelling, even if it stumbles at times.",
    "Packed with adrenaline-pumping action and a surprisingly emotional core.",
    "A visually dazzling experience that occasionally prioritizes style over substance.",
    "A poignant exploration of identity and belonging, delivered with wit and sensitivity.",
    "An immersive world brought to life with groundbreaking effects and memorable performances.",
    "A compelling mix of drama and suspense, though it falters in its final act.",
    "A powerful meditation on grief and resilience, featuring standout performances.",
    "An entertaining blend of comedy and drama that offers more depth than expected.",
    "A fast-paced adventure that keeps you guessing until the very end.",
    "A dark and gritty tale that subverts genre expectations with finesse.",
    "A character-driven narrative that tugs at the heartstrings while offering profound insights.",
    "A satirical comedy that doesn’t shy away from tackling tough subjects with biting humor.",
    "An emotionally charged story that explores the sacrifices made in pursuit of a dream.",
    "A visually stunning piece with a hauntingly beautiful score.",
    "A sharp and cleverly written script that keeps the audience engaged throughout.",
    "A unique blend of genres that creates a truly unforgettable experience.",
    "A gripping tale of survival with breathtaking cinematography.",
    "A thoughtful exploration of morality wrapped in an intense thriller."
])

# Second pole of the GPT-based direction: 20 short review texts.
# NOTE(review): despite the "Negative" name, these texts are favourable in tone
# and talk about older/classic films ("timeless classic", "bygone era",
# "in its day"). Together with the model name "Time_Vector" this suggests the
# pair encodes a modern-vs-classic contrast rather than sentiment — confirm the
# intent before interpreting the resulting direction as positive/negative.
Negative_GPT_Reviews = np.array([
    "A timeless classic that set the standard for its genre, filled with unforgettable performances.",
    "A groundbreaking work that influenced countless filmmakers and continues to inspire.",
    "A charming story that captures the essence of a bygone era with grace and style.",
    "A riveting drama that explores universal themes with remarkable depth.",
    "An iconic performance that elevates a simple story into a masterpiece.",
    "A visually striking film that pioneered techniques still in use today.",
    "A poignant reflection on human nature, delivered with subtlety and nuance.",
    "A thrilling tale that keeps audiences captivated from start to finish.",
    "A beautifully crafted story that has stood the test of time.",
    "An unforgettable musical score paired with stunning visuals makes this a true classic.",
    "A deeply emotional narrative that resonates across generations.",
    "A tale of love and sacrifice that remains relevant and touching.",
    "A technically brilliant film that revolutionized the industry.",
    "A cultural milestone that continues to influence popular culture.",
    "A gripping story brought to life with stellar performances and masterful direction.",
    "An inspiring story of hope and perseverance against all odds.",
    "A daring and innovative film that broke new ground in its day.",
    "A nostalgic journey into the golden age of cinema, filled with memorable moments.",
    "A perfect balance of humor, drama, and heart that has enchanted audiences for decades.",
    "A stunning achievement in storytelling, unmatched in its era."
])

In [8]:
# Embed both review sets with the chosen sentence-transformer checkpoint.
# Model selection reference: https://www.sbert.net/docs/sentence_transformer/pretrained_models.html
# Use the GPU when one is available and fall back to CPU otherwise, so the
# cell also runs on machines without CUDA instead of raising at model load.
transformer_model = SentenceTransformer(
    model_dict[Chosen_Model]["huggingface"],
    device="cuda" if torch.cuda.is_available() else "cpu",
)
# One row per review, one column per embedding dimension
Positive_GPT_Embeddings = pd.DataFrame(transformer_model.encode(Positive_GPT_Reviews))
Negative_GPT_Embeddings = pd.DataFrame(transformer_model.encode(Negative_GPT_Reviews))
# Single-row frame: mean embedding of the first set minus mean embedding of the second
PosNeg_GPT = positive_to_negative_vector(Positive=Positive_GPT_Embeddings, Negative=Negative_GPT_Embeddings)
PosNeg_GPT

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.015399,-0.010045,-0.003328,0.002149,-0.010337,0.006522,-0.014409,-0.0059,0.011436,0.01878,...,0.024741,-0.017696,-0.007649,0.001909,-0.005377,0.006594,0.013565,0.003206,0.01395,0.006252


In [9]:
def project_matrix_to_vector(matrix, vector):
    """Project every row of ``matrix`` onto the line spanned by ``vector``.

    Args:
        matrix: DataFrame-like object exposing ``to_numpy()``; one vector per
            row (M rows, D columns).
        vector: DataFrame-like object exposing ``to_numpy()``; only its first
            row (length D) is used as the direction to project onto.

    Returns:
        pandas.DataFrame of shape (M, D) with default integer labels; row i is
        the orthogonal projection of row i of ``matrix`` onto the direction.
    """
    rows = matrix.to_numpy()
    direction = vector.to_numpy()[0]  # first row only

    # Coefficient of each row along the direction: (row . d) / (d . d)
    coefficients = rows.dot(direction) / direction.dot(direction)

    # Scale the direction by each coefficient -> one projected vector per input row
    return pd.DataFrame(np.outer(coefficients, direction))

In [10]:
def express_matrix_by_vector(matrix, vector):
    """Project the rows of ``matrix`` onto ``vector`` and return their 1-D coordinates.

    Args:
        matrix: DataFrame-like object exposing ``to_numpy()``; one vector per
            row (M rows, D columns).
        vector: DataFrame-like object exposing ``to_numpy()``; its first row
            (length D) is the direction to project onto.

    Returns:
        projection: DataFrame of shape (M, D); row i is the orthogonal
            projection of row i of ``matrix`` onto the direction.
        projection_in_1D_subspace: Series of length M holding the signed
            length of each projection along the unit direction, i.e.
            ``(row . v) / ||v||``.
    """
    m = matrix.to_numpy()
    v = vector.to_numpy()[0]  # direction = first row of `vector`

    dots = np.dot(m, v)

    # Orthogonal projection of every row: ((row . v) / (v . v)) * v
    projection = pd.DataFrame(np.outer(dots / np.dot(v, v), v))

    # Coordinate along the unit direction, taken straight from the dot
    # products. Recovering it as projection[:, 0] / unit_vector[0] yields
    # 0/0 = NaN whenever the direction's first component is zero and is
    # numerically unstable when that component is tiny.
    coordinates = dots / np.linalg.norm(v)
    # name=0 keeps the label of a first-column slice of `projection`
    projection_in_1D_subspace = pd.Series(coordinates, index=projection.index, name=0)

    return projection, projection_in_1D_subspace

# Saving outputs for future use

In [11]:
# Remove the GPT-derived direction from every embedding and save the result.
projected_variance, projection_in_1D_subspace = express_matrix_by_vector(matrix=embeddings, vector=PosNeg_GPT)
# Residual embeddings: each original row minus its component along PosNeg_GPT
posneg_GPT_corrected_embeddings = pd.DataFrame(embeddings.to_numpy() - projected_variance.to_numpy())
# Keep the removed component as a single coordinate, next to the rating columns
posneg_GPT_corrected_embeddings['posneg_subspace'] = projection_in_1D_subspace
posneg_GPT_corrected_embeddings['rating'] = active_dataframe['rating']
posneg_GPT_corrected_embeddings['average_rating'] = active_dataframe['average_rating']
save_corrected = f'../Data/{model_dict[Chosen_Model]["name"]}/{model_dict[Chosen_Model]["name"]}_GPT_corrected.csv'
# to_csv does not create missing folders; make sure the target directory
# exists so the cell does not fail after all the work above.
Path(save_corrected).parent.mkdir(parents=True, exist_ok=True)
posneg_GPT_corrected_embeddings.to_csv(save_corrected, index=False)