In [1]:
# Load libraries
!pip install -r /work/NLP_IMDb_Exam/requirements.txt
import numpy as np
import pandas as pd
import torch
import datasets
import evaluate
import seaborn as sns
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModel
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from sentence_transformers import SentenceTransformer


Defaulting to user installation because normal site-packages is not writeable
Collecting datasets (from -r /work/NLP_IMDb_Exam/requirements.txt (line 1))
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate (from -r /work/NLP_IMDb_Exam/requirements.txt (line 2))
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting matplotlib (from -r /work/NLP_IMDb_Exam/requirements.txt (line 3))
  Downloading matplotlib-3.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting numpy (from -r /work/NLP_IMDb_Exam/requirements.txt (line 4))
  Downloading numpy-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting pandas (from -r /work/NLP_IMDb_Exam/requirements.txt (line 5))
  Downloading pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting scikit-learn (from -r /work/NLP_IMDb_Exam/requirements.txt (line 6))
  Downloading scikit_lear

2024-12-31 09:32:24.724267: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1735633944.738701    1333 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1735633944.743083    1333 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-31 09:32:24.759669: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Registry of the supported sentence-embedding models, keyed by an integer id.
# "name" is the folder / file stem used under ../Data, "huggingface" is the hub repo id.
model_dict = {
    model_key: {"name": model_name, "huggingface": hub_repo}
    for model_key, (model_name, hub_repo) in enumerate(
        [
            ("MiniLM_L6", "sentence-transformers/all-MiniLM-L6-v2"),
            ("MPNET_base", 'sentence-transformers/all-mpnet-base-v2'),
            ("Instructor", "hkunlp/instructor-large"),
        ],
        start=1,
    )
}

# Select which entry of model_dict the rest of the notebook works with.
Chosen_Model = 3

# Load the CSV stored for the chosen model.
data_path = f'../Data/{model_dict[Chosen_Model]["name"]}/{model_dict[Chosen_Model]["name"]}.csv'
active_dataframe = pd.read_csv(data_path)

In [3]:
# Display the resolved CSV path to confirm which model's data was loaded.
data_path

'../Data/Instructor/Instructor.csv'

In [9]:
def positive_to_negative_vector(Positive, Negative):
    """
    Return the difference between the column-wise means of two DataFrames.

    Args:
        Positive: DataFrame whose rows are the "positive" samples.
        Negative: DataFrame whose rows are the "negative" samples; expected to
            share its column labels with `Positive`.

    Returns:
        A single-row DataFrame (row label 0) holding mean(Positive) - mean(Negative)
        for every column.
    """
    mean_difference = Positive.mean() - Negative.mean()
    # to_frame() yields one column labelled 0; transposing turns it into one row.
    return mean_difference.to_frame().T

# Generalise embeddings
# Use the GPU when one is visible, otherwise fall back to the CPU so the
# notebook still runs (more slowly) on machines without CUDA.
encoder_device = "cuda" if torch.cuda.is_available() else "cpu"
transformer_model = SentenceTransformer(model_dict[Chosen_Model]["huggingface"], device=encoder_device)

# Only the Instructor model (key 3 in model_dict) is called with an instruction
# prompt; every other model is encoded with the library defaults.
if Chosen_Model == 3:
    encode_options = {"prompt": "Represent the movie review for classifying the corresponding movie rating: "}
else:
    encode_options = {}


def my_encoder(corpus):
    """Embed `corpus` with the chosen model and return the result as a DataFrame.

    Args:
        corpus: the texts to embed; passed unchanged to
            `transformer_model.encode`.

    Returns:
        DataFrame built from the array returned by `transformer_model.encode`
        (one row per text is expected — TODO confirm for single-string input).
    """
    return pd.DataFrame(transformer_model.encode(corpus, **encode_options))

In [10]:
# Split the IMDb embeddings into clearly positive and clearly negative reviews.
# The last three columns of the loaded frame are not treated as embedding
# dimensions (presumably metadata such as the ratings — TODO confirm).
embeddings = active_dataframe.iloc[:, :-3]
positive = embeddings.loc[active_dataframe['rating'] > 8]  # ratings 9 and 10
negative = embeddings.loc[active_dataframe['rating'] < 3]  # ratings 1 and 2

# Direction pointing from the mean negative embedding to the mean positive one.
# NOTE(review): this is computed on the full subsets; the truncation below
# happens afterwards and therefore does not influence PosNeg_vector.
PosNeg_vector = positive_to_negative_vector(positive, negative)

# Cut both subsets down to the size of the smaller one.
min_length = min(map(len, (positive, negative)))
positive = positive.head(min_length)
negative = negative.head(min_length)

print(positive.shape)
print(negative.shape)

(11733, 768)
(11733, 768)


In [11]:
# Build the same positive-minus-negative direction from Yelp polarity reviews.
splits = {'train': 'plain_text/train-00000-of-00001.parquet', 'test': 'plain_text/test-00000-of-00001.parquet'}
yelp_reviews = pd.read_parquet("hf://datasets/fancyzhx/yelp_polarity/" + splits["train"])

# First 1000 review texts per class; label 1 is used as positive, label 0 as negative.
Positive_yelp_Reviews = yelp_reviews.loc[yelp_reviews['label'] == 1, 'text'].values[:1000]
Negative_yelp_Reviews = yelp_reviews.loc[yelp_reviews['label'] == 0, 'text'].values[:1000]

Positive_yelp_embeddings, Negative_yelp_embeddings = (
    my_encoder(review_texts)
    for review_texts in (Positive_yelp_Reviews, Negative_yelp_Reviews)
)
PosNeg_yelp = positive_to_negative_vector(Positive_yelp_embeddings, Negative_yelp_embeddings)
PosNeg_yelp

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.004887,-0.011106,0.00481,0.00773,-0.010989,-0.013034,0.006403,-0.002936,0.017168,0.0108,...,0.000735,0.005432,-0.011673,-0.000681,-0.001782,0.013933,-0.002455,0.005975,0.006735,-0.005863


In [12]:
# Twenty hand-listed positive movie-review sentences, stored as a NumPy array of strings.
# NOTE(review): presumably generated with a GPT model (going by the variable name);
# the exact model and prompt are not recorded in this notebook.
Positive_GPT_Reviews = np.array([
    "This film redefines the action genre, delivering heart-pounding sequences and jaw-dropping stunts. A must-watch for adrenaline junkies!",
    "A deeply moving tale that tugs at the heartstrings and leaves you with a renewed sense of hope. Truly unforgettable.",
    "The visual effects are nothing short of breathtaking. Every frame is a work of art that immerses you completely.",
    "The cast's chemistry and brilliant performances bring the story to life in the most authentic and engaging way.",
    "A cinematic masterpiece with impeccable direction that seamlessly blends drama, suspense, and emotion.",
    "The characters are so well-developed and relatable that you can't help but get invested in their journey.",
    "An absolute laugh riot from start to finish! The witty dialogue and hilarious antics are sure to leave you in stitches.",
    "The world-building in this movie is unparalleled. Every detail is carefully crafted, creating a universe you'll never want to leave.",
    "A powerful and inspiring story that leaves you motivated to chase your dreams and overcome any obstacles.",
    "The music complements the story beautifully, elevating emotional moments and adding depth to every scene.",
    "This movie keeps you on the edge of your seat with its clever twists and turns. A gripping ride you won't forget.",
    "A refreshing take on a familiar theme, offering a perspective that feels both innovative and deeply resonant.",
    "Perfect for audiences of all ages, this movie delivers laughter, lessons, and love in equal measure.",
    "The cinematography is a visual feast, capturing both the grandeur of the setting and the intimacy of the characters' emotions.",
    "A delightful story that warms your heart and reminds you of the simple joys in life.",
    "An exhilarating journey filled with excitement, danger, and triumph. An epic adventure for the ages.",
    "The actors' raw and genuine performances make you forget you're watching a movie. Pure artistry.",
    "This film masterfully combines humor, drama, and action, making it a rollercoaster of emotions from start to finish.",
    "A vibrant celebration of culture and tradition, beautifully portrayed with authenticity and reverence.",
    "A film that transcends time with its universal themes and captivating storytelling. Destined to become a classic."
])

# Twenty hand-listed negative movie-review sentences, stored as a NumPy array of strings.
# NOTE(review): presumably generated with a GPT model (going by the variable name);
# the exact model and prompt are not recorded in this notebook.
Negative_GPT_Reviews = np.array([
    "This movie lacks any sense of direction, leaving the audience confused and frustrated.",
    "The storyline is painfully predictable, offering nothing new or exciting.",
    "Poorly written characters make it impossible to care about what happens to them.",
    "The acting is wooden and emotionless, making every scene feel forced and lifeless.",
    "A complete waste of stunning visuals due to a hollow and uninspired plot.",
    "The humor feels forced and falls flat, making the comedy aspect unbearable.",
    "Pacing issues plague the movie, with some parts dragging endlessly while others feel rushed.",
    "The dialogue is cringeworthy and unnatural, detracting from the overall experience.",
    "A disappointing sequel that fails to capture the magic of the original.",
    "The special effects are overused, overshadowing the weak storytelling.",
    "This movie tries too hard to be edgy but ends up being obnoxious and shallow.",
    "The soundtrack is forgettable and adds no value to the film.",
    "An overstuffed plot with too many subplots that go nowhere.",
    "The lack of chemistry between the leads makes their relationship unconvincing.",
    "The ending is abrupt and unsatisfying, leaving more questions than answers.",
    "An unoriginal rehash of better films, lacking any creativity or fresh ideas.",
    "The movie’s tone is inconsistent, making it hard to take seriously.",
    "Unnecessarily long runtime with scenes that add nothing to the story.",
    "The action sequences are chaotic and poorly choreographed, making them hard to follow.",
    "An underwhelming experience that fails to leave any lasting impression."
])

In [13]:
# Embed both hand-listed review sets, then take mean(positive) - mean(negative).
Positive_GPT_Embeddings, Negative_GPT_Embeddings = (
    my_encoder(review_texts)
    for review_texts in (Positive_GPT_Reviews, Negative_GPT_Reviews)
)
PosNeg_GPT = positive_to_negative_vector(Positive_GPT_Embeddings, Negative_GPT_Embeddings)
PosNeg_GPT

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.024184,-0.010036,-0.000144,-0.005453,-0.001093,-0.013167,0.005935,-0.027534,0.017758,0.022016,...,-0.000742,0.002989,-0.005215,0.005257,0.006716,0.006693,-0.003185,0.004785,6.4e-05,0.00096


In [14]:
from sklearn.metrics.pairwise import cosine_similarity


def report_cosine_similarity(label, first, second):
    """Print and return the cosine similarity between two direction vectors.

    Args:
        label: heading printed before the similarity value.
        first: single-row DataFrame holding the first direction vector.
        second: single-row DataFrame holding the second direction vector.

    Returns:
        The cosine similarity as a scalar.
    """
    print(label)
    # cosine_similarity returns a (rows_first, rows_second) matrix; with one
    # row on each side the only entry is [0, 0].
    similarity = cosine_similarity(first.values, second.values)[0, 0]
    print(f"Cosine Similarity: {similarity}")
    return similarity


# Compare every pair of the three positive-negative directions
# ("Norm" is the direction derived from the IMDb data itself).
for comparison_label, first_direction, second_direction in (
    ("Norm and GPT:", PosNeg_vector, PosNeg_GPT),
    ("Norm and Yelp:", PosNeg_vector, PosNeg_yelp),
    ("GPT and Yelp:", PosNeg_GPT, PosNeg_yelp),
):
    cosine_similarity_value = report_cosine_similarity(comparison_label, first_direction, second_direction)



Norm and GPT:
Cosine Similarity: 0.8259100787869493
Norm and Yelp:
Cosine Similarity: 0.8412883138018823
GPT and Yelp:
Cosine Similarity: 0.7354161739349365


In [15]:
def project_matrix_to_vector(matrix, vector):
    """Project every row of `matrix` onto the line spanned by `vector`.

    Args:
        matrix: DataFrame or ndarray of shape (M, D); each row is one
            D-dimensional vector to project.
        vector: the direction to project onto. Either a DataFrame / 2-D array
            whose first row is the D-dimensional direction, or a Series / 1-D
            array of length D.

    Returns:
        DataFrame of shape (M, D) with a default integer index and columns;
        row i is ((row_i . v) / (v . v)) * v.

    Raises:
        ValueError: if the direction vector is all zeros, because the
            projection is undefined (it would otherwise yield NaN/inf rows).
    """
    m = np.asarray(matrix)
    # atleast_2d + [0] keeps the "first row of a (1, D) frame" behaviour and
    # also accepts a plain 1-D vector.
    v = np.atleast_2d(np.asarray(vector))[0]

    # Compute v dot v (denominator)
    v_dot_v = np.dot(v, v)
    if v_dot_v == 0:
        raise ValueError("Cannot project onto a zero vector.")

    # Scalar coefficient per row, then scale v by it to get each projection.
    coefficients = np.dot(m, v) / v_dot_v
    return pd.DataFrame(np.outer(coefficients, v))

In [16]:
def express_matrix_by_vector(matrix, vector):
    """Project each row of `matrix` onto `vector` and express it as a 1-D coordinate.

    Args:
        matrix: DataFrame of shape (M, D); each row is one D-dimensional embedding.
        vector: DataFrame of shape (1, D) holding the direction to project onto.

    Returns:
        projection: DataFrame of shape (M, D), the result of
            project_matrix_to_vector(matrix, vector).
        projection_in_1D_subspace: Series of length M with the signed position
            of each row along the unit vector of `vector`, i.e.
            (row . vector) / ||vector||.

    Raises:
        ValueError: if `vector` has zero length.
    """
    direction = np.atleast_2d(np.asarray(vector))[0]
    direction_norm = np.linalg.norm(direction)
    if direction_norm == 0:
        raise ValueError("Cannot express data along a zero vector.")

    projection = project_matrix_to_vector(matrix, vector)

    # Compute the coordinate directly from the dot product. Recovering it by
    # dividing the first component of the projection by the first component of
    # the unit vector gives NaN when that component is 0 and loses precision
    # when it is tiny.
    coordinates = np.dot(np.asarray(matrix), direction) / direction_norm
    projection_in_1D_subspace = pd.Series(coordinates, index=projection.index)

    return projection, projection_in_1D_subspace

# Saving outputs for future use

In [17]:
# Remove the IMDb positive-negative direction from every embedding and save the result.
projected_variance, projection_in_1D_subspace = express_matrix_by_vector(matrix=embeddings, vector=PosNeg_vector)

# Residual = embedding minus its projection; the removed component is kept as
# a single 'posneg_subspace' column next to the rating columns.
posneg_corrected_embeddings = pd.DataFrame(
    embeddings.to_numpy() - projected_variance.to_numpy()
).assign(
    posneg_subspace=projection_in_1D_subspace,
    rating=active_dataframe['rating'],
    average_rating=active_dataframe['average_rating'],
)

save_corrected = f'../Data/{model_dict[Chosen_Model]["name"]}/{model_dict[Chosen_Model]["name"]}_corrected.csv'
posneg_corrected_embeddings.to_csv(save_corrected, index=False)


In [18]:
# Save projected data for regression in workbook 07
projected_path = f'../Data/{model_dict[Chosen_Model]["name"]}/{model_dict[Chosen_Model]["name"]}_projected.csv'

# Recompute the IMDb projection here instead of reusing whatever
# `projected_variance` currently holds: later cells rebind that name to the
# GPT and Yelp projections, so running this cell out of order would otherwise
# save the wrong projection under the IMDb file name.
projected_variance, projection_in_1D_subspace = express_matrix_by_vector(matrix=embeddings, vector=PosNeg_vector)
projected_variance = projected_variance.assign(
    posneg_subspace=projection_in_1D_subspace,
    rating=active_dataframe['rating'],
)

# NOTE(review): this file carries two trailing non-embedding columns, while the
# *_corrected files carry three (they also store 'average_rating') — confirm
# the loader in workbook 07 expects that.
projected_variance.to_csv(projected_path, index=False)

In [19]:
# Remove the GPT-derived positive-negative direction from every embedding and save the result.
projected_variance, projection_in_1D_subspace = express_matrix_by_vector(matrix=embeddings, vector=PosNeg_GPT)

# Residual = embedding minus its projection; the removed component is kept as
# a single 'posneg_subspace' column next to the rating columns.
posneg_GPT_corrected_embeddings = pd.DataFrame(
    embeddings.to_numpy() - projected_variance.to_numpy()
).assign(
    posneg_subspace=projection_in_1D_subspace,
    rating=active_dataframe['rating'],
    average_rating=active_dataframe['average_rating'],
)

save_corrected = f'../Data/{model_dict[Chosen_Model]["name"]}/{model_dict[Chosen_Model]["name"]}_GPT_corrected.csv'
posneg_GPT_corrected_embeddings.to_csv(save_corrected, index=False)

In [20]:
# Remove the Yelp-derived positive-negative direction from every embedding and save the result.
projected_variance, projection_in_1D_subspace = express_matrix_by_vector(matrix=embeddings, vector=PosNeg_yelp)

# Residual = embedding minus its projection; the removed component is kept as
# a single 'posneg_subspace' column next to the rating columns.
posneg_yelp_corrected_embeddings = pd.DataFrame(
    embeddings.to_numpy() - projected_variance.to_numpy()
).assign(
    posneg_subspace=projection_in_1D_subspace,
    rating=active_dataframe['rating'],
    average_rating=active_dataframe['average_rating'],
)

save_corrected = f'../Data/{model_dict[Chosen_Model]["name"]}/{model_dict[Chosen_Model]["name"]}_yelp_corrected.csv'
posneg_yelp_corrected_embeddings.to_csv(save_corrected, index=False)

In [21]:
# Give the GPT and Yelp direction vectors the IMDb column labels so that
# concat lines the three vectors up column by column.
for direction_frame in (PosNeg_GPT, PosNeg_yelp):
    direction_frame.columns = PosNeg_vector.columns

# Stack the three directions (one row each) and tag every row with its source.
PosNeg_Vectors = pd.concat(
    [PosNeg_vector, PosNeg_GPT, PosNeg_yelp], ignore_index=True
).assign(Dataset=["IMDb", "GPT", "Yelp"])

file_path = f'../Data/{model_dict[Chosen_Model]["name"]}/{model_dict[Chosen_Model]["name"]}_PosNeg_Vectors.csv'
PosNeg_Vectors.to_csv(file_path, index=False)
