# Load dependencies

In [12]:
# Load libraries
#!pip install -r /work/NLP_IMDb_Exam/requirements.txt
import numpy as np
import pandas as pd
import torch
import datasets
import evaluate
import seaborn as sns
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModel
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from sentence_transformers import SentenceTransformer

# Create Functions

In [None]:
import re
def clean_string(raw_string):
    """
    Return a normalised, lowercase version of a raw review string.

    Steps, in order:
      1. Every run of HTML line-break tags (``<br>``, ``<br/>``, ``<br />``,
         in any letter case) is replaced by a single space.
      2. Every remaining slash is replaced by a space ("and/or" -> "and or").
      3. Every character that is not an ASCII letter, a digit or a space is
         dropped, and the result is lowercased.

    Parameters
    ----------
    raw_string : str
        The unprocessed review text.

    Returns
    -------
    str
        The cleaned text.
    """
    # Match whole runs of break tags. The earlier literal pattern '<br/><br/>'
    # missed '<br />' and single breaks; those tags then lost their slash and
    # angle brackets in the next two steps and survived as a stray "br" token.
    clean_text = re.sub(r'(?:<br\s*/?>)+', ' ', raw_string, flags=re.IGNORECASE)
    clean_text = re.sub(r'/', ' ', clean_text)  # Slashes act as word separators
    clean_text = re.sub(r'[^a-zA-Z0-9 ]', '', clean_text).lower()  # Remove special characters and lowercase
    return clean_text

def save_embedding_with_information(embedding, IMDb_subset, filename):
    """
    Save an embedding matrix together with its review metadata as a CSV file.

    The embedding is converted to a DataFrame (one row per review). The
    'rating' and 'average_rating' columns of ``IMDb_subset`` are appended after
    conversion with ``pd.to_numeric``, the 'review' column is appended as-is,
    and the result is written to ``../Data/<filename>/<filename>.csv`` without
    the index column.

    Parameters
    ----------
    embedding : array-like
        Anything accepted by ``pd.DataFrame``; row ``i`` must belong to row
        ``i`` of ``IMDb_subset``.
    IMDb_subset : pandas.DataFrame
        Must contain the columns 'rating', 'average_rating' and 'review'.
    filename : str
        Used both as the sub-folder name under ``../Data`` and as the CSV stem.

    Raises
    ------
    ValueError
        If ``embedding`` and ``IMDb_subset`` do not have the same number of rows.
    """
    import os  # local import: the notebook's import cell does not provide `os`

    embedding = pd.DataFrame(embedding)  # Pandas dataframe of embeddings
    if len(embedding) != len(IMDb_subset):
        raise ValueError(
            f"embedding has {len(embedding)} rows but IMDb_subset has {len(IMDb_subset)}"
        )

    # Attach the metadata by position (.to_numpy()) rather than by index label:
    # the embedding frame always has a fresh 0..n-1 index, so label alignment
    # would yield NaN / shifted rows for a subset that was filtered or sampled
    # without resetting its index.
    embedding['rating'] = pd.to_numeric(IMDb_subset['rating']).to_numpy()
    embedding['average_rating'] = pd.to_numeric(IMDb_subset['average_rating']).to_numpy()
    embedding['review'] = IMDb_subset['review'].to_numpy()

    # to_csv does not create missing folders, so make sure the target exists.
    output_dir = f'../Data/{filename}'
    os.makedirs(output_dir, exist_ok=True)
    embedding.to_csv(f'{output_dir}/{filename}.csv', index = False)

In [19]:
# Read the review table and keep only rows whose 'rating' is not the literal string "Null":
IMDb_dataframe = (
    pd.read_csv('/work/NLP_IMDb_Exam/Data/review_dataframe.csv')
    .loc[lambda reviews: reviews['rating'] != "Null"]
)

In [15]:
# Draw a fixed-seed random sample of 100000 rows (index renumbered from 0)
# and replace each review with its cleaned text.
IMDb_subset = (
    IMDb_dataframe
    .sample(n=100000, random_state=42, ignore_index=True)
    .assign(review=lambda sampled: sampled['review'].apply(clean_string))
)
# Store the cleaned sample on disk.
IMDb_subset.to_csv('../Data/review_dataframe_subset.csv', index=False)
# Plain Python list of the cleaned review strings.
subset_corpus = IMDb_subset['review'].to_list()

In [16]:
# Prepare models:
# Use the GPU when one is available; otherwise fall back to the CPU so this
# cell does not raise on machines without CUDA.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Load best small model for sentence embedding
Mini_Model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=DEVICE)
# Load best basic model for sentence embedding:
MPNET_Model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device=DEVICE) # Choose the best sentence transformer according to https://www.sbert.net/docs/sentence_transformer/pretrained_models.html
# Load instructor model, which is given a prompt when embedding:
Instructor_Model = SentenceTransformer("hkunlp/instructor-large", device=DEVICE)



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# Encode the cleaned reviews with the MiniLM model, then write the vectors
# plus rating/review columns to ../Data/MiniLM_L6/MiniLM_L6.csv:
Mini_Embedding = Mini_Model.encode(subset_corpus)
save_embedding_with_information(
    embedding=Mini_Embedding,
    IMDb_subset=IMDb_subset,
    filename="MiniLM_L6",
)

In [25]:
# Encode the cleaned reviews with the MPNet model, then write the vectors
# plus rating/review columns to ../Data/MPNET_base/MPNET_base.csv:
MPNET_Embedding = MPNET_Model.encode(subset_corpus)
save_embedding_with_information(
    embedding=MPNET_Embedding,
    IMDb_subset=IMDb_subset,
    filename="MPNET_base",
)

In [26]:
# Encode the cleaned reviews with the instructor model; the task description
# is passed through encode()'s `prompt` argument. The result is written to
# ../Data/Instructor/Instructor.csv.
Instructor_Embedding = Instructor_Model.encode(subset_corpus, prompt="Represent the movie review for classifying the corresponding movie rating: ")
save_embedding_with_information(
    embedding=Instructor_Embedding,
    IMDb_subset=IMDb_subset,
    filename="Instructor",
)

In [None]:
# Save all three embeddings (not the models themselves) under a second set of
# names. Each name is used as both the folder and the CSV stem under ../Data,
# so these files are separate from the ones written by the per-model cells.
# NOTE(review): the third name is spelled "Embdding" and uses "_" where the
# other two use "-"; it is left unchanged here because other code may already
# read that path - confirm before renaming.
filenames = [
    "IMDb_Embedding-MiniLM_L6",
    "IMDb_Embedding-MPNET_base",
    "IMDb_Embdding_Instructor_rating",
]
embeddings = [
    Mini_Embedding,
    MPNET_Embedding,
    Instructor_Embedding,
]

# Walk both lists in parallel: the i-th embedding is stored under the i-th name.
for filename, embedding in zip(filenames, embeddings):
    save_embedding_with_information(embedding, IMDb_subset, filename)
