In [1]:
!pip install pandas
!pip install scipy
!pip install transformers
!pip install -U pip setuptools wheel
!pip install -U spacy
!python -m spacy download en_core_web_sm
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu



Collecting numpy>=1.19.0 (from spacy)
  Using cached numpy-2.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Using cached numpy-2.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19.5 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.3 requires numpy<2.0,>=1.18.5, but you have numpy 2.0.2 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-2.0.2
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [3

In [2]:
import csv
import re
import urllib
import urllib.request

import numpy as np
import spacy
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

# spaCy pipeline used by preprocess_text(); loaded once because loading is slow.
nlp = spacy.load("en_core_web_sm")

# Pretrained sentiment classifier and its matching tokenizer (fetched from the
# Hugging Face hub on first use).
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Download the index -> label-name mapping for the task. Each useful line is
# "<index>\t<label>"; lines without a second field (e.g. a trailing blank line)
# are ignored.
task = 'sentiment'
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
# The timeout keeps a stalled connection from hanging the notebook forever.
with urllib.request.urlopen(mapping_link, timeout=30) as f:
    mapping_lines = f.read().decode('utf-8').splitlines()
labels = [row[1] for row in csv.reader(mapping_lines, delimiter='\t') if len(row) > 1]

# get_sentiment() indexes `labels` by the position of each model output, so a
# mismatch would only show up later as swallowed errors / all-zero scores.
if len(labels) != model.config.num_labels:
    raise RuntimeError(
        f"Label mapping has {len(labels)} entries but the model outputs "
        f"{model.config.num_labels} classes: {labels!r}"
    )


def preprocess_text(text):
    """
    Preprocess the input text with the module-level spaCy pipeline ``nlp``.

    The text is turned into a spaCy ``Doc`` and the document's text is
    returned with leading and trailing whitespace removed.

    Only the tokenizer is run (``nlp.make_doc``): the returned value depends
    solely on ``doc.text``, so running the remaining pipeline components on
    every input would be wasted work.

    Args:
        text (str): The input text to preprocess.

    Returns:
        str: The preprocessed text.
    """
    doc = nlp.make_doc(text)
    return doc.text.strip()

def get_sentiment(text, max_length: int = 512) -> dict:
    """
    Score the sentiment of a single text with the module-level model.

    The value is converted to ``str``, cleaned with ``preprocess_text``,
    tokenized, run through ``model``, and the first output row is turned into
    probabilities with ``softmax``.

    Args:
        text: The text to classify. ``None`` and float NaN (how pandas
            represents a missing cell) are treated as "no text".
        max_length (int): Inputs longer than this many tokens are truncated
            instead of making the model fail. NOTE(review): 512 is presumably
            the input limit of this RoBERTa model - confirm against its config.

    Returns:
        dict: Maps each name in ``labels`` to its probability, inserted from
        the most to the least likely label. When there is no text, the text is
        empty after preprocessing, or inference raises an ``Exception``, all of
        "positive", "neutral" and "negative" are ``0.0``.
    """
    def _no_sentiment():
        # A fresh dict per call so callers can never share or mutate a common one.
        return {
            "positive": 0.0,
            "neutral": 0.0,
            "negative": 0.0
        }

    # Without this guard a missing value would be stringified and the model
    # would score the literal text "nan" / "None".
    if text is None or (isinstance(text, float) and text != text):
        return _no_sentiment()

    try:
        text = preprocess_text(str(text))
        if len(text) == 0:
            return _no_sentiment()

        encoded_input = tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            max_length=max_length,
        )
        output = model(**encoded_input)
        scores = softmax(output[0][0].detach().numpy())

        # Walk the class indices from highest to lowest probability.
        sentiment = {}
        for idx in np.argsort(scores)[::-1]:
            sentiment[labels[idx]] = scores[idx]
        return sentiment

    except Exception:
        # Best effort: one bad row must not abort a whole batch. Catching
        # Exception (not a bare except) lets KeyboardInterrupt / SystemExit
        # through, so a long-running loop can still be interrupted.
        return _no_sentiment()


In [None]:
import pandas as pd
import os
# Folder that is scanned for the review CSV files to annotate.
data_folder = "reviews-ubisoft"

# Name of the CSV column whose text is passed to get_sentiment().
column = "review"

def getCSVs(folder) -> list[str]:
    """Return the names (not full paths) of the entries in ``folder`` ending in '.csv'.

    The names come back in the order ``os.listdir`` yields them.
    """
    csv_names = []
    for entry_name in os.listdir(folder):
        if entry_name.endswith('.csv'):
            csv_names.append(entry_name)
    return csv_names

# Annotate every CSV in `data_folder` with sentiment scores, one file at a time.
# Each file gets a "sentiment" column (the full score dict) plus one numeric
# column per label, and is then written back over the original file.
for file in getCSVs(data_folder):
    csv_path = os.path.join(data_folder, file)
    print(f"Starting with {data_folder}/{file}")

    # NOTE(review): on_bad_lines="skip" silently drops malformed rows, and the
    # file is overwritten below, so any skipped rows are lost from the CSV for
    # good. Consider writing to a separate output folder instead.
    df = pd.read_csv(csv_path, delimiter=",", on_bad_lines="skip", engine="python")

    # Apply on the review column itself rather than row-wise over the frame:
    # it avoids building a Series per row and still yields a Series when the
    # file has a header but no rows.
    df["sentiment"] = df[column].apply(get_sentiment)
    for label in ("negative", "neutral", "positive"):
        df[label] = df["sentiment"].apply(lambda scores, key=label: scores[key])

    df.to_csv(csv_path, index=False)
    print(f"Done with {file}")

Starting with reviews-ubisoft/reviews_2290180.csv
Starting with reviews_2290180.csv
Done with reviews_2290180.csv
Starting with reviews-ubisoft/reviews_3035570.csv
Starting with reviews_3035570.csv
Done with reviews_3035570.csv
Starting with reviews-ubisoft/reviews_220240.csv
Starting with reviews_220240.csv
Done with reviews_220240.csv
Starting with reviews-ubisoft/reviews_33230.csv
Starting with reviews_33230.csv
Done with reviews_33230.csv
Starting with reviews-ubisoft/reviews_365590.csv
Starting with reviews_365590.csv
Done with reviews_365590.csv
Starting with reviews-ubisoft/reviews_2231380.csv
Starting with reviews_2231380.csv
Done with reviews_2231380.csv
Starting with reviews-ubisoft/reviews_916440.csv
Starting with reviews_916440.csv
Done with reviews_916440.csv
Starting with reviews-ubisoft/reviews_359550.csv
Starting with reviews_359550.csv
Done with reviews_359550.csv
Starting with reviews-ubisoft/reviews_2751000.csv
Starting with reviews_2751000.csv
Done with reviews_2751