In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer, pipeline
from tqdm.auto import tqdm
import json
import os
from huggingface_hub import login
from joblib import Parallel, delayed
import multiprocessing
import concurrent.futures
import psutil


In [62]:
# data_cleaned = pd.read_csv('Madrid_reviews_cleaned.csv')
data_cleaned = pd.read_parquet('Madrid_reviews_cleaned.parquet', engine='pyarrow')

In [63]:
data_cleaned['clean_text'] = data_cleaned['clean_text'].astype(str) 

In [64]:
data_cleaned['review_full'] = data_cleaned['review_full'].astype(str) 
data_cleaned.head()

Unnamed: 0.1,Unnamed: 0,parse_count,restaurant_name,rating_review,sample,review_id,title_review,review_preview,review_full,date,...,emotion_disgust,emotion_fear,emotion_negative,emotion_sadness,word_count,unique_word_count,mean_word_length,hashtag_count,mention_count,clean_text
0,0,1,Sushi_Yakuza,4,Positive,review_731778139,Good sushi option,"The menu of Yakuza is a bit of a lottery, some...","The menu of Yakuza is a bit of a lottery, some...",2019-12-10,...,0.058824,0.117647,0.117647,0.058824,75,55,4.24,0,0,menu yakuza bit lotteri plate realli good like...
1,10,11,Azotea_Forus_Barcelo,1,Negative,review_766657436,Light up your table at night,Check your bill when you cancel just in case y...,Check your bill when you cancel just in case y...,2020-08-23,...,0.0,0.166667,0.25,0.166667,38,35,4.368421,0,0,check bill cancel case get extra charg surpris...
2,11,12,Level_Veggie_Bistro,5,Positive,review_749493592,Delicious,I had the yuca profiteroles and the veggie bur...,I had the yuca profiteroles and the veggie bur...,2020-03-06,...,0.0,0.0,0.2,0.0,32,26,5.0,0,0,yuca profiterol veggi burger recommend server ...
3,12,13,Sto_Globo_Sushi_Room,5,Positive,review_772422246,Loved this place,A friend recommended this place as one of the ...,A friend recommended this place as one of the ...,2020-09-29,...,0.0,0.0,0.076923,0.0,85,62,4.270588,0,0,friend recommend place one best sushi ever tri...
4,13,14,Azotea_Forus_Barcelo,5,Positive,review_761855600,Amazing terrace in madrid,Amazing terrace in madrid - great atmosphere a...,Amazing terrace in madrid - great atmosphere a...,2020-07-27,...,0.0,0.0,0.0,0.0,21,20,4.714286,0,0,amaz terrac madrid great atmospher great wine ...


# Zero Shot Classification - Restaurant Rating

In [65]:
# Function to create the zero-shot classification pipeline
def create_classifier(model_name):
    return pipeline("zero-shot-classification", model=model_name)

# Function to classify a single piece of text
def classify_text(text, classifier, labels):
    result = classifier(text, labels)
    return result['labels'][0]  # The label with the highest score

# Check available memory
def get_available_memory():
    process = psutil.Process(os.getpid())
    mem_info = process.memory_info()
    return mem_info.rss / (1024 ** 2)  # Return memory usage in MB

# Function to classify text in parallel
def parallel_classify_texts(texts, classifier, labels, num_threads):
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        results = list(tqdm(executor.map(lambda text: classify_text(text, classifier, labels), texts), total=len(texts)))
    return results

# Process the DataFrame in chunks and save each chunk to a CSV
def process_and_save_chunks(data, chunk_size, num_threads, output_dir, model_name, labels):
    os.makedirs(output_dir, exist_ok=True)  # Create the output directory if it doesn't exist
    chunks = [data[i:i + chunk_size] for i in range(0, data.shape[0], chunk_size)]
    
    classifier = create_classifier(model_name)

    for i, chunk in enumerate(chunks):
        chunk['predicted_label'] = parallel_classify_texts(chunk['review_full'], classifier, labels, num_threads)

        # Optionally, convert labels to ratings
        def convert_label_to_rating(label):
            label_to_rating = {
                "very bad review": 1,
                "bad review": 2,
                "average review": 3,
                "good review": 4,
                "very good review": 5
            }
            return label_to_rating[label]

        chunk['predicted_rating'] = chunk['predicted_label'].apply(convert_label_to_rating)

        # Save the chunk to a CSV file
        chunk.to_csv(os.path.join(output_dir, f'chunk_{i}.csv'), index=False)

        # Print progress
        print(f'Saved chunk {i} to CSV.')


def read_all_csvs_in_folder(folder_path):
    # List to hold the individual DataFrames
    df_list = []

    # Iterate over all files in the folder
    for filename in os.listdir(folder_path):
        # Check if the file is a CSV file
        if filename.endswith('.csv'):
            # Construct the full file path
            file_path = os.path.join(folder_path, filename)
            # Read the CSV file into a DataFrame
            df = pd.read_csv(file_path)
            # Append the DataFrame to the list
            df_list.append(df)

    # Concatenate all DataFrames in the list into a single DataFrame
    combined_df = pd.concat(df_list, ignore_index=True)

    return combined_df

## Llama 2

We attempted to run llama both locally and on colab however, it would kill the kernel while loading the model. Given more computing power we would attempt other models.

## bart-large-mnli

In [66]:

model = "facebook/bart-large-mnli"
labels = ["very bad review", "bad review", "average review", "good review", "very good review"]
output_dir='processed_chunks_bart'
# process_and_save_chunks(data_cleaned, chunk_size=1000, num_threads=4, output_dir=output_dir, model_name=model, labels=labels)

In [67]:
bart = read_all_csvs_in_folder(output_dir)
bart['rating_diff']=bart['predicted_rating']-bart['rating_review']
bart['predicted_rating'].value_counts().sort_index()
percent_of_total = bart['predicted_rating'].value_counts(normalize=True).sort_index() 
percent_of_total

predicted_rating
1    0.006
2    0.046
3    0.019
4    0.520
5    0.409
Name: proportion, dtype: float64

In [68]:
rating_diff_counts = bart['rating_diff'].value_counts().sort_index()
percent_of_total = bart['rating_diff'].value_counts(normalize=True).sort_index() 
percent_of_total

rating_diff
-3    0.001
-2    0.001
-1    0.440
 0    0.508
 1    0.046
 2    0.004
Name: proportion, dtype: float64

## RoBERTa

In [69]:

model = "roberta-large-mnli"
labels = ["very bad review", "bad review", "average review", "good review", "very good review"]
output_dir='processed_chunks_roberta'

# process_and_save_chunks(data_cleaned, chunk_size=1000, num_threads=4, output_dir=output_dir, model_name=model, labels=labels)

In [70]:
roberta = read_all_csvs_in_folder(output_dir)
roberta['rating_diff']=roberta['predicted_rating']-roberta['rating_review']
roberta['predicted_rating'].value_counts().sort_index()
percent_of_total = roberta['predicted_rating'].value_counts(normalize=True).sort_index() 
percent_of_total

predicted_rating
1    0.009723
2    0.032383
3    0.052213
4    0.831106
5    0.074574
Name: proportion, dtype: float64

In [71]:
rating_diff_counts = roberta['rating_diff'].value_counts().sort_index()
percent_of_total = roberta['rating_diff'].value_counts(normalize=True).sort_index() 
percent_of_total

rating_diff
-4    0.000064
-3    0.001596
-2    0.008596
-1    0.688574
 0    0.262170
 1    0.032894
 2    0.004362
 3    0.001383
 4    0.000362
Name: proportion, dtype: float64

## DeBERTa

In [72]:

model = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
labels = ["very bad review", "bad review", "average review", "good review", "very good review"]
output_dir='processed_chunks_deberta'

# process_and_save_chunks(data_cleaned, chunk_size=1000, num_threads=4, output_dir=output_dir, model_name=model, labels=labels)

In [73]:
deberta = read_all_csvs_in_folder(output_dir)
deberta['rating_diff']=deberta['predicted_rating']-deberta['rating_review']
deberta['predicted_rating'].value_counts().sort_index()
percent_of_total = roberta['predicted_rating'].value_counts(normalize=True).sort_index() 
percent_of_total

predicted_rating
1    0.009723
2    0.032383
3    0.052213
4    0.831106
5    0.074574
Name: proportion, dtype: float64

In [74]:
rating_diff_counts = deberta['rating_diff'].value_counts().sort_index()
percent_of_total = deberta['rating_diff'].value_counts(normalize=True).sort_index() 
percent_of_total

rating_diff
-4    0.00006
-3    0.00386
-2    0.01224
-1    0.35848
 0    0.53956
 1    0.07870
 2    0.00518
 3    0.00162
 4    0.00030
Name: proportion, dtype: float64

## DistilBERT

In [75]:

model = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
labels = ["very bad review", "bad review", "average review", "good review", "very good review"]
output_dir='processed_chunks_distilbert'

# process_and_save_chunks(data_cleaned, chunk_size=1000, num_threads=4, output_dir=output_dir, model_name=model, labels=labels)

In [76]:
distilbert = read_all_csvs_in_folder(output_dir)
distilbert['rating_diff']=distilbert['predicted_rating']-distilbert['rating_review']
distilbert['predicted_rating'].value_counts().sort_index()
percent_of_total = roberta['predicted_rating'].value_counts(normalize=True).sort_index() 
percent_of_total

predicted_rating
1    0.009723
2    0.032383
3    0.052213
4    0.831106
5    0.074574
Name: proportion, dtype: float64

In [77]:
rating_diff_counts = distilbert['rating_diff'].value_counts().sort_index()
percent_of_total = distilbert['rating_diff'].value_counts(normalize=True).sort_index() 
percent_of_total 

rating_diff
-3    0.001333
-1    0.002333
 0    0.793000
 1    0.138667
 2    0.036000
 3    0.017000
 4    0.011667
Name: proportion, dtype: float64

## Flan T-5

In [78]:

model = "sjrhuschlee/flan-t5-base-mnli"
labels = ["very bad review", "bad review", "average review", "good review", "very good review"]
output_dir='processed_chunks_flan-t5'

# process_and_save_chunks(data_cleaned, chunk_size=1000, num_threads=4, output_dir=output_dir, model_name=model, labels=labels)

In [79]:
flan_t5 = read_all_csvs_in_folder(output_dir)
flan_t5['rating_diff']=flan_t5['predicted_rating']-flan_t5['rating_review']
flan_t5['predicted_rating'].value_counts().sort_index()
percent_of_total = roberta['predicted_rating'].value_counts(normalize=True).sort_index() 
percent_of_total

predicted_rating
1    0.009723
2    0.032383
3    0.052213
4    0.831106
5    0.074574
Name: proportion, dtype: float64

In [80]:
rating_diff_counts = flan_t5['rating_diff'].value_counts().sort_index()
percent_of_total = flan_t5['rating_diff'].value_counts(normalize=True).sort_index() 
percent_of_total 

rating_diff
-4    0.0010
-3    0.0310
-2    0.0065
-1    0.3395
 0    0.5325
 1    0.0650
 2    0.0155
 3    0.0080
 4    0.0010
Name: proportion, dtype: float64

## Comparing Models

In [81]:
dataframes = {
    'flan_t5': flan_t5,
    'distilBERT': distilbert,
    'deBERTa': deberta,
    'RoBERTa': roberta,
    'bart': bart
}

# Create an empty list to store the value_counts dataframes
count_dfs = []

for name, df in dataframes.items():
    counts = df['predicted_rating'].value_counts(normalize=True).sort_index()
    counts_df = counts.reset_index()
    counts_df.columns = ['rating', name]
    count_dfs.append(counts_df)

# Merge all count dataframes on the 'rating' column
result_df = count_dfs[0]
for count_df in count_dfs[1:]:
    result_df = result_df.merge(count_df, on='rating', how='outer')

# Fill NaN values with 0 (optional, if you expect missing ratings)
result_df = result_df.fillna(0)

print(result_df)

   rating  flan_t5  distilBERT  deBERTa   RoBERTa   bart
0       1   0.0020    0.000000  0.00494  0.009723  0.006
1       2   0.0555    0.002000  0.05592  0.032383  0.046
2       3   0.0025    0.000000  0.03060  0.052213  0.019
3       4   0.4555    0.003333  0.46050  0.831106  0.520
4       5   0.4845    0.994667  0.44804  0.074574  0.409


In [82]:
count_dfs = []

for name, df in dataframes.items():
    counts = df['rating_diff'].value_counts(normalize=True).sort_index()
    counts_df = counts.reset_index()
    counts_df.columns = ['rating_diff', name]
    count_dfs.append(counts_df)

# Merge all count dataframes on the 'rating' column
result_df = count_dfs[0]
for count_df in count_dfs[1:]:
    result_df = result_df.merge(count_df, on='rating_diff', how='outer')

# Fill NaN values with 0 (optional, if you expect missing ratings)
result_df = result_df.fillna(0)

print(result_df)

   rating_diff  flan_t5  distilBERT  deBERTa   RoBERTa   bart
0           -4   0.0010    0.000000  0.00006  0.000064  0.000
1           -3   0.0310    0.001333  0.00386  0.001596  0.001
2           -2   0.0065    0.000000  0.01224  0.008596  0.001
3           -1   0.3395    0.002333  0.35848  0.688574  0.440
4            0   0.5325    0.793000  0.53956  0.262170  0.508
5            1   0.0650    0.138667  0.07870  0.032894  0.046
6            2   0.0155    0.036000  0.00518  0.004362  0.004
7            3   0.0080    0.017000  0.00162  0.001383  0.000
8            4   0.0010    0.011667  0.00030  0.000362  0.000


## Challenges Encountered 

Oringinally the text given to the classifier was a cleaned text and the labels given were:
```python
['very bad', 'bad', 'neutral', 'good', 'very good']
```
However with this approache we encountered a problem, some rating were extremely misclassified. The idea of conducting zero-shot calssification is that the rating itself would be free from bias. However, scathing reviews were given a 4/5 which would be too inacurrate.



![Image Description](images/misclassification.jpeg)


The solution was to change the clean text for the review_full column and change the labels to:
```python
["very bad review", "bad review", "average review", "good review", "very good review"]
```
This allows the classifier to infer more context on the nature of the task and classifies more accurately. Below is an example of the differences in performance. rating_review is the user rating, dirty_text is the review_full column, clean_text is the processed text, and review_in_label in the version we used with review full and the above labels.

![Image Description](images/comparing_roberta_models.png)

# Zero Shot Classification - Restaurant Cuisine

In [83]:
import pandas as pd
from transformers import pipeline
from tqdm import tqdm
import concurrent.futures
import os

# Function to create the zero-shot classification pipeline
def create_classifier(model_name):
    return pipeline("zero-shot-classification", model=model_name)

# Function to classify a single piece of text
def classify_text(row, classifier, labels):
    text = f"Restaurant Name: {row['restaurant_name']} - Review Title: {row['title_review']} - Review Full: {row['review_full']}"
    result = classifier(text, labels)
    return result['labels'][0]  # The label with the highest score

# Check available memory
def get_available_memory():
    import psutil
    process = psutil.Process(os.getpid())
    mem_info = process.memory_info()
    return mem_info.rss / (1024 ** 2)  # Return memory usage in MB

# Define chunk size
chunk_size = 1000  # Adjust as needed

# Number of threads to use (adjust as needed)
num_threads = 4  # Use a small number of threads to limit CPU usage

# Wrap your pandas apply with tqdm for a progress bar
tqdm.pandas()

# Function to classify text in parallel
def parallel_classify_texts(df, classifier, labels, num_threads):
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        results = list(tqdm(executor.map(lambda row: classify_text(row, classifier, labels), [row for _, row in df.iterrows()]), total=len(df)))
    return results

# Process the DataFrame in chunks and save each chunk to a CSV
def process_and_save_chunks(data, chunk_size, num_threads, output_dir, file_name, model_name, labels):
    os.makedirs(output_dir, exist_ok=True)  # Create the output directory if it doesn't exist
    chunks = [data[i:i + chunk_size] for i in range(0, data.shape[0], chunk_size)]
    
    classifier = create_classifier(model_name)

    for i, chunk in enumerate(chunks):
        chunk['predicted_cuisine'] = parallel_classify_texts(chunk, classifier, labels, num_threads)
        output_file_path = os.path.join(output_dir, f'{os.path.splitext(file_name)[0]}_chunk_{i}.csv')
        # Save the chunk to a CSV file
        chunk.to_csv(output_file_path, index=False)
        # Print progress
        print(f'Saved chunk {i} of {file_name} to CSV.')

# Process all CSV files in a directory
def process_all_csvs(input_dir, output_dir, model_name, labels):
    for file_name in os.listdir(input_dir):
        if file_name.endswith('.csv'):
            file_path = os.path.join(input_dir, file_name)
            print(f'Processing file: {file_path}')
            data = pd.read_csv(file_path)
            process_and_save_chunks(data, chunk_size, num_threads, output_dir, file_name, model_name, labels)



## RoBERTa

In [84]:
# Define input and output directories
input_dir = 'processed_chunks_roberta'
output_dir = 'processed_chunks_roberta_cuisine'

# Define model and labels
model_name = "roberta-large-mnli"
cuisine_labels = ["Italian", "Chinese", "Mexican", "Indian", "French", "Japanese", "American", "Thai", "Spanish", "Greek"]

# process_all_csvs(input_dir, output_dir, model_name, cuisine_labels)

In [85]:
roberta_cuisine = read_all_csvs_in_folder(output_dir)
percent_of_total = roberta_cuisine['predicted_cuisine'].value_counts(normalize=True).sort_index() 
percent_of_total

predicted_cuisine
American    0.047556
Chinese     0.007444
French      0.133111
Greek       0.039667
Indian      0.020833
Italian     0.066444
Japanese    0.017944
Mexican     0.154556
Spanish     0.502444
Thai        0.010000
Name: proportion, dtype: float64

In [86]:
pivot_table = pd.pivot_table(
        roberta_cuisine,
        index='predicted_cuisine',
        columns='restaurant_name',
        values='review_id',
        aggfunc='count',
        fill_value=0
    )
pivot_table

restaurant_name,A_Nora,A_vAnvera,Al_Son_de_Cuba,Albora,Alcaravea_castello,AlliOli_Valencian_Food,Alright,Amicis,Amparito_Roca,Antigua_Casa_de_la_Paella,...,Triana,Txirimiri,Verdejo,Vietnam_Restaurante,Vila_Brasil,Vinos_de_Bellota,YOUnique_Restaurant,Yakiniku_Rikyu,Yakitoro_by_Chicote,Zenith_Brunch_Cocktails
predicted_cuisine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
American,0,0,0,0,0,0,49,9,0,0,...,0,0,0,0,0,1,0,0,0,138
Chinese,0,0,0,0,0,0,0,0,0,0,...,0,0,0,6,0,0,0,0,1,0
French,0,2,0,4,0,0,2,13,1,0,...,1,7,0,3,0,28,28,0,19,42
Greek,1,1,0,2,0,0,2,151,0,0,...,10,8,0,0,0,3,0,0,0,6
Indian,0,0,0,0,0,0,1,1,0,0,...,0,1,0,2,0,0,0,0,2,1
Italian,0,36,0,1,0,0,2,26,0,0,...,0,1,0,0,0,1,2,0,0,0
Japanese,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,5,16,0
Mexican,0,0,0,0,0,0,4,18,3,5,...,9,41,0,0,11,14,0,0,11,1
Spanish,2,11,4,3,7,23,30,158,7,40,...,14,62,19,16,12,108,9,0,17,93
Thai,0,0,0,0,0,0,0,0,0,0,...,0,0,0,8,0,0,0,0,0,0


## Deberta

In [87]:
# Define input and output directories
input_dir = 'processed_chunks_deberta'
output_dir = 'processed_chunks_deberta_cuisine'

# Define model and labels
model_name = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
cuisine_labels = ["Italian", "Chinese", "Mexican", "Indian", "French", "Japanese", "American", "Thai", "Spanish", "Greek"]

# process_all_csvs(input_dir, output_dir, model_name, cuisine_labels)

In [88]:
roberta_cuisine = read_all_csvs_in_folder(output_dir)
percent_of_total = roberta_cuisine['predicted_cuisine'].value_counts(normalize=True).sort_index() 
percent_of_total

predicted_cuisine
American    0.031316
Chinese     0.004684
French      0.119289
Greek       0.006158
Indian      0.020605
Italian     0.136895
Japanese    0.024395
Mexican     0.020921
Spanish     0.630474
Thai        0.005263
Name: proportion, dtype: float64

In [89]:
pivot_table = pd.pivot_table(
        roberta_cuisine,
        index='predicted_cuisine',
        columns='restaurant_name',
        values='review_id',
        aggfunc='count',
        fill_value=0
    )
pivot_table

restaurant_name,99_Sushi_Bar,99_Sushi_Bar_Padre_Damian,A_Barra,A_vAnvera,Abaceria_Tapas_Lambuzo_Conchas,Ablanedo,Adrede,Albora,Alcaravea,Alcaravea_castello,...,Vila_Brasil,Villoldo,Vinitus_Gran_Via_Madrid,Vinos_de_Bellota,Vinoteca_Garcia_de_la_Navarra,Viridiana,Yakiniku_Rikyu,Yokaloka,Zalacain,Zenith_Brunch_Cocktails
predicted_cuisine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
American,2,1,0,0,0,0,0,0,2,0,...,6,0,0,1,1,0,0,0,1,99
Chinese,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,3
French,1,0,6,0,0,0,15,1,1,0,...,1,1,0,50,5,1,0,1,41,65
Greek,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
Indian,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
Italian,0,0,9,38,6,2,0,4,7,2,...,2,4,0,18,22,0,0,0,6,7
Japanese,36,20,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,5,40,0,6
Mexican,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,2
Spanish,13,8,38,13,47,5,24,5,79,5,...,99,9,64,85,92,9,0,9,53,119
Thai,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5


# Exporting Data

In [90]:
roberta = read_all_csvs_in_folder('processed_chunks_roberta')
cuisine_roberta = read_all_csvs_in_folder('processed_chunks_roberta_cuisine')


deberta = read_all_csvs_in_folder('processed_chunks_deberta')
cuisine_deberta = read_all_csvs_in_folder('processed_chunks_deberta_cuisine')


roberta['model_used']='RoBERTa'
roberta['rating_diff']=roberta['predicted_rating']-roberta['rating_review']

cuisine_roberta['model_used']='RoBERTa'
cuisine_roberta['rating_diff']=cuisine_roberta['predicted_rating']-cuisine_roberta['rating_review']


deberta['model_used']='DeBERTa'
deberta['rating_diff']=deberta['predicted_rating']-deberta['rating_review']

cuisine_deberta['model_used']='DeBERTa'
cuisine_deberta['rating_diff']=cuisine_deberta['predicted_rating']-cuisine_deberta['rating_review']

pd.concat([roberta, deberta]).to_csv('data_final_no_cuisine.csv', index=False)
pd.concat([cuisine_roberta, cuisine_deberta]).to_csv('data_final_cuisine.csv', index=False)