In [12]:
import pandas as pd, re

# Load the raw review export from its CSV file.
file_path = "/content/puregym_reviews_raw_data.csv"
df = pd.read_csv(file_path, encoding='utf-8-sig')

# Trim surrounding whitespace from the header names so the column lookups
# below ("Rating", "content") match.
df.columns = df.columns.str.strip()

# Missing or unparseable ratings are coerced to NaN and then replaced by 0.
# 0 is not a key of the mapping below, so those rows are labelled "Unknown".
df["Rating"] = (
    pd.to_numeric(df["Rating"], errors="coerce")
    .fillna(0)
    .astype(int)
)

# Star rating -> sentiment label.
lab_to_sentiment = {
    1: "Negative",
    2: "Negative",
    3: "Neutral",
    4: "Positive",
    5: "Positive",
}
df["Sentiment"] = df["Rating"].map(
    lambda rating: lab_to_sentiment.get(rating, "Unknown")
)

def clean_text(text):
    """Strip unwanted symbols from a string and tidy its whitespace.

    Every character that is not a word character, whitespace, or one of
    ``. , ! ? -`` is deleted. Each run of whitespace is then collapsed to a
    single space, and leading/trailing whitespace is trimmed.

    Values that are not ``str`` are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    without_symbols = re.sub(r'[^\w\s.,!?-]', '', text)
    return re.sub(r'\s+', ' ', without_symbols).strip()

# Clean the review text column, if this export includes one.
if "content" in df.columns:
    df["content"] = df["content"].apply(clean_text)

# Keep only rows whose sentiment label is not "Unknown", then drop rows that
# exactly duplicate an earlier row.
df = df.loc[df["Sentiment"].ne("Unknown")].drop_duplicates()

# Save the result without the index column.
output_file = "cleaned_puregym_data.csv"
df.to_csv(output_file, encoding='utf-8-sig', index=False)

print(f"Sentiment analysis completed. Cleaned file saved: {output_file}")


Sentiment analysis completed. Cleaned file saved: cleaned_puregym_data.csv
