<a href="https://colab.research.google.com/github/janhavidhamak/NLP_Project/blob/main/Phase1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:

# Phase 1: Data Preprocessing

!pip install langdetect googletrans==4.0.0-rc1 nltk pandas

import pandas as pd
import re
from langdetect import detect
from googletrans import Translator
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used further down: the punkt_tab tokenizer data,
# the stop-word lists, and the WordNet corpus.
for resource_name in ('punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource_name)

# 1. Load dataset
df = pd.read_csv("amazon_reviews.csv")

# Normalise every header (trim surrounding whitespace, lowercase) so the
# check for the review column ignores case and stray spaces.
df = df.rename(columns=lambda name: name.strip().lower())
if "review" not in df.columns:
    raise ValueError("The dataset must have a 'Review' column.")

# Discard rows that have no review text, then renumber the index from 0.
df = df.dropna(subset=["review"])
df = df.reset_index(drop=True)
print(f" Loaded {len(df)} reviews")

# 2. Detect language
def detect_language(text) -> str:
    """Return the language code that langdetect reports for ``text``.

    Args:
        text: Value to inspect; it is converted with ``str()`` before
            detection.

    Returns:
        The code returned by ``detect``, or ``"unknown"`` when the input is
        blank or the detector raises.
    """
    candidate = str(text)
    # Blank input carries nothing to detect; skip the detector call.
    if not candidate.strip():
        return "unknown"
    try:
        return detect(candidate)
    except Exception:
        # Best-effort: a detection failure must not abort the whole run, but
        # KeyboardInterrupt/SystemExit are deliberately allowed to propagate.
        return "unknown"

# Tag every review with its detected language code.
df["language"] = df["review"].map(detect_language)

# 3. Translate non-English reviews (non-Transformer)
# One googletrans client is created here and shared by translate_to_english.
translator = Translator()

def translate_to_english(text, lang):
    """Translate ``text`` to English unless ``lang`` says it is not needed.

    Args:
        text: Review text to translate.
        lang: Language code for ``text``. ``"en"`` and ``"unknown"`` mean no
            translation is attempted.

    Returns:
        The translated text, or ``text`` unchanged when no translation is
        attempted, the translation call fails, or the result carries no text.
    """
    if lang in ("en", "unknown"):
        return text

    try:
        translated = translator.translate(text, src=lang, dest='en')
    except Exception:
        # Best-effort: keep the original text when the translation call
        # fails, while still letting KeyboardInterrupt/SystemExit propagate.
        return text

    # Guard against a result object without usable text so that a None never
    # reaches the later cleaning step, where str(None) would become a token.
    result = getattr(translated, "text", None)
    return text if result is None else result

# Translate row by row, pairing each review with its detected language.
# Zipping the two columns avoids DataFrame.apply(axis=1), which builds a
# Series per row and does not yield an assignable column for an empty frame.
df["translated_review"] = [
    translate_to_english(review, lang)
    for review, lang in zip(df["review"], df["language"])
]

# 4. Initial Cleaning
def clean_text(text):
    """Reduce ``text`` to lowercase a-z words separated by single spaces.

    The input is converted with ``str()``. Angle-bracket tags, tokens that
    start with ``http`` or ``www``, and every character that is neither an
    ASCII letter nor whitespace are deleted (not replaced by a space). The
    remainder is lowercased and whitespace runs are collapsed.
    """
    cleaned = str(text)
    # Deletions run in this order: tags, then links, then non-letters.
    for pattern in (r'<.*?>', r'http\S+|www\S+', r'[^a-zA-Z\s]'):
        cleaned = re.sub(pattern, '', cleaned)
    # Lowercasing happens after the letter filter, as in the original order.
    return re.sub(r'\s+', ' ', cleaned.lower()).strip()

# Shared English stop-word set and lemmatizer instance, read by
# preprocess_tokens.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Run the regex-based cleanup over every translated review.
df["clean_text"] = df["translated_review"].map(clean_text)

def preprocess_tokens(text):
    """Tokenize ``text``, drop stop words, lemmatize the rest and re-join.

    Tokens come from ``word_tokenize``. Any token found in the module-level
    ``stop_words`` set is skipped, and the others are passed through
    ``lemmatizer.lemmatize``. The result is the kept tokens joined by single
    spaces.
    """
    kept = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)

# Tokenize, remove stop words and lemmatize each cleaned review.
df["processed_text"] = df["clean_text"].map(preprocess_tokens)

# Write all columns to disk without the index column.
df.to_csv("processed_reviews.csv", index=False)
print(" Phase 1 completed! File saved as 'processed_reviews.csv'")

# Preview the first ten rows as the cell output.
df.head(10)




[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


✅ Loaded 130 reviews
✅ Phase 1 completed! File saved as 'processed_reviews.csv'


Unnamed: 0,review,language,translated_review,clean_text,processed_text
0,I use pretty strong facewash so my face can ge...,en,I use pretty strong facewash so my face can ge...,i use pretty strong facewash so my face can ge...,use pretty strong facewash face get bit dry sh...
1,I use this every night. It moisturizes great f...,en,I use this every night. It moisturizes great f...,i use this every night it moisturizes great fo...,use every night moisturizes great sensitive skin
2,Really easy to apply. Smooth and absorbs quickly,en,Really easy to apply. Smooth and absorbs quickly,really easy to apply smooth and absorbs quickly,really easy apply smooth absorbs quickly
3,will always be buying this it’s light weight a...,en,will always be buying this it’s light weight a...,will always be buying this its light weight an...,always buying light weight greasy
4,"This is a very nice and hydrating moisturizer,...",en,"This is a very nice and hydrating moisturizer,...",this is a very nice and hydrating moisturizer ...,nice hydrating moisturizer however product use...
5,I am turning 41 and have noticed my skin becom...,en,I am turning 41 and have noticed my skin becom...,i am turning and have noticed my skin becoming...,turning noticed skin becoming dry wrinkly rese...
6,Great for nighttime use. It's on the thin side...,en,Great for nighttime use. It's on the thin side...,great for nighttime use its on the thin side a...,great nighttime use thin side nongreasy pump d...
7,This is perfect for me to use after using my R...,en,This is perfect for me to use after using my R...,this is perfect for me to use after using my r...,perfect use using retina night ingredient hyla...
8,This is my go to love this stuff. Very gental ...,en,This is my go to love this stuff. Very gental ...,this is my go to love this stuff very gental o...,go love stuff gental skin
9,"Light moisturizer, better for daytime",en,"Light moisturizer, better for daytime",light moisturizer better for daytime,light moisturizer better daytime
