In [3]:
# importing all necessary libraries
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from pymongo import MongoClient
import os

# Make sure the NLTK "punkt" tokenizer data is available locally before any
# tokenizing happens.
nltk.download("punkt")

# Read the raw reviews into a DataFrame. The path is relative to the current
# working directory; adjust it if the CSV lives elsewhere.
reviews_csv = "Canada Goose Reviews.csv"
df = pd.read_csv(reviews_csv)

# cleaning the 'Review' column
def clean_review(text):
    """Return *text* with leading and trailing whitespace removed.

    Any value that is not a ``str`` (for example ``None``, a float NaN or a
    number) is replaced by the empty string, so callers always get a string
    back.
    """
    # Further normalisation (lower-casing, punctuation removal, ...) could be
    # added here; for now only surrounding whitespace is stripped.
    return text.strip() if isinstance(text, str) else ""

# Remove rows with a missing 'Review' BEFORE cleaning. clean_review() maps
# every non-string value (NaN included) to "", so a dropna() issued after the
# cleaning step would never find anything to drop and missing reviews would
# be kept as empty strings.
df = df.dropna(subset=['Review'])

# Apply the cleaning function to the remaining 'Review' values.
df['Review'] = df['Review'].apply(clean_review)

# preprocessing the 'Review' column
def preprocess_review(text):
    """Tokenize a review into sentences, and each sentence into words.

    Returns a list with one entry per sentence found by ``sent_tokenize``;
    each entry is the list of tokens ``word_tokenize`` produced for that
    sentence. Best effort: if tokenizing raises, the error is printed and an
    empty list is returned instead.
    """
    try:
        tokens_per_sentence = list(map(word_tokenize, sent_tokenize(text)))
    except Exception as e:
        # Deliberately broad: one bad review must not abort the whole run.
        print(f"Error processing review: {e}")
        return []
    return tokens_per_sentence

# Tokenize every review; the result is a list of sentences, each a list of
# word tokens (see preprocess_review).
df['preprocessed'] = df['Review'].apply(preprocess_review)

# Connect to MongoDB Atlas.
# The connection string (which embeds the username and password) is read from
# the environment only. It must never be hard-coded as a fallback: anything
# committed with the script is readable by everyone who can see the file.
# NOTE(review): a credential that was previously embedded here should be
# treated as leaked and rotated.
uri = os.getenv("MONGO_URI")
if not uri:
    raise RuntimeError(
        "MONGO_URI environment variable is not set; "
        "export the MongoDB connection string before running this script."
    )
client = MongoClient(uri)

# Target database and collection.
db = client["CanadaGooseReviews"]
collection = db["PreprocessedData"]

# One document per review, holding the cleaned text and its tokens.
records = df[['Review', 'preprocessed']].to_dict(orient='records')

# Insert in batches so a large dataset is not sent as one oversized request.
batch_size = 1000
for i in range(0, len(records), batch_size):
    batch = records[i:i + batch_size]
    collection.insert_many(batch)

print(f"✅ {len(records)} records successfully inserted into MongoDB!")

# Keep a local copy of the preprocessed data next to the notebook.
df.to_csv('preprocessed_reviews.csv', index=False)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\wwwbi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


✅ 1260 records successfully inserted into MongoDB!
