In [None]:
import wordcloud
from wordcloud import WordCloud
import json
from collections import defaultdict
from array import array
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
from numpy import linalg as la
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
# Fetch the NLTK resources this notebook relies on: the stopword corpus
# (read via stopwords.words) and the Punkt models behind word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Mount Google Drive at /content/drive (requires the google.colab package,
# i.e. a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#DATA LOADING

import json, gzip
from pathlib import Path

def iter_json_objects(path):
    """Lazily yield the JSON documents stored in ``path``.

    Two layouts are supported, selected by the first non-whitespace
    character of the file:

    * a single top-level JSON array -- its elements are yielded one by one
      (streamed with ``ijson`` when that package is importable, otherwise
      the whole array is loaded with ``json.load``);
    * NDJSON / JSON Lines -- every non-blank line is parsed on its own.

    Paths ending in ``.gz`` are opened through ``gzip``. The file is always
    read as UTF-8 text.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the (optionally gzip-compressed) JSON file.

    Yields
    ------
    object
        One decoded JSON value per array element or per non-blank line.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. ``FileNotFoundError``).
    json.JSONDecodeError
        If a line (NDJSON) or the array (``json.load`` fallback) is invalid.
    """
    opener = gzip.open if str(path).endswith('.gz') else open
    with opener(path, 'rt', encoding='utf-8') as f:
        # Peek at the first *non-whitespace* character so that an array file
        # starting with a blank line or indentation is still recognised as
        # an array instead of being mis-parsed line by line.
        first = f.read(1)
        while first and first.isspace():
            first = f.read(1)
        f.seek(0)

        if first != '[':
            # NDJSON: one document per non-blank line.
            for line in f:
                line = line.strip()
                if line:
                    yield json.loads(line)
            return

        # Big array: only the import itself is guarded, so an unrelated
        # ImportError raised mid-iteration can never trigger a second pass.
        try:
            import ijson
        except ImportError:
            ijson = None

        if ijson is not None:
            # NOTE(review): ijson may hand back decimal.Decimal for
            # non-integer numbers whereas json.load returns float -- confirm
            # downstream code tolerates both.
            yield from ijson.items(f, 'item')
        else:
            # May be heavy: the whole array is materialised in memory.
            yield from json.load(f)

# usage
# Materialises every record of the dataset in memory as a list.
# NOTE(review): the relative path is resolved against the kernel's working
# directory, and the recorded run of this cell ended in FileNotFoundError --
# confirm where the dataset actually lives (e.g. under the mounted Drive).
records = list(iter_json_objects('fashion_products_dataset.json'))


FileNotFoundError: [Errno 2] No such file or directory: 'fashion_products_dataset.json'

In [None]:
# === Minimal preprocessing with NLTK tokenizer (Option A) =====================
# Run this cell once. If re-running, downloads will be no-ops.

#DATA PREPARATION

import re, numpy as np, nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Make sure every NLTK resource used below is present. Each entry is
# (lookup path, download name, optional?) and is only downloaded when the
# lookup fails, so re-running the cell is cheap.
_NLTK_RESOURCES = (
    ("tokenizers/punkt", "punkt", False),
    # Some environments also need this; it is not present on all NLTK
    # builds, so a failed download is tolerated.
    ("tokenizers/punkt_tab", "punkt_tab", True),
    ("corpora/stopwords", "stopwords", False),
)

for _lookup_path, _package, _optional in _NLTK_RESOURCES:
    try:
        nltk.data.find(_lookup_path)
    except LookupError:
        try:
            nltk.download(_package, quiet=True)
        except Exception:
            if not _optional:
                raise

# Shared preprocessing state: English stopword set and a Porter stemmer.
stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))

def _num(x):
    if x is None: return np.nan
    if isinstance(x,(int,float)): return float(x)
    s = str(x).replace(",","").strip().lower()
    if s.endswith("% off"): s = s.split("%")[0]
    if s in ("true","false"): return float(s=="true")
    try: return float(s)
    except: return np.nan

def _details_to_text(dets):
    if not dets: return ""
    parts = []
    for d in dets:
        if isinstance(d, dict):
            for k,v in d.items():
                parts.append(f"{k} {v}")
    return " ".join(parts)

def product_processor(rec):
    """Turn one raw product record into a flat, analysis-ready row.

    The title, description and flattened product details are concatenated,
    tokenised with NLTK, lower-cased, restricted to alphanumeric tokens,
    stripped of English stopwords and of tokens with two characters or
    fewer, and Porter-stemmed. The stems are stored space-joined under
    ``"tokens"``.

    Parameters
    ----------
    rec : dict
        Raw product record; missing keys fall back to ``""`` (or ``[]`` for
        ``product_details``).

    Returns
    -------
    dict
        The original descriptive fields, the numeric fields coerced through
        ``_num`` (``out_of_stock``, ``selling_price``, ``discount``,
        ``actual_price``, ``average_rating``) and the cleaned ``tokens``
        string.
    """
    # 1) gather text
    title   = rec.get("title", "")
    desc    = rec.get("description", "")
    details = _details_to_text(rec.get("product_details"))

    # A key that is present but null comes back as None (the default only
    # covers *missing* keys); formatting it directly would inject the literal
    # word "None" into the text, so nulls are replaced by "" for tokenisation.
    title_txt = "" if title is None else title
    desc_txt  = "" if desc is None else desc
    raw_txt   = f"{title_txt} {desc_txt} {details}"

    # 2) tokenize -> lowercase -> keep alnum -> stopwords -> stem
    toks = word_tokenize(raw_txt)
    toks = [t.lower() for t in toks if t.isalnum()]
    toks = [stemmer.stem(t) for t in toks if t not in stop_words and len(t) > 2]
    cleaned = " ".join(toks)

    # 3) return processed row (preserve all required fields, raw values kept)
    return {
        "pid": rec.get("pid", ""),
        "title": title,
        "description": desc,
        "brand": rec.get("brand", ""),
        "category": rec.get("category", ""),
        "sub_category": rec.get("sub_category", ""),
        "product_details": rec.get("product_details") or [],
        "seller": rec.get("seller", ""),
        "out_of_stock": _num(rec.get("out_of_stock")),
        "selling_price": _num(rec.get("selling_price")),
        "discount": _num(rec.get("discount")),
        "actual_price": _num(rec.get("actual_price")),
        "average_rating": _num(rec.get("average_rating")),
        "url": rec.get("url", ""),
        "tokens": cleaned
    }
# ============================================================================

# Example usage (assumes you already defined iter_json_objects(...)):
# processed_rows = []
# for rec in iter_json_objects("fashion_products_dataset.json"):
#     processed_rows.append(product_processor(rec))
# import pandas as pd
# pd.DataFrame(processed_rows).to_csv("cache/processed_products.csv", index=False)


In [None]:
# Run every raw record through the preprocessing pipeline (list of row dicts)
processed_data = list(
    map(product_processor, iter_json_objects("fashion_products_dataset.json"))
)

# Build the frame in one step and key it by product ID
products_df = pd.DataFrame(processed_data).set_index("pid")

# Show the first four rows as a quick sanity check
products_df.head(4)


In [None]:
# Distribution of the cleaned-text length per product

# Copy of the products frame with an auxiliary token-count column
vocab_df = products_df.copy()
vocab_df['Wordcount'] = (
    products_df['tokens']
    .fillna('')
    .map(lambda text: len(text.split()))
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(vocab_df['Wordcount'], bins=22, kde=True, ax=ax)

ax.set_title('Product text length plot')
ax.set_xlabel('Number of words')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Vocabulary statistics for the cleaned product text

# Flatten every product's token string into one list of words.
# explode() emits NaN for a row whose token list is empty, so those
# placeholders are dropped; otherwise NaN would be counted as a "word",
# inflating the vocabulary size and leaking into the word-cloud frequencies.
whole_content = (
    products_df['tokens']
    .fillna('')
    .str.split()
    .explode()
    .dropna()
    .tolist()
)

# Counting appearances
word_counts = Counter(whole_content)

# Plain dict view of the counts (word -> number of occurrences)
vocab_dict = dict(word_counts)
print('Product vocabulary size is', len(vocab_dict))

# (word, count) pairs ordered from most to least frequent
sorted_dict = word_counts.most_common()
print('\nThe top 5 most appearing words are:')
for item in sorted_dict[:5]:
    print(item)


In [None]:
# Render the token frequencies as a word cloud
cloud_builder = WordCloud(width=800, height=400, background_color='white')
words_wc = cloud_builder.generate_from_frequencies(vocab_dict)

fig, ax = plt.subplots(figsize=(14, 8))
ax.imshow(words_wc, interpolation='bilinear')
ax.axis('off')  # a word cloud has no meaningful axes
plt.show()


In [None]:
# Rank the products from highest to lowest average rating (on a copy)
rated_df = products_df.copy()
rated_sorted = rated_df.sort_values('average_rating', ascending=False)

print('The following are the Top 5 Highest-Rated Products:')
# Drop the identifier/link columns for display, then show the first five rows
top_rated_view = rated_sorted.reset_index().drop(columns=['pid', 'url'], errors='ignore')
top_rated_view.head(5)


In [None]:
# Ten most frequent brands and sellers in the catalogue.
# Reads from `products_df`; the bare name `df` is not defined in any cell
# shown in this notebook and would raise NameError on a fresh kernel.
top_brands = products_df['brand'].value_counts().head(10)
top_sellers = products_df['seller'].value_counts().head(10)
