In [13]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string

nltk.download('wordnet')

df = pd.read_csv("WomensClothingE-CommerceReviews.csv")  # Replace "your_dataset.csv" with the path to your CSV file

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    if isinstance(text, str):

        tokens = word_tokenize(text)

        tokens = [word.lower() for word in tokens]

        stop_words = set(stopwords.words('english'))
        tokens = [word for word in tokens if word not in stop_words]
 
        tokens = [word.translate(str.maketrans('', '', string.punctuation)) for word in tokens]

        tokens = [word for word in tokens if word.strip()]

        stemmed_tokens = [stemmer.stem(word) for word in tokens]

        lemmatized_tokens = [lemmatizer.lemmatize(word) for word in stemmed_tokens]
        
        return lemmatized_tokens
    else:
        return []

df['Review Text'] = df['Review Text'].apply(preprocess_text)

print(df['Review Text'])


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kavas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0                  [absolut, wonder, silki, sexi, comfort]
1        [love, dress, s, sooo, pretti, happen, find, s...
2        [high, hope, dress, realli, want, work, initi,...
3        [love, love, love, jumpsuit, s, fun, flirti, f...
4        [shirt, flatter, due, adjust, front, tie, perf...
                               ...                        
23481    [happi, snag, dress, great, price, s, easi, sl...
23482    [remind, matern, cloth, soft, stretchi, shini,...
23483    [fit, well, top, see, never, would, work, m, g...
23484    [bought, dress, wed, summer, s, cute, unfortun...
23485    [dress, love, platinum, feminin, fit, perfectl...
Name: Review Text, Length: 23486, dtype: object


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

filtered_df = df[df['Division Name'].isin(['General Petite', 'Intimates'])]

tokenized_texts = filtered_df['Review Text'].apply(lambda x: ' '.join(x))

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(tokenized_texts)

cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

def identify_similar_reviews(similarity_matrix, threshold=0.7):
    similar_reviews = {}
    for i in range(len(similarity_matrix)):
        similar_reviews[i] = [j for j, score in enumerate(similarity_matrix[i]) if score > threshold and j != i]
    return similar_reviews

similar_reviews = identify_similar_reviews(cosine_sim)

for i, similar_list in similar_reviews.items():
    if similar_list:
        print(f"Review {i}: Similar to {similar_list}")


Review 79: Similar to [151, 954, 5876]
Review 151: Similar to [79, 954, 5876]
Review 215: Similar to [4555]
Review 245: Similar to [3456]
Review 954: Similar to [79, 151, 5876]
Review 1437: Similar to [1916, 3420, 4925, 5030]
Review 1916: Similar to [1437, 2670, 3420, 3944, 4925, 5030]
Review 2284: Similar to [7868]
Review 2670: Similar to [1916, 3420, 4925, 5030]
Review 3157: Similar to [7567]
Review 3420: Similar to [1437, 1916, 2670, 3944, 4925, 5030]
Review 3456: Similar to [245]
Review 3602: Similar to [4961]
Review 3895: Similar to [4403]
Review 3944: Similar to [1916, 3420, 4925, 5030]
Review 4023: Similar to [4630]
Review 4403: Similar to [3895]
Review 4555: Similar to [215]
Review 4630: Similar to [4023]
Review 4925: Similar to [1437, 1916, 2670, 3420, 3944, 5030]
Review 4961: Similar to [3602]
Review 5030: Similar to [1437, 1916, 2670, 3420, 3944, 4925]
Review 5876: Similar to [79, 151, 954]
Review 7567: Similar to [3157]
Review 7868: Similar to [2284]
