In [1]:
import re
import time

import pandas as pd
from nltk.tokenize import ToktokTokenizer
from nltk.corpus import stopwords as stp
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer


def text_normalize(samples: pd.Series) -> pd.Series:
    samples = samples.apply(lambda x: x.lower()) # convert all samples to lower case

    samples = samples.apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x)) # replace any character which is not string or space with blank

    return samples

In [2]:

def text_tokenize(samples: pd.Series) -> pd.Series:
    toktok = ToktokTokenizer() # Much better speed than word_tokenize
    samples = samples.apply(lambda x: toktok.tokenize(x))

    return samples

In [3]:

def remove_stopwords(samples: pd.Series) -> pd.Series:
    stopwords = set(stp.words('english'))
    
    clean_samples = samples.apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords]))

    return clean_samples

In [4]:

def text_lemmatize(samples: pd.Series) -> pd.Series:
    wnl = WordNetLemmatizer()

    samples = samples.apply(lambda x: wnl.lemmatize(x))

    return samples

In [5]:

def tfidf(samples: pd.Series) -> pd.Series:
    return TfidfVectorizer().fit_transform(samples)

In [18]:
df = pd.read_csv('fakeReviewData.csv')

df.head()

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfortable. I love it!Very pretty"
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I've had mine for a couple of years"
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and feel of this pillow.
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it is a great product for the price! I"
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the set for two months now and have not been


In [19]:
df['processed_text'] = text_normalize(df['text_'])

df['processed_text'][0]

'love this  well made sturdy and very comfortable  i love itvery pretty'

In [20]:
df['processed_text'] = remove_stopwords(df['processed_text'])

In [21]:
df['processed_text'] = text_lemmatize(df['processed_text'])

In [22]:
td = TfidfVectorizer()

# td.fit_transform(df['text_']).toarray()bbr

In [23]:
df['text_'].tolist()

['Love this!  Well made, sturdy, and very comfortable.  I love it!Very pretty',
 "love it, a great upgrade from the original.  I've had mine for a couple of years",
 'This pillow saved my back. I love the look and feel of this pillow.',
 'Missing information on how to use it, but it is a great product for the price!  I',
 'Very nice set. Good quality. We have had the set for two months now and have not been',
 'I WANTED DIFFERENT FLAVORS BUT THEY ARE NOT.',
 'They are the perfect touch for me and the only thing I wish they had a little more space.',
 'These done fit well and look great.  I love the smoothness of the edges and the extra',
 "Great big numbers & easy to read, the only thing I didn't like is the size of the",
 'My son loves this comforter and it is very well made.  We also have a baby',
 "As advertised. 5th one I've had. The only problem is that it's not really a",
 'Very handy for one of my kids and the tools are included in the package. I have one in',
 'Did someone say,

In [24]:
df['processed_text'].tolist()

['love well made sturdy comfortable love itvery pretty',
 'love great upgrade original ive mine couple years',
 'pillow saved back love look feel pillow',
 'missing information use great product price',
 'nice set good quality set two months',
 'wanted different flavors',
 'perfect touch thing wish little space',
 'done fit well look great love smoothness edges extra',
 'great big numbers easy read thing didnt like size',
 'son loves comforter well made also baby',
 'advertised th one ive problem really',
 'handy one kids tools included package one',
 'someone say oriental great product',
 'flimsy quality would expect piece furniture',
 'makes may tea stirring problem kind hard put',
 'absolutely adorable excellent price wooden ones months',
 'love perfect size entire familyvery good quality',
 'look beautiful nice problem really mesh one',
 'exactly would expect love look feel pillow',
 'stars would highly recommend item love blanket',
 'great little egg masher months',
 'advertised e

In [25]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight and efficient

# Convert text to embeddings
embeddings = model.encode(df['processed_text'].tolist(), batch_size=32, show_progress_bar=True)

embeddings

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1264 [00:00<?, ?it/s]

array([[-0.06552677,  0.00191031,  0.04567615, ..., -0.01979848,
         0.04743516,  0.02752347],
       [-0.1199374 ,  0.08962215,  0.06296311, ...,  0.03416116,
        -0.02431541,  0.01837237],
       [-0.05317529,  0.0186075 ,  0.03800445, ...,  0.00214915,
         0.04988267, -0.02065089],
       ...,
       [-0.09343169,  0.0228139 ,  0.07198668, ..., -0.08440357,
        -0.02782168,  0.02169881],
       [-0.03023172,  0.07027808,  0.10478224, ..., -0.04368389,
        -0.00730945, -0.02266176],
       [-0.07920948, -0.02798494,  0.07639812, ..., -0.0816136 ,
        -0.0256085 ,  0.01277232]], shape=(40432, 384), dtype=float32)

In [27]:
df['embeddings'] = list(embeddings)

In [None]:
df.head()

In [1]:
import pandas as pd

pd.read_json('processed_data.json', orient='records', lines=True)

Unnamed: 0,category,rating,label,text_,text_processed,text_tokenized,text_embeddings
0,Home and Kitchen,5,CG,"Love this! Well made, sturdy, and very comfor...",love well made sturdy comfortable love itvery ...,"[love, well, made, sturdy, comfortable, love, ...","[-0.06552676860000001, 0.0019102807000000001, ..."
1,Home and Kitchen,5,CG,"love it, a great upgrade from the original. I...",love great upgrade original ive mine couple years,"[love, great, upgrade, original, ive, mine, co...","[-0.11993740500000001, 0.0896221548, 0.0629631..."
2,Home and Kitchen,5,CG,This pillow saved my back. I love the look and...,pillow saved back love look feel pillow,"[pillow, saved, back, love, look, feel, pillow]","[-0.0531752855, 0.0186075028, 0.0380044468, 0...."
3,Home and Kitchen,1,CG,"Missing information on how to use it, but it i...",missing information use great product price,"[missing, information, use, great, product, pr...","[0.007157366300000001, 0.0636947528, 0.0084662..."
4,Home and Kitchen,5,CG,Very nice set. Good quality. We have had the s...,nice set good quality set two months,"[nice, set, good, quality, set, two, months]","[-0.0369832478, 0.011753053400000001, 0.041596..."
...,...,...,...,...,...,...,...
40415,Clothing Shoes and Jewelry,4,OR,I had read some reviews saying that this bra r...,read reviews saying bra ran small ordered two ...,"[read, reviews, saying, bra, ran, small, order...",
40416,Clothing Shoes and Jewelry,5,CG,I wasn't sure exactly what it would be. It is ...,wasnt sure exactly would little large small si...,"[wasnt, sure, exactly, would, little, large, s...",
40417,Clothing Shoes and Jewelry,2,OR,"You can wear the hood by itself, wear it with ...",wear hood wear hood wear jacket without hood s...,"[wear, hood, wear, hood, wear, jacket, without...",
40418,Clothing Shoes and Jewelry,1,CG,I liked nothing about this dress. The only rea...,liked nothing dress reason gave stars ordered ...,"[liked, nothing, dress, reason, gave, stars, o...",
