<div class="markdown-google-sans">

## Preprocessing code
</div>

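Preprocessing pipeline for the fake review detector: loads the review data, cleans and normalizes the text, tokenizes it, removes stopwords, lemmatizes the tokens, and builds TF-IDF features.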

In [None]:
import pandas as pd
import string
import re
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the raw review data
df = pd.read_csv('/content/fakeReviewData.csv')

# Check the data structure.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called as its own statement; passing it to print() emitted a stray "None"
# and showed the report ahead of the head() preview.
print(df.head())
df.info()
print("\n--------")
print("Rows containing NULL values",df.isnull().sum())  # per-column NULL counts
print ("\n--------")
print ("Duplicate value containing rows", df.duplicated().sum())
print ("\n--------")
print("Data Scan Completed")

# Data cleaning
# 1) drop rows that are exact duplicates of an earlier row
df = df.drop_duplicates()
print("\nDuplicate Data Removed")

# 2) drop rows whose review text is missing
df = df.dropna(subset=['text_'])
print("\nRemoved Rows containing NULL values")

# 3) keep only reviews longer than 10 characters
df = df[df['text_'].str.len() > 10]
print("\nRemoved Rows containing short reviews\n")

# Re-inspect the cleaned frame (info() prints by itself, see above)
df.info()
print("\nDuplicates:", df.duplicated().sum())

# Data Normalization
def normalize_text(text):
    """Lowercase a review and strip digits, punctuation and special characters.

    Args:
        text: The review text as a ``str``.

    Returns:
        The lowercased text with every run of digits deleted and each
        punctuation/special character (underscore included) replaced by a
        single space. Whitespace is not collapsed or trimmed.
    """
    text = text.lower()
    # Digits are deleted outright rather than replaced by a space
    text = re.sub(r'\d+', '', text)
    # \w treats "_" as a word character, so it has to be listed explicitly;
    # otherwise underscores would survive the clean-up
    text = re.sub(r'[^\w\s]|_', ' ', text)
    return text

# Normalize every review and keep the result next to the original text
df['nrm_text'] = df['text_'].map(normalize_text)

# Preview raw vs. normalized text side by side
print(
    "\n",
    df[['text_', 'nrm_text']].head(),
)

print("\nData Normalized\n")

# Tokenization: turn each normalized review into a list of tokens
import nltk
from nltk.tokenize import word_tokenize

# Download the NLTK tokenizer packages used by this step
for _package in ('punkt', 'punkt_tab'):
    nltk.download(_package)

df['tokens'] = df['nrm_text'].map(word_tokenize)
print("\ntokanization Complete")

# Preview the original text next to its tokens
print(
    "\n",
    df[['text_', 'tokens']].head(),
)


# Stopword removal: drop English stopwords from every token list
from nltk.corpus import stopwords
nltk.download('stopwords')

# A set gives constant-time membership checks while filtering
stop_words = set(stopwords.words('english'))


def _without_stopwords(tokens):
    """Return the tokens that are not in ``stop_words``, keeping their order."""
    return [token for token in tokens if token not in stop_words]


df['filt_tokens'] = df['tokens'].map(_without_stopwords)

print("\nStopwords Removed")

# Preview token lists before and after filtering
print(
    "\n",
    df[['tokens', 'filt_tokens']].head(),
)


# Lemmatization: replace each remaining token with its WordNet lemma
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Return a new list with ``lemmatizer.lemmatize`` applied to each token."""
    return list(map(lemmatizer.lemmatize, tokens))


df['lemma_tokens'] = df['filt_tokens'].map(_lemmatize_tokens)

print("\nLemmatized tokens")

# Preview token lists before and after lemmatization
print(
    "\n",
    df[['filt_tokens', 'lemma_tokens']].head(),
)

# Vectorize the output with TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Re-join each lemma list into one space-separated string for the vectorizer
df['final_text'] = df['lemma_tokens'].map(' '.join)
print(
    "\n",
    df[['lemma_tokens', 'final_text']].head(),
)

# Persist the preprocessed frame (all intermediate columns included)
df.to_csv("preprocessed_reviews.csv", index=False)
print("pre done")

# Fit TF-IDF on the joined text; tqdm wraps the column so a progress bar is shown
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(tqdm(df['final_text']))

# Label the matrix columns with the vectorizer's feature names
feature_names = vectorizer.get_feature_names_out()
vec_matrix = pd.DataFrame.sparse.from_spmatrix(X, columns=feature_names)

# NOTE(review): CSV is a dense text format, so this file may become very large
# for a big vocabulary - consider a sparse on-disk format if that is a problem.
vec_matrix.to_csv('tfidf_matrix.csv', index=False)
print("pre processed data ready")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40432 entries, 0 to 40431
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   category  40432 non-null  object 
 1   rating    40432 non-null  float64
 2   label     40432 non-null  object 
 3   text_     40432 non-null  object 
dtypes: float64(1), object(3)
memory usage: 1.2+ MB
             category  rating label  \
0  Home_and_Kitchen_5     5.0    CG   
1  Home_and_Kitchen_5     5.0    CG   
2  Home_and_Kitchen_5     5.0    CG   
3  Home_and_Kitchen_5     1.0    CG   
4  Home_and_Kitchen_5     5.0    CG   

                                               text_  
0  Love this!  Well made, sturdy, and very comfor...  
1  love it, a great upgrade from the original.  I...  
2  This pillow saved my back. I love the look and...  
3  Missing information on how to use it, but it i...  
4  Very nice set. Good quality. We have had the s...   None 
--------
Rows containing NULL value

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.



tokanization Complete

                                                text_  \
0  Love this!  Well made, sturdy, and very comfor...   
1  love it, a great upgrade from the original.  I...   
2  This pillow saved my back. I love the look and...   
3  Missing information on how to use it, but it i...   
4  Very nice set. Good quality. We have had the s...   

                                              tokens  
0  [love, this, well, made, sturdy, and, very, co...  
1  [love, it, a, great, upgrade, from, the, origi...  
2  [this, pillow, saved, my, back, i, love, the, ...  
3  [missing, information, on, how, to, use, it, b...  
4  [very, nice, set, good, quality, we, have, had...  


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.



Stopwords Removed

                                               tokens  \
0  [love, this, well, made, sturdy, and, very, co...   
1  [love, it, a, great, upgrade, from, the, origi...   
2  [this, pillow, saved, my, back, i, love, the, ...   
3  [missing, information, on, how, to, use, it, b...   
4  [very, nice, set, good, quality, we, have, had...   

                                         filt_tokens  
0  [love, well, made, sturdy, comfortable, love, ...  
1  [love, great, upgrade, original, mine, couple,...  
2    [pillow, saved, back, love, look, feel, pillow]  
3  [missing, information, use, great, product, pr...  
4       [nice, set, good, quality, set, two, months]  


[nltk_data] Downloading package wordnet to /root/nltk_data...



Lemmatized tokens

                                          filt_tokens  \
0  [love, well, made, sturdy, comfortable, love, ...   
1  [love, great, upgrade, original, mine, couple,...   
2    [pillow, saved, back, love, look, feel, pillow]   
3  [missing, information, use, great, product, pr...   
4       [nice, set, good, quality, set, two, months]   

                                        lemma_tokens  
0  [love, well, made, sturdy, comfortable, love, ...  
1  [love, great, upgrade, original, mine, couple,...  
2    [pillow, saved, back, love, look, feel, pillow]  
3  [missing, information, use, great, product, pr...  
4        [nice, set, good, quality, set, two, month]  

                                         lemma_tokens  \
0  [love, well, made, sturdy, comfortable, love, ...   
1  [love, great, upgrade, original, mine, couple,...   
2    [pillow, saved, back, love, look, feel, pillow]   
3  [missing, information, use, great, product, pr...   
4        [nice, set, good, qua

100%|██████████| 40420/40420 [00:00<00:00, 40662.69it/s]
