In [1]:
pip install datasets pandas scikit-learn nltk vaderSentiment transformers gensim


Note: you may need to restart the kernel to use updated packages.


In [2]:
import datasets
from datasets import load_dataset

# Restrict the `datasets` logger to errors so the load below stays quiet.
datasets.logging.set_verbosity_error()

# Amazon Fashion raw reviews; the first record of the "full" split is printed
# as a quick look at the available fields.
dataset = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_review_Amazon_Fashion",
    trust_remote_code=True,
)
print(dataset["full"][0])


{'rating': 5.0, 'title': 'Pretty locket', 'text': 'I think this locket is really pretty. The inside back is a solid silver depression and the front is a dome that is not solid (knotted). You could use it to store a small photo, lock of hair, etc but I use it when I need to carry medication with me. Closes securely. High quality & very pretty.', 'images': [], 'asin': 'B00LOPVX74', 'parent_asin': 'B00LOPVX74', 'user_id': 'AGBFYI2DDIKXC5Y4FARTYDTQBMFQ', 'timestamp': 1578528394489, 'helpful_vote': 3, 'verified_purchase': True}


In [3]:
import pandas as pd
import re

# Materialise the "full" split as a DataFrame, discard reviews without a body
# or a rating, then collapse repeated (text, user_id) pairs.
df = (
    pd.DataFrame(dataset['full'])
    .dropna(subset=['text', 'rating'])
    .drop_duplicates(subset=['text', 'user_id'])
)

# Normalise a review: lower-case it and strip punctuation
def preprocess_text(text):
    """Return ``text`` lower-cased with every non-word, non-whitespace character removed."""
    return re.sub(r'[^\w\s]', '', text.lower())

# Keep the raw review untouched and store the normalised copy next to it.
df['cleaned_text'] = df['text'].map(preprocess_text)

print(df[['cleaned_text', 'rating']].head())


                                        cleaned_text  rating
0  i think this locket is really pretty the insid...     5.0
1                                              great     5.0
2  one of the stones fell out within the first 2 ...     2.0
3  crappy socks money wasted bought to wear with ...     1.0
4  i love these glasses  they fit perfectly over ...     5.0


In [4]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# Fetch the lexicon that VADER scores against
nltk.download('vader_lexicon')

# One analyzer instance is shared by every review
sia = SentimentIntensityAnalyzer()

# Keep only the 'compound' entry of the polarity scores for each cleaned review
df['sentiment'] = df['cleaned_text'].map(
    lambda review: sia.polarity_scores(review)['compound']
)

# Bucket a compound score into a sentiment label
def categorize_sentiment(score):
    """Map ``score`` to 'positive' (>= 0.05), 'negative' (<= -0.05) or 'neutral' (anything else)."""
    if score >= 0.05:
        return 'positive'
    if score <= -0.05:
        return 'negative'
    return 'neutral'

# Turn each numeric score into its label
df['sentiment_category'] = df['sentiment'].map(categorize_sentiment)

print(df[['cleaned_text', 'sentiment', 'sentiment_category']].head())


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/dgxuser55/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


                                        cleaned_text  sentiment  \
0  i think this locket is really pretty the insid...     0.8532   
1                                              great     0.6249   
2  one of the stones fell out within the first 2 ...     0.0000   
3  crappy socks money wasted bought to wear with ...    -0.6908   
4  i love these glasses  they fit perfectly over ...     0.9516   

  sentiment_category  
0           positive  
1           positive  
2            neutral  
3           negative  
4           positive  


In [None]:
import pandas as pd

# Labelled fake-review CSV; the relative path is resolved against the
# notebook's working directory
csv_file_path = r"fake reviews dataset.csv"

# Read the whole file into a DataFrame and preview the first rows
fake_review_df = pd.read_csv(filepath_or_buffer=csv_file_path)

print(fake_review_df.head())


In [None]:
from sklearn.model_selection import train_test_split

# Prepare the dataset for BERT.
# Rows with a missing review body or label are removed first: preprocess_text
# calls .lower() on every value, which fails on the float NaN that pandas
# stores for empty CSV cells.
fake_review_df = fake_review_df.dropna(subset=['text_', 'label'])
fake_review_df = fake_review_df.assign(text=fake_review_df['text_'].apply(preprocess_text))
X = fake_review_df['text']
y = fake_review_df['label']

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.head(), y_train.head())


In [None]:
pip install tensorflow

In [None]:
pip install tqdm


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification, InputExample, InputFeatures
from tqdm import tqdm

# Text normalisation for the BERT inputs: case-folding only.
# NOTE(review): this rebinds `preprocess_text`, which an earlier cell defines
# as lower-casing plus punctuation removal; cells run afterwards get this
# lighter version.
def preprocess_text(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

# Read the labelled reviews, discard rows without text or label, and add the
# lower-cased text as a new 'text' column
csv_file_path = r"fake reviews dataset.csv"
fake_review_df = (
    pd.read_csv(csv_file_path)
    .dropna(subset=['text_', 'label'])
    .assign(text=lambda frame: frame['text_'].apply(preprocess_text))
)

# Model inputs and targets
X, y = fake_review_df['text'], fake_review_df['label']

# 80/20 split, stratified on the label and seeded for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Keep only the first 5000 training and first 1000 test rows for a faster run
X_train, y_train = X_train[:5000], y_train[:5000]
X_test, y_test = X_test[:1000], y_test[:1000]

# Encode the string labels: 'CG' -> 0, 'OR' -> 1
label_map = {'CG': 0, 'OR': 1}
y_train, y_test = y_train.map(label_map), y_test.map(label_map)

# A label missing from label_map would have been mapped to NaN; stop here if so
assert y_train.isna().sum() == 0, "NaN values found in y_train"
assert y_test.isna().sum() == 0, "NaN values found in y_test"

train_data = pd.DataFrame({'text': X_train, 'label': y_train})
test_data = pd.DataFrame({'text': X_test, 'label': y_test})

# Wrap the rows of both frames as transformers InputExample objects
def convert_data_to_examples(train, test):
    """Build one InputExample per row of ``train`` and of ``test``.

    Both frames must expose 'text' and 'label' columns. Returns the pair
    ``(train_examples, validation_examples)``, each being the result of a
    row-wise ``apply`` over the corresponding frame.
    """
    def to_example(row):
        # Single-sentence input: no guid and no second text segment.
        return InputExample(guid=None, text_a=row['text'], text_b=None, label=row['label'])

    return train.apply(to_example, axis=1), test.apply(to_example, axis=1)

# Convert the training and held-out frames in one call
train_InputExamples, validation_InputExamples = convert_data_to_examples(
    train=train_data,
    test=test_data,
)

# Function to convert examples to TensorFlow dataset
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Encode ``examples`` with ``tokenizer`` and wrap them in a ``tf.data.Dataset``.

    Args:
        examples: iterable of objects exposing ``text_a`` and ``label``.
        tokenizer: tokenizer providing ``encode_plus``.
        max_length: every sequence is truncated and padded to this many tokens.

    Returns:
        An unbatched dataset yielding ``(inputs, label)`` pairs, where
        ``inputs`` is a dict with int32 ``input_ids``, ``attention_mask`` and
        ``token_type_ids`` and ``label`` is an int64 scalar.
    """
    features = []
    for e in examples:
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length,
            # `padding="max_length"` is the supported spelling of the
            # deprecated `pad_to_max_length=True` flag.
            padding="max_length",
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
        )
        input_ids, token_type_ids = input_dict["input_ids"], input_dict["token_type_ids"]
        attention_mask = input_dict["attention_mask"]

        features.append(
            InputFeatures(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=e.label)
        )

    def gen():
        # Replays the pre-tokenised features in their original order.
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({
            "input_ids": tf.int32,
            "attention_mask": tf.int32,
            "token_type_ids": tf.int32,
        },
        tf.int64),
        ({
            "input_ids": tf.TensorShape([None]),
            "attention_mask": tf.TensorShape([None]),
            "token_type_ids": tf.TensorShape([None]),
        },
        tf.TensorShape([])),
    )

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Single source of truth for the batch size: used for both datasets and for
# the progress-bar totals below
batch_size = 32

# Convert InputExamples to TensorFlow Datasets
train_dataset = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
train_dataset = train_dataset.shuffle(100).batch(batch_size)  # Removed .repeat(2)

validation_dataset = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_dataset = validation_dataset.batch(batch_size)

# Compile the model using the legacy Adam optimizer
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=3e-5, epsilon=1e-08, decay=0.01, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Calculate the length of the training set
len_X_train = len(X_train)
print(f"Length of the training set: {len_X_train}")

# Not used by the code below; kept so the name stays defined for later cells
average_time_per_iteration = 6.44  # seconds

# Ceiling division: the last, partially filled batch is a real step too
# (5000 samples at 32 per batch -> 157 steps, not 156), so the progress bars
# end exactly at their total.
total_batches = -(-len_X_train // batch_size)
total_validation_batches = -(-len(X_test) // batch_size)

# Use a custom training loop with tqdm for progress bar
epochs = 1  # Reduce the number of epochs for quicker training

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    # Train: one optimizer step per batch
    for batch_inputs, batch_labels in tqdm(train_dataset, desc="Training", total=total_batches):
        with tf.GradientTape() as tape:
            logits = model(batch_inputs, training=True)[0]
            loss_value = loss(batch_labels, logits)
        grads = tape.gradient(loss_value, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # Validation: accumulate accuracy over the held-out batches
    for batch_inputs, batch_labels in tqdm(validation_dataset, desc="Validation", total=total_validation_batches):
        logits = model(batch_inputs, training=False)[0]
        metric.update_state(batch_labels, logits)

    print(f"Validation Accuracy: {metric.result().numpy()}")
    # reset_state() is the current name of the deprecated reset_states()
    metric.reset_state()



All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Length of the training set: 5000
Epoch 1/1


Training: 157it [24:23,  9.32s/it]                                          
Validation: 32it [01:20,  2.51s/it]

Validation Accuracy: 0.9580000042915344





In [5]:
import gensim
from gensim import corpora

# Split every cleaned review into tokens with gensim's simple_preprocess
df['tokenized_text'] = df['cleaned_text'].apply(gensim.utils.simple_preprocess)

# Vocabulary over all tokenised reviews, then the bag-of-words form of each one
dictionary = corpora.Dictionary(df['tokenized_text'])
corpus = list(map(dictionary.doc2bow, df['tokenized_text']))

print(corpus[:1])


[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 4), (14, 2), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 2), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 2), (31, 1), (32, 1), (33, 2), (34, 1), (35, 1), (36, 2), (37, 2), (38, 1), (39, 1), (40, 1), (41, 1)]]


In [12]:
from gensim.models.ldamodel import LdaModel

# Set parameters
num_topics = 5
passes = 1

# Train LDA model.
# LDA training is stochastic; a fixed random_state makes the topics printed
# below reproducible across kernel restarts.
lda_model = LdaModel(corpus,
                     num_topics=num_topics,
                     id2word=dictionary,
                     passes=passes,
                     random_state=42)

# Print topics (-1 requests every topic)
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic: {idx} \nWords: {topic}")


Topic: 0 
Words: 0.067*"they" + 0.049*"them" + 0.046*"are" + 0.045*"these" + 0.031*"and" + 0.018*"my" + 0.018*"the" + 0.015*"to" + 0.015*"for" + 0.014*"were"
Topic: 1 
Words: 0.056*"my" + 0.055*"it" + 0.052*"for" + 0.035*"and" + 0.033*"was" + 0.024*"this" + 0.023*"as" + 0.019*"loved" + 0.014*"great" + 0.014*"she"
Topic: 2 
Words: 0.064*"the" + 0.035*"it" + 0.029*"and" + 0.026*"but" + 0.024*"is" + 0.021*"was" + 0.020*"not" + 0.020*"to" + 0.017*"size" + 0.015*"like"
Topic: 3 
Words: 0.070*"and" + 0.046*"love" + 0.041*"very" + 0.038*"it" + 0.037*"great" + 0.033*"the" + 0.027*"is" + 0.025*"good" + 0.024*"this" + 0.023*"quality"
Topic: 4 
Words: 0.071*"the" + 0.032*"to" + 0.031*"it" + 0.025*"and" + 0.020*"of" + 0.016*"is" + 0.014*"this" + 0.013*"in" + 0.013*"on" + 0.012*"that"


In [13]:
# Sentiment Analysis Results
sentiment_results = df[['cleaned_text', 'sentiment_category']]

# Fake Review Detection.
# Requires `test_data`, `model` and `validation_dataset` from the BERT
# training cell to be present in the kernel.
# Take an explicit copy: adding a column to a bare `test_data[['text']]`
# selection is a chained assignment and triggers SettingWithCopyWarning.
fake_review_results = test_data[['text']].copy()
fake_review_results['predicted_label'] = model.predict(validation_dataset).logits.argmax(axis=1)

# Topic Modeling Results: topic distribution of every tokenised review
topic_results = [lda_model[dictionary.doc2bow(text)] for text in df['tokenized_text']]

print("Sentiment Analysis Results:\n", sentiment_results.head())
print("Fake Review Detection Results:\n", fake_review_results.head())
print("Topic Modeling Results:\n", topic_results[:5])


NameError: name 'test_data' is not defined

In [None]:
import joblib

# Save models
joblib.dump(sia, 'vader_sentiment_analyzer.pkl')
# The fine-tuned BERT model is written with the transformers-native
# save_pretrained / from_pretrained pair (a directory holding config and
# weights) instead of being pickled through joblib.
model.save_pretrained('bert_fake_review_detector')
lda_model.save('lda_topic_model.model')

# Load models
# NOTE: joblib.load unpickles the file; only load files produced by this notebook.
sia = joblib.load('vader_sentiment_analyzer.pkl')
model = TFBertForSequenceClassification.from_pretrained('bert_fake_review_detector')
lda_model = LdaModel.load('lda_topic_model.model')


In [None]:
# Function to categorize sentiment.
# Defined before its first use so the cell does not rely on a definition
# left behind by an earlier cell.
def categorize_sentiment(score):
    """Map a compound score to 'positive' (>= 0.05), 'negative' (<= -0.05) or 'neutral'."""
    if score >= 0.05:
        return 'positive'
    elif score <= -0.05:
        return 'negative'
    else:
        return 'neutral'

# Load the VADER sentiment analyzer
sia = joblib.load('vader_sentiment_analyzer.pkl')

# Analyze sentiment for your dataset
df['sentiment'] = df['cleaned_text'].apply(lambda x: sia.polarity_scores(x)['compound'])
df['sentiment_category'] = df['sentiment'].apply(categorize_sentiment)

print(df[['cleaned_text', 'sentiment', 'sentiment_category']].head())


In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf

# Stock 'bert-base-uncased' tokenizer plus a two-class sequence classifier.
# NOTE(review): this rebinds `model`. A model fine-tuned earlier in the
# session is replaced by the stock checkpoint, whose classifier weights are
# newly initialised (see the load warning printed by the training cell) —
# confirm that is intended before using its predictions.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

# Function to preprocess and tokenize the text for BERT
def preprocess_for_bert(text):
    """Encode one review as fixed-length BERT inputs.

    Returns the tokenizer's encoding with ``input_ids``, ``attention_mask``
    and ``token_type_ids`` as TensorFlow tensors. The sequence is explicitly
    padded and truncated to 128 tokens, so the encodings of different reviews
    all have the same length and can be concatenated along axis 0.
    """
    return tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        # `padding='max_length'` replaces the deprecated
        # `pad_to_max_length=True`; truncation is requested explicitly rather
        # than left to the tokenizer's fallback behaviour.
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors='tf'
    )

# Prepare the dataset for prediction: encode every cleaned review once
encoded_reviews = [preprocess_for_bert(text) for text in df['cleaned_text']]

# Convert lists to tensors by stacking the per-review encodings along axis 0
input_ids = tf.concat([enc['input_ids'] for enc in encoded_reviews], axis=0)
attention_masks = tf.concat([enc['attention_mask'] for enc in encoded_reviews], axis=0)
token_type_ids = tf.concat([enc['token_type_ids'] for enc in encoded_reviews], axis=0)

# Run the classifier and keep the arg-max class of each review. The
# combined-results cell selects this column as 'fake_review_pred'; without
# this step that selection raises KeyError.
# NOTE(review): predictions are only meaningful if `model` holds fine-tuned
# weights — confirm which model is bound at this point.
bert_outputs = model.predict(
    {
        'input_ids': input_ids,
        'attention_mask': attention_masks,
        'token_type_ids': token_type_ids,
    },
    batch_size=32,
)
df['fake_review_pred'] = bert_outputs.logits.argmax(axis=1)


In [None]:
from gensim.models.ldamodel import LdaModel
import joblib

# Load the pre-trained LDA model
lda_model = LdaModel.load('lda_topic_model.model')

# Reuse the vocabulary that gensim persists alongside the model (it was
# trained with id2word=dictionary). No cell in this notebook writes an
# 'lda_dictionary.pkl', so loading that file would fail.
dictionary = lda_model.id2word


In [None]:
# Split each cleaned review into tokens
df['tokenized_text'] = df['cleaned_text'].apply(gensim.utils.simple_preprocess)

# Bag-of-words representation of every tokenised review
corpus = list(map(dictionary.doc2bow, df['tokenized_text']))


In [None]:
# Topic distribution of each document, computed one bag-of-words at a time
df['topics'] = list(map(lda_model.get_document_topics, corpus))

print(df[['cleaned_text', 'topics']].head())


In [None]:
# Gather the per-review outputs of the three analyses into one frame
results = df.loc[:, ['cleaned_text', 'sentiment_category', 'fake_review_pred', 'topics']]
print(results.head())

# Persist the combined frame (without the index) for further analysis
results.to_csv(path_or_buf='nlp_analysis_results.csv', index=False)


In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification, TFTrainer, TFTrainingArguments
import tensorflow as tf

# Fresh tokenizer and two-class classifier from the stock checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Training inputs.
# NOTE(review): this reads df['label'], but no earlier cell in this notebook
# adds a 'label' column to df — confirm which frame is meant.
train_texts, train_labels = (
    df[column].tolist() for column in ('cleaned_text', 'label')
)

# Tokenise all texts at once (truncated, padded to a common length) and pair
# the encodings with their labels
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
train_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(train_encodings), train_labels)
)

# Training configuration.
# NOTE(review): TFTrainer / TFTrainingArguments are reportedly deprecated in
# recent transformers releases — verify they exist in the installed version.
training_args = TFTrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
)

# No evaluation set is supplied
trainer = TFTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=None,
)

# Fine-tune the model
trainer.train()
