In [1]:
pip install datasets pandas scikit-learn nltk vaderSentiment transformers gensim





In [2]:
import datasets
datasets.logging.set_verbosity_error()

from datasets import load_dataset

dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_Amazon_Fashion", trust_remote_code=True)
print(dataset["full"][0])


  from .autonotebook import tqdm as notebook_tqdm


{'rating': 5.0, 'title': 'Pretty locket', 'text': 'I think this locket is really pretty. The inside back is a solid silver depression and the front is a dome that is not solid (knotted). You could use it to store a small photo, lock of hair, etc but I use it when I need to carry medication with me. Closes securely. High quality & very pretty.', 'images': [], 'asin': 'B00LOPVX74', 'parent_asin': 'B00LOPVX74', 'user_id': 'AGBFYI2DDIKXC5Y4FARTYDTQBMFQ', 'timestamp': 1578528394489, 'helpful_vote': 3, 'verified_purchase': True}


In [3]:
import pandas as pd
import re

# Convert to Pandas DataFrame for easier manipulation
df = pd.DataFrame(dataset['full'])

# Drop rows with missing values
df.dropna(subset=['text', 'rating'], inplace=True)

# Remove duplicates
df.drop_duplicates(subset=['text', 'user_id'], inplace=True)

# Preprocess text (lowercasing, removing punctuation)
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text

df['cleaned_text'] = df['text'].apply(preprocess_text)

print(df[['cleaned_text', 'rating']].head())


                                        cleaned_text  rating
0  i think this locket is really pretty the insid...     5.0
1                                              great     5.0
2  one of the stones fell out within the first 2 ...     2.0
3  crappy socks money wasted bought to wear with ...     1.0
4  i love these glasses  they fit perfectly over ...     5.0


In [4]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# Download VADER lexicon
nltk.download('vader_lexicon')

# Initialize VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Analyze sentiment
df['sentiment'] = df['cleaned_text'].apply(lambda x: sia.polarity_scores(x)['compound'])

# Categorize sentiment
def categorize_sentiment(score):
    if score >= 0.05:
        return 'positive'
    elif score <= -0.05:
        return 'negative'
    else:
        return 'neutral'

df['sentiment_category'] = df['sentiment'].apply(categorize_sentiment)

print(df[['cleaned_text', 'sentiment', 'sentiment_category']].head())


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\91830\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


                                        cleaned_text  sentiment  \
0  i think this locket is really pretty the insid...     0.8532   
1                                              great     0.6249   
2  one of the stones fell out within the first 2 ...     0.0000   
3  crappy socks money wasted bought to wear with ...    -0.6908   
4  i love these glasses  they fit perfectly over ...     0.9516   

  sentiment_category  
0           positive  
1           positive  
2            neutral  
3           negative  
4           positive  


In [5]:
import pandas as pd

# Path to your local CSV file
csv_file_path = r"D:\HackOn Amazon\codes\Datasets\fake reviews dataset.csv"

# Load the CSV file into a pandas DataFrame
fake_review_df = pd.read_csv(csv_file_path)

print(fake_review_df.head())


             category  rating label  \
0  Home_and_Kitchen_5     5.0    CG   
1  Home_and_Kitchen_5     5.0    CG   
2  Home_and_Kitchen_5     5.0    CG   
3  Home_and_Kitchen_5     1.0    CG   
4  Home_and_Kitchen_5     5.0    CG   

                                               text_  
0  Love this!  Well made, sturdy, and very comfor...  
1  love it, a great upgrade from the original.  I...  
2  This pillow saved my back. I love the look and...  
3  Missing information on how to use it, but it i...  
4  Very nice set. Good quality. We have had the s...  


In [6]:
from sklearn.model_selection import train_test_split

# Prepare the dataset for BERT
fake_review_df['text'] = fake_review_df['text_'].apply(preprocess_text)
X = fake_review_df['text']
y = fake_review_df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.head(), y_train.head())


20355    just what i needed  not a whole lot just a lit...
27881    i was given an arc in exchange for an honest r...
21349    the ingredients list makes me happy the only p...
29639    really interesting look at pinkers relationshi...
18760    we were upgrading the plumbing fixtures in one...
Name: text, dtype: object 20355    CG
27881    CG
21349    CG
29639    CG
18760    OR
Name: label, dtype: object


In [7]:
pip install tensorflow




In [8]:
pip install tqdm





In [9]:
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification, InputExample, InputFeatures

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Function to convert data to InputExamples
def convert_data_to_examples(train, test):
    train_input_examples = train.apply(lambda x: InputExample(guid=None, 
                                                              text_a=x['text'], 
                                                              text_b=None, 
                                                              label=x['label']), axis=1)

    validation_input_examples = test.apply(lambda x: InputExample(guid=None, 
                                                                  text_a=x['text'], 
                                                                  text_b=None, 
                                                                  label=x['label']), axis=1)

    return train_input_examples, validation_input_examples

# Assuming X_train, y_train, X_test, y_test are defined elsewhere in your code
# Convert labels from CG and OR to 0 and 1
label_map = {'CG': 0, 'OR': 1}
y_train = y_train.map(label_map)
y_test = y_test.map(label_map)

train_data = pd.DataFrame({'text': X_train, 'label': y_train})
test_data = pd.DataFrame({'text': X_test, 'label': y_test})

train_InputExamples, validation_InputExamples = convert_data_to_examples(train_data, test_data)

# Function to convert examples to TensorFlow dataset
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    features = [] 
    for e in examples:
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        input_ids, token_type_ids = input_dict["input_ids"], input_dict["token_type_ids"]
        attention_mask = input_dict["attention_mask"]

        features.append(
            InputFeatures(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=e.label)
        )

    def gen():
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({
            "input_ids": tf.int32,
            "attention_mask": tf.int32,
            "token_type_ids": tf.int32,
        },
        tf.int64),
        ({
            "input_ids": tf.TensorShape([None]),
            "attention_mask": tf.TensorShape([None]),
            "token_type_ids": tf.TensorShape([None]),
        },
        tf.TensorShape([])),
    )

# Convert InputExamples to TensorFlow Datasets
train_dataset = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)

validation_dataset = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_dataset = validation_dataset.batch(32)

# Compile the model using the legacy Adam optimizer
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=3e-5, epsilon=1e-08, decay=0.01, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Use a custom training loop without tqdm
epochs = 2

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    # Train
    for step, (batch_inputs, batch_labels) in enumerate(train_dataset):
        with tf.GradientTape() as tape:
            logits = model(batch_inputs, training=True)[0]
            loss_value = loss(batch_labels, logits)
        grads = tape.gradient(loss_value, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
    
    # Validation
    for batch_inputs, batch_labels in validation_dataset:
        logits = model(batch_inputs, training=False)[0]
        loss_value = loss(batch_labels, logits)
        metric.update_state(batch_labels, logits)

    print(f"Validation Accuracy: {metric.result().numpy()}")
    metric.reset_states()






All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch 1/2


ResourceExhaustedError: Exception encountered when calling layer 'intermediate' (type TFBertIntermediate).

{{function_node __wrapped__AddV2_device_/job:localhost/replica:0/task:0/device:CPU:0}} OOM when allocating tensor with shape[32,128,3072] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator mklcpu [Op:AddV2] name: 

Call arguments received by layer 'intermediate' (type TFBertIntermediate):
  • hidden_states=tf.Tensor(shape=(32, 128, 768), dtype=float32)

In [None]:
import gensim
from gensim import corpora

# Preprocess text for LDA
df['tokenized_text'] = df['cleaned_text'].apply(gensim.utils.simple_preprocess)

# Create a dictionary and corpus
dictionary = corpora.Dictionary(df['tokenized_text'])
corpus = [dictionary.doc2bow(text) for text in df['tokenized_text']]

print(corpus[:1])


In [None]:
from gensim.models.ldamodel import LdaModel

# Set parameters
num_topics = 5
passes = 10

# Train LDA model
lda_model = LdaModel(corpus, 
                     num_topics=num_topics, 
                     id2word=dictionary, 
                     passes=passes)

# Print topics
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic: {idx} \nWords: {topic}")


In [None]:
# Sentiment Analysis Results
sentiment_results = df[['cleaned_text', 'sentiment_category']]

# Fake Review Detection
fake_review_results = test_data[['text']]
fake_review_results['predicted_label'] = model.predict(validation_dataset).logits.argmax(axis=1)

# Topic Modeling Results
topic_results = [lda_model[dictionary.doc2bow(text)] for text in df['tokenized_text']]

print("Sentiment Analysis Results:\n", sentiment_results.head())
print("Fake Review Detection Results:\n", fake_review_results.head())
print("Topic Modeling Results:\n", topic_results[:5])


In [None]:
import joblib

# Save models
joblib.dump(sia, 'vader_sentiment_analyzer.pkl')
joblib.dump(model, 'bert_fake_review_detector.pkl')
lda_model.save('lda_topic_model.model')

# Load models
sia = joblib.load('vader_sentiment_analyzer.pkl')
model = joblib.load('bert_fake_review_detector.pkl')
lda_model = LdaModel.load('lda_topic_model.model')


In [None]:
# Load the VADER sentiment analyzer
sia = joblib.load('vader_sentiment_analyzer.pkl')

# Analyze sentiment for your dataset
df['sentiment'] = df['cleaned_text'].apply(lambda x: sia.polarity_scores(x)['compound'])
df['sentiment_category'] = df['sentiment'].apply(categorize_sentiment)

# Function to categorize sentiment
def categorize_sentiment(score):
    if score >= 0.05:
        return 'positive'
    elif score <= -0.05:
        return 'negative'
    else:
        return 'neutral'

print(df[['cleaned_text', 'sentiment', 'sentiment_category']].head())


In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf

# Load the pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Function to preprocess and tokenize the text for BERT
def preprocess_for_bert(text):
    return tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors='tf'
    )

# Prepare the dataset for prediction
input_ids = []
attention_masks = []
token_type_ids = []

for text in df['cleaned_text']:
    encoded_dict = preprocess_for_bert(text)
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])
    token_type_ids.append(encoded_dict['token_type_ids'])

# Convert lists to tensors
input_ids = tf.concat(input_ids, axis=0)
attention_masks = tf.concat(attention_masks, axis=0)
token_type_ids = tf.concat(token_type_ids, axis=0)


In [None]:
from gensim.models.ldamodel import LdaModel
import joblib

# Load the pre-trained LDA model and dictionary
lda_model = LdaModel.load('lda_topic_model.model')
dictionary = joblib.load('lda_dictionary.pkl')


In [None]:
# Tokenize the text
df['tokenized_text'] = df['cleaned_text'].apply(gensim.utils.simple_preprocess)

# Create the corpus
corpus = [dictionary.doc2bow(text) for text in df['tokenized_text']]


In [None]:
# Infer topics for each document
df['topics'] = [lda_model.get_document_topics(bow) for bow in corpus]

print(df[['cleaned_text', 'topics']].head())


In [None]:
# Display combined results
results = df[['cleaned_text', 'sentiment_category', 'fake_review_pred', 'topics']]
print(results.head())

# Save the results to a CSV file for further analysis
results.to_csv('nlp_analysis_results.csv', index=False)


In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification, TFTrainer, TFTrainingArguments
import tensorflow as tf

# Load the pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Prepare data (assuming df contains 'text' and 'label' columns)
train_texts = df['cleaned_text'].tolist()
train_labels = df['label'].tolist()

train_encodings = tokenizer(train_texts, truncation=True, padding=True)

train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
))

# Define training arguments
training_args = TFTrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = TFTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=None
)

# Fine-tune the model
trainer.train()
