In [1]:
import pandas as pd
df = pd.read_csv("Processed_df.csv")

  df = pd.read_csv("Processed_df.csv")


feature engineering: polarity and subjectivity

Polarity: numerical score that quantifies the sentiment of the text on a continuous scale. It measures how positive or negative the text is. Polarity scores typically range from -1 (extremely negative) to 1 (extremely positive), with 0 indicating neutral sentiment.

Subjectivity: Subjectivity measures the degree to which the text is subjective or opinion-based rather than objective. Subjectivity is also represented as a numerical score ranging from 0 to 1. A score closer to 0 suggests that the text is more objective, factual, or informational. A score closer to 1 suggests that the text is more subjective and opinion-based.

In [3]:
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from textblob import TextBlob
import string

# Tokenization
tokenizer = WordPunctTokenizer()
df['tokens'] = df['reviewContent'].apply(tokenizer.tokenize)

# Define a function to remove stopwords and punctuation
def preprocess_text(tokens):
    stop_words = set(stopwords.words('english'))
    tokens = [word.lower() for word in tokens]  # Convert to lowercase
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    tokens = [word for word in tokens if word not in string.punctuation]  # Remove punctuation
    tokens = [word for word in tokens if len(word) > 1]  # Remove single-character words
    return tokens

# Apply preprocessing to the 'tokens' column
df['clean_tokens'] = df['tokens'].apply(preprocess_text)

# Join the clean tokens back into sentences
df['clean_text'] = df['clean_tokens'].apply(lambda x: ' '.join(x))

# Sentiment Analysis using TextBlob
df['sentiment'] = df['clean_text'].apply(lambda x: TextBlob(x).sentiment)

# Extract polarity and subjectivity scores from the sentiment analysis
df['polarity'] = df['sentiment'].apply(lambda x: x.polarity)
df['subjectivity'] = df['sentiment'].apply(lambda x: x.subjectivity)

N-gram analysis (common phrases)

In [4]:
from nltk.util import ngrams
from collections import Counter
# Define the n-gram range (e.g., 2 for bigrams)
ngram_range = 2  # For bigrams (adjust as needed)

# Function to extract n-grams from a list of tokens
def extract_ngrams(tokens, n):
    return list(ngrams(tokens, n))

# Extract n-grams and store them in a new column
df['ngrams'] = df['clean_tokens'].apply(lambda x: extract_ngrams(x, ngram_range))

# Flatten the list of n-grams
all_ngrams = [ngram for ngram_list in df['ngrams'] for ngram in ngram_list]

# Count the frequency of each n-gram
ngram_freq = Counter(all_ngrams)

# Print the most common n-grams and their frequencies
most_common_ngrams = ngram_freq.most_common(10)
for ngram, freq in most_common_ngrams:
    print(f"N-gram: {' '.join(ngram)} - Frequency: {freq}")

N-gram: go back - Frequency: 2121
N-gram: really good - Frequency: 1303
N-gram: first time - Frequency: 1154
N-gram: food good - Frequency: 1143
N-gram: great food - Frequency: 1094
N-gram: deep dish - Frequency: 1061
N-gram: pretty good - Frequency: 1013
N-gram: food great - Frequency: 980
N-gram: great place - Frequency: 941
N-gram: love place - Frequency: 916


## BERT Model

In [10]:
!pip3 install torch



You should consider upgrading via the 'C:\Users\fongj\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip' command.


In [16]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup, AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset, random_split
import re 

ModuleNotFoundError: No module named 'transformers.utils'

In [None]:
!pip3 install torch torchvision torchaudio

### Instantiate Model 

In [None]:
tokenizer = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
model = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

In [None]:
def sentiment_score(review):
    tokens = tokenizer.encode(review, return_tensors='pt', max_length=512, truncation=True)
    result = model(tokens)
    return int(torch.argmax(result.logits))+1

In [None]:
df["bert_sentiment"] = df["reviewContent"].apply(lambda x: sentiment_score(x[:512]))

In [None]:
df["bert_sentiment"].value_counts()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Assuming you have a DataFrame 'df' with columns 'bert_sentiment' and 'class'

# Create separate histograms for fraud and non-fraud classes
plt.figure(figsize=(10, 6))

# Histogram for the "bert_sentiment" of the fraud class
sns.histplot(data=df[df['flagged'] == "Y"], x='bert_sentiment', kde=True, color='red', label='Fraud')

# Histogram for the "bert_sentiment" of the non-fraud class
sns.histplot(data=df[df['flagged'] == 'N'], x='bert_sentiment', kde=True, color='blue', label='Non-Fraud')

plt.title('Distribution of BERT Sentiment Scores')
plt.xlabel('BERT Sentiment')
plt.ylabel('Frequency')
plt.legend()

# Set the x-axis limits to be between 1 and 5
plt.xlim(1, 5)

plt.show()

In [None]:
fig, ax = plt.subplots()
sns.kdeplot(df[df["flagged"]=="N"]["bert_sentiment"], shade=True, color="green", label="Genuine", ax=ax)

sns.kdeplot(df[df["flagged"]=="Y"]["bert_sentiment"], shade=True, color="blue", label="Fraudulent", ax=ax)
ax.set_xlabel("Sentiment Score ")
ax.set_ylabel("Density")
ax.legend()
fig.suptitle("Sentiment Value vs Fraud Class")

In [None]:
[-]
df_bert = pd.DataFrame({
    'id': range(len(df)),
    'label': df['flagged'],
    'alpha': ['a']*df.shape[0],
    'text': df['reviewContent'].replace(r'\n', ' ', regex=True)
})
# Splitting training data file into *train* and *dev*
df_bert_train, df_bert_dev = train_test_split(df_bert, test_size=0.01)

df_bert_train.head()

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, Trainer, TrainingArguments

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the data
train_encodings = tokenizer(df_bert_train['text'].tolist(), truncation=True, padding=True, max_length=128)
dev_encodings = tokenizer(df_bert_dev['text'].tolist(), truncation=True, padding=True, max_length=128)

# Create a Dataset class
import torch
from torch.utils.data import Dataset

class ReviewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = ReviewsDataset(train_encodings, df_bert_train['label'].tolist())
dev_dataset = ReviewsDataset(dev_encodings, df_bert_dev['label'].tolist())

# Model and training setup
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

training_args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    evaluation_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    do_train=True,
    do_eval=True,
    output_dir='./results',
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
)

# Fine-tune the model
trainer.train()