# Detection of Fraud Reviews: Modeling and Evaluation

In [1]:
import re
from functools import lru_cache

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split

## Reading the Fake Reviews dataset

In [2]:
# Location of the fake-reviews CSV.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
fake_reviews_path = '/Users/ShanShan/Fake-Reviews-Detection/Dataset/fake reviews dataset.csv'

# Load the CSV into a DataFrame
fake_reviews_df = pd.read_csv(filepath_or_buffer=fake_reviews_path)

In [3]:
# (rows, columns) of the fake-reviews frame
fake_reviews_df.shape

(40432, 4)

In [4]:
# Column names of the fake-reviews frame
fake_reviews_df.columns

Index(['category', 'rating', 'label', 'text_'], dtype='object')

In [5]:
# Preview the first few rows to sanity-check how the CSV columns were parsed
fake_reviews_df.head()

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...


## Reading the Yelp dataset

In [6]:
# Location of the labelled Yelp workbook.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
yelp_path = '/Users/ShanShan/Fake-Reviews-Detection/Dataset/Yelp Labelled Review Dataset with Sentiments and Features.xlsx'

# Load the workbook into a DataFrame using the openpyxl reader
yelp_df = pd.read_excel(io=yelp_path, engine='openpyxl')

In [7]:
# (rows, columns) of the Yelp frame
yelp_df.shape

(355210, 8)

In [8]:
# Column names of the Yelp frame
yelp_df.columns

Index(['User_id', 'Product_id', 'Rating', 'Date', 'Review',
       'Spam(1) and Not Spam(0)', 'Sentiment', 'Features'],
      dtype='object')

In [9]:
# Preview the first few rows to sanity-check how the workbook columns were parsed
yelp_df.head()

Unnamed: 0,User_id,Product_id,Rating,Date,Review,Spam(1) and Not Spam(0),Sentiment,Features
0,923,0,3,2014-01-30,The food at snack is a selection of popular Gr...,1,Positive,"['appetizer tray', 'greek salad', 'main courses']"
1,924,0,3,2011-05-05,This little place in Soho is wonderful. I had ...,1,Positive,"['little place', 'soho', 'lamb sandwich', 'soh..."
2,925,0,4,2011-12-30,ordered lunch for 15 from Snack last Friday. Ã...,1,Positive,"['snack', 'regular company lunch list']"
3,926,0,4,2012-10-04,This is a beautiful quaint little restaurant o...,1,Positive,"['beautiful quaint', 'pretty street', 'great p..."
4,927,0,4,2014-02-06,Snack is great place for a Ã‚Â casual sit down...,1,Positive,"['snack', 'great place', 'Ã¢ casual', 'cold wi..."


In [None]:
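# Column reference for the two datasets (copied from the outputs above).
# Fake Reviews dataset:
#   Index(['category', 'rating', 'label', 'text_'], dtype='object')
#
# Yelp dataset:
#   Index(['User_id', 'Product_id', 'Rating', 'Date', 'Review',
#          'Spam(1) and Not Spam(0)', 'Sentiment', 'Features'],
#         dtype='object')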

In [None]:
# Text Cleaning: strip non-word characters and digits, lowercase the text, remove stopwords, and lemmatize the remaining words.

In [None]:
# Define text cleaning function

# Patterns are compiled once at definition time instead of on every call.
_NON_WORD_RE = re.compile(r'\W')
_DIGITS_RE = re.compile(r'\d+')


@lru_cache(maxsize=1)
def _cleaning_resources():
    """Return the English stopword set and a lemmatizer, built once and cached."""
    return frozenset(stopwords.words('english')), WordNetLemmatizer()


def clean_text(text):
    """Normalize one review into a space-separated string of lemmatized tokens.

    Steps, in order: replace every non-word character with a space, lowercase,
    delete digit runs, drop English stopwords, and lemmatize what is left.

    Parameters
    ----------
    text : str or any scalar
        Raw review text. ``None``/NaN (missing cells) yield an empty string;
        any other non-string scalar is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' when nothing remains).
    """
    if not isinstance(text, str):
        # Missing cells reach .apply() as None/NaN; treat them as empty reviews
        # instead of letting re.sub raise a TypeError.
        if text is None or pd.isna(text):
            return ''
        text = str(text)

    text = _NON_WORD_RE.sub(' ', text).lower()
    text = _DIGITS_RE.sub('', text)

    # Loading the stopword list and lemmatizer is expensive; reuse them across calls.
    stop_words, lemmatizer = _cleaning_resources()

    # str.split() already collapses whitespace runs, so no separate \s+ pass is needed.
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )

# The frames were loaded as `fake_reviews_df` / `yelp_df`; work on copies under the
# names used from here on, so the raw frames stay untouched and this cell can be
# re-run safely.
df_fake_reviews = fake_reviews_df.copy()
df_yelp_reviews = yelp_df.copy()

# Clean the review text of each dataset into a new column
df_fake_reviews['cleaned_text'] = df_fake_reviews['text_'].apply(clean_text)
df_yelp_reviews['cleaned_review'] = df_yelp_reviews['Review'].apply(clean_text)


In [None]:
# Feature Engineering: create additional features such as review length and the number of exclamation marks.

In [None]:
# Review length: number of tokens left after cleaning
df_fake_reviews['review_length'] = df_fake_reviews['cleaned_text'].str.split().str.len()
df_yelp_reviews['review_length'] = df_yelp_reviews['cleaned_review'].str.split().str.len()

# Number of exclamation marks in the raw (uncleaned) text.
# The vectorised .str accessor skips missing reviews instead of raising on NaN;
# those rows are counted as 0.
df_fake_reviews['exclamation_count'] = (
    df_fake_reviews['text_'].str.count('!').fillna(0).astype(int)
)
df_yelp_reviews['exclamation_count'] = (
    df_yelp_reviews['Review'].str.count('!').fillna(0).astype(int)
)

# Plot the review-length distribution of each dataset as its own figure
for frame, title in (
    (df_fake_reviews, "Review Length Distribution in Fake Reviews"),
    (df_yelp_reviews, "Review Length Distribution in Yelp Reviews"),
):
    ax = sns.histplot(frame['review_length'], bins=50)
    ax.set_title(title)
    plt.show()


In [None]:
# The Yelp dataset already contains binary spam labels in the
# 'Spam(1) and Not Spam(0)' column, while the Fake Reviews dataset uses the
# string 'CG' in its 'label' column.

# Fake Reviews labeling:
# Convert the 'CG' label into a binary label (CG -> 1 for fake, anything else -> 0).
# Also accepting an existing 1 keeps this cell idempotent: re-running it on an
# already-converted column would otherwise turn every label into 0.
df_fake_reviews['label'] = df_fake_reviews['label'].isin(['CG', 1]).astype('int64')


In [None]:
# Prepare Data for Modeling:
#
# Use TF-IDF to vectorize the cleaned review text.

from sklearn.feature_extraction.text import TfidfVectorizer

# One vectorizer per dataset. Re-fitting a single vectorizer on the Yelp text would
# overwrite the vocabulary learned from the fake reviews, so `vectorizer` would no
# longer line up with the columns of X_fake (or with model_fake's coefficients).
vectorizer = TfidfVectorizer(max_features=5000)        # fitted on the fake reviews
vectorizer_yelp = TfidfVectorizer(max_features=5000)   # fitted on the Yelp reviews

# Keep the sparse matrices returned by fit_transform: a dense copy of the Yelp
# features would need up to 355210 x 5000 floats (~14 GB), and train_test_split /
# LogisticRegression accept sparse input directly.
X_fake = vectorizer.fit_transform(df_fake_reviews['cleaned_text'])
X_yelp = vectorizer_yelp.fit_transform(df_yelp_reviews['cleaned_review'])

# NOTE(review): the vocabulary and IDF weights are fitted on the full dataset before
# the train/test split, so test rows influence the features; consider fitting on the
# training rows only.

# Define the target labels
y_fake = df_fake_reviews['label']
y_yelp = df_yelp_reviews['Spam(1) and Not Spam(0)']


In [None]:
# Train-Test Split:
# Hold out 20% of each dataset for testing; the fixed seed makes the split reproducible.
X_train_fake, X_test_fake, y_train_fake, y_test_fake = train_test_split(
    X_fake, y_fake, test_size=0.2, random_state=42
)
X_train_yelp, X_test_yelp, y_train_yelp, y_test_yelp = train_test_split(
    X_yelp, y_yelp, test_size=0.2, random_state=42
)

# Train a Logistic Regression Model:
# One independent classifier per dataset. max_iter is raised above the default of
# 100 so the solver is not cut off before it converges on the TF-IDF features.
model_fake = LogisticRegression(max_iter=1000)
model_fake.fit(X_train_fake, y_train_fake)

model_yelp = LogisticRegression(max_iter=1000)
model_yelp.fit(X_train_yelp, y_train_yelp)


In [None]:
# Step 5: Model Evaluation and Tuning
# Evaluate Model Performance:

def report_metrics(dataset_name, model, X_test, y_test):
    """Print accuracy, precision, recall, F1 and ROC AUC for a fitted classifier.

    Parameters
    ----------
    dataset_name : str
        Label inserted into each printed line, e.g. "Fake Reviews".
    model : fitted binary classifier exposing ``predict`` and ``predict_proba``
    X_test : feature matrix of the held-out rows
    y_test : true binary labels of the held-out rows

    Returns
    -------
    The hard class predictions for ``X_test`` (reused for the confusion matrix).
    """
    y_pred = model.predict(X_test)
    # ROC AUC is a ranking metric: score it with the positive-class probability.
    # Feeding it hard 0/1 predictions collapses the curve to a single threshold.
    y_score = model.predict_proba(X_test)[:, 1]

    print(f"Accuracy ({dataset_name}):", accuracy_score(y_test, y_pred))
    print(f"Precision ({dataset_name}):", precision_score(y_test, y_pred))
    print(f"Recall ({dataset_name}):", recall_score(y_test, y_pred))
    print(f"F1 Score ({dataset_name}):", f1_score(y_test, y_pred))
    print(f"ROC AUC ({dataset_name}):", roc_auc_score(y_test, y_score))
    return y_pred


# Predictions and metrics for Fake Reviews
y_pred_fake = report_metrics("Fake Reviews", model_fake, X_test_fake, y_test_fake)

# Predictions and metrics for Yelp Reviews
y_pred_yelp = report_metrics("Yelp Reviews", model_yelp, X_test_yelp, y_test_yelp)

# Hyperparameter Tuning: Use grid search for hyperparameter tuning:


from sklearn.model_selection import GridSearchCV

# Search the inverse regularization strength C with 5-fold cross-validation on the
# fake-reviews training split. max_iter is raised above the default of 100 so the
# weakly regularized candidates (large C) are not cut off before converging.
param_grid = {'C': [0.1, 1, 10, 100], 'penalty': ['l2']}
grid_search = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5)
grid_search.fit(X_train_fake, y_train_fake)
print("Best Parameters:", grid_search.best_params_)



In [None]:
# Step 6: Reporting and Visualization
# Confusion Matrix:

from sklearn.metrics import confusion_matrix
import seaborn as sns

# Confusion matrix for the fake-reviews model on its held-out split
cm = confusion_matrix(y_test_fake, y_pred_fake)
ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
# confusion_matrix puts true labels on rows and predictions on columns;
# label the axes so the figure can be read on its own.
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title('Confusion Matrix for Fake Reviews')
plt.show()


In [None]:
# Visualizing Feature Importance:

# Use the magnitude of each logistic-regression coefficient as its importance
importance = np.abs(model_fake.coef_[0])
# NOTE(review): `vectorizer` must be the one fitted on the fake-reviews text; if it
# was last fitted on another corpus, these names will not line up with
# model_fake.coef_.
feature_names = vectorizer.get_feature_names_out()

# Sort feature importance and plot
