In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

# Ensure nltk packages are downloaded
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to C:\Users\Hammad
[nltk_data]     Aslam/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [10]:
df = pd.read_csv('fake_reviews_dataset.csv')
df.head(2)

Unnamed: 0,category,rating,text,label
0,Home_and_Kitchen,5.0,"Love this! Well made, sturdy, and very comfor...",1
1,Home_and_Kitchen,5.0,"love it, a great upgrade from the original. I...",1


In [11]:
# Initialize sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Step 1: Text Preprocessing and Feature Engineering

# Apply TF-IDF Vectorizer on the 'text' column
tfidf = TfidfVectorizer(max_features=500)
tfidf_matrix = tfidf.fit_transform(df['text']).toarray()

In [12]:
# Calculate sentiment score for each review
df["sentiment_score"] = df["text"].apply(
    lambda text: sia.polarity_scores(text)["compound"]
)

# Encode 'category' as a numeric label
le_category = LabelEncoder()
df["category_encoded"] = le_category.fit_transform(df["category"])

# Optional: Scale 'rating' and 'sentiment_score' features for model consistency
scaler = StandardScaler()
df[["rating_scaled", "sentiment_score_scaled"]] = scaler.fit_transform(
    df[["rating", "sentiment_score"]]
)

# Combine processed text features (TF-IDF) with other features
features = pd.concat(
    [
        df[["category_encoded", "rating_scaled", "sentiment_score_scaled"]],
        pd.DataFrame(tfidf_matrix),
    ],
    axis=1,
)

features.columns = features.columns.astype(str)


In [18]:
features.head()

Unnamed: 0,category_encoded,rating_scaled,sentiment_score_scaled,0,1,2,3,4,5,6,...,490,491,492,493,494,495,496,497,498,499
0,9,0.650704,0.747276,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,9,0.650704,0.602012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.377405,0.0,0.0,0.0,0.0
2,9,0.650704,0.388478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9,-2.844964,0.294259,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9,0.650704,0.280222,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
X_train, X_test, y_train, y_test = train_test_split(features, df['label'], test_size=0.2, random_state=42)


# Logistic Regression

In [30]:
lr = LogisticRegression(random_state=42)
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)
print(accuracy_score(y_test, y_pred)*100)
print(classification_report(y_test, y_pred))

86.71354552183568
              precision    recall  f1-score   support

           0       0.87      0.86      0.87      4031
           1       0.87      0.87      0.87      4075

    accuracy                           0.87      8106
   macro avg       0.87      0.87      0.87      8106
weighted avg       0.87      0.87      0.87      8106



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# decision tree classifier

In [31]:
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train, y_train)

y_pred = dtc.predict(X_test)
print(accuracy_score(y_test, y_pred)*100)
print(classification_report(y_test, y_pred))

73.99457192203306
              precision    recall  f1-score   support

           0       0.74      0.73      0.74      4031
           1       0.74      0.75      0.74      4075

    accuracy                           0.74      8106
   macro avg       0.74      0.74      0.74      8106
weighted avg       0.74      0.74      0.74      8106



# random forest

In [32]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred)*100)
print(classification_report(y_test, y_pred))

87.29336294103133
              precision    recall  f1-score   support

           0       0.86      0.89      0.87      4031
           1       0.88      0.86      0.87      4075

    accuracy                           0.87      8106
   macro avg       0.87      0.87      0.87      8106
weighted avg       0.87      0.87      0.87      8106



# xgboost

In [33]:
hgbc =HistGradientBoostingClassifier(max_iter=100)
hgbc.fit(X_train, y_train)

y_pred = hgbc.predict(X_test)
print(accuracy_score(y_test, y_pred)*100)
print(classification_report(y_test, y_pred))

88.05822847273625
              precision    recall  f1-score   support

           0       0.87      0.89      0.88      4031
           1       0.89      0.87      0.88      4075

    accuracy                           0.88      8106
   macro avg       0.88      0.88      0.88      8106
weighted avg       0.88      0.88      0.88      8106



# naive bayes

In [34]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)

y_pred = gnb.predict(X_test)
print(accuracy_score(y_test, y_pred)*100)
print(classification_report(y_test, y_pred))

81.0387367382186
              precision    recall  f1-score   support

           0       0.81      0.80      0.81      4031
           1       0.81      0.82      0.81      4075

    accuracy                           0.81      8106
   macro avg       0.81      0.81      0.81      8106
weighted avg       0.81      0.81      0.81      8106

