In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import joblib


In [4]:
# Load the labelled review CSV. header=None means the file's own header line is
# read in as the first data row; descriptive column names are assigned afterwards.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids the mixed-type DtypeWarning that the
# header-as-data row otherwise triggers.
file_path = 'amazon_reviews_labelled.csv'
dataset = pd.read_csv(file_path, header=None, low_memory=False)

  dataset = pd.read_csv(file_path, header=None)


In [5]:
# The CSV was read with header=None, so the frame carries positional integer
# labels. Swap them for descriptive names, listed in file column order.
column_names = [
    # identifiers / leftover index columns
    'Index', 'Unnamed0_3', 'Unnamed0_2', 'Unnamed0_1', 'Unnamed0',
    # raw review fields
    'Rating', 'VerifiedPurchase', 'ReviewTitle', 'ReviewText',
    # part-of-speech counts
    'NumNouns', 'NumVerbs', 'NumAdjectives', 'NumAdverbs', 'NumParticles',
    'NumPronouns', 'NumConjunctions', 'NumInterjections', 'NumDeterminers',
    'NumPrepositions', 'NumAuxVerbs', 'NumProperNouns', 'NumModals',
    'NumPossessivePronouns', 'NumQuantifiers', 'NumForeignWords', 'NumOtherWords',
    # surface statistics and preprocessed text
    'CapitalCharCount', 'PunctuationCount', 'PreprocessedReviewText', 'WordCount',
    # title sentiment
    'SentimentScoreTitle', 'SentimentLabelTitle',
    # rating aggregates
    'AvgRatingVerified', 'AvgRatingNonVerified',
    'DeviationVerified', 'DeviationNonVerified',
]
dataset.columns = column_names

In [6]:
# Build the modelling frame: the preprocessed review text plus its integer label.
review_text_column = 'PreprocessedReviewText'
label_column = 'SentimentLabelTitle'
dataset_filtered = (
    dataset
    # Row 0 holds the CSV's original header line (read in as data). Remove it
    # explicitly *before* dropna so the header is what gets dropped even if
    # dropna would otherwise have shifted the row positions.
    .iloc[1:]
    .loc[:, [review_text_column, label_column]]
    # Discard reviews that are missing either the text or the label.
    .dropna()
    .reset_index(drop=True)
    # Labels may arrive as strings/objects; cast them to integers.
    .astype({label_column: int})
)

In [7]:
# Separate the model input (review text) from the target (label)
X = dataset_filtered.loc[:, review_text_column]
y = dataset_filtered.loc[:, label_column]


In [8]:
# Hold out 20% of the samples for testing; random_state=42 keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [9]:
# Turn the review text into TF-IDF features, keeping at most 5000 terms.
# The vectorizer is fitted on the training text only and then applied,
# unchanged, to the test text.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf, X_test_tfidf = (
    tfidf_vectorizer.fit_transform(X_train),
    tfidf_vectorizer.transform(X_test),
)

In [10]:
# Candidate classifiers, keyed by display name; both use a fixed seed
models = {}
models['Random Forest'] = RandomForestClassifier(random_state=12)
models['SVM'] = SVC(random_state=12)


In [12]:
# Search space per candidate, keyed by the same names used in `models`
param_grid = {
    'Random Forest': dict(
        n_estimators=[50, 100, 200],
        max_depth=[None, 10, 20],
    ),
    'SVM': dict(
        C=[0.1, 1, 10],
        kernel=['linear', 'rbf'],
    ),
}

# Filled in by the loop below: best refitted estimator and its mean CV accuracy
best_models, best_scores = {}, {}

for model_name, model in models.items():
    # 5-fold cross-validated grid search scored on accuracy, run on all cores
    grid = GridSearchCV(
        model,
        param_grid[model_name],
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    ).fit(X_train_tfidf, y_train)
    best_models[model_name] = grid.best_estimator_
    best_scores[model_name] = grid.best_score_


In [13]:
# Choose the candidate with the highest cross-validation score, then predict
# labels for the held-out test features with it
best_model_name = max(best_scores.items(), key=lambda item: item[1])[0]
best_model = best_models[best_model_name]
y_pred = best_model.predict(X_test_tfidf)

In [14]:
# Test-set metrics for the selected model: overall accuracy and the
# per-class precision / recall / F1 text report
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
classification_rep = classification_report(y_true=y_test, y_pred=y_pred)

In [15]:
# Report the metrics already stored in `accuracy` and `classification_rep`
# instead of recomputing them, so the printed values always match the variables
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

Accuracy: 0.6916666666666667
Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.37      0.48      1614
           1       0.69      0.89      0.78      2586

    accuracy                           0.69      4200
   macro avg       0.69      0.63      0.63      4200
weighted avg       0.69      0.69      0.67      4200



In [18]:
# Persist the selected estimator and the fitted TF-IDF vectorizer; both are
# needed together to score new text.
# NOTE(review): the model file is named "fake_review_detector", but the target
# used for training is 'SentimentLabelTitle' - confirm the intended label.
joblib.dump(value=best_model, filename='fake_review_detector_model.pkl')
joblib.dump(value=tfidf_vectorizer, filename='tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [19]:
# Preview the first five rows of the renamed frame (row 0 is the CSV's original header line)
dataset.head(n=5)

Unnamed: 0,Index,Unnamed0_3,Unnamed0_2,Unnamed0_1,Unnamed0,Rating,VerifiedPurchase,ReviewTitle,ReviewText,NumNouns,...,CapitalCharCount,PunctuationCount,PreprocessedReviewText,WordCount,SentimentScoreTitle,SentimentLabelTitle,AvgRatingVerified,AvgRatingNonVerified,DeviationVerified,DeviationNonVerified
0,,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,RATING,VERIFIED_PURCHASE,REVIEW_TITLE,REVIEW_TEXT,NUM_NOUNS,...,CAPITAL_CHAR_COUNT,PUNCTUATION_COUNT,PREPROCESSED_REVIEW_TEXT,WORD_COUNT,SENTIMENT_SCORE_TITLE,SENTIMENT_LABEL_TITLE,AVG_RATING_VERIFIED,AVG_RATING_NON_VERIFIED,DEVIATION_VERIFIED,DEVIATION_NON_VERIFIED
1,0.0,0,0,0,0,4,0,useful,think so product save day case need someth,4,...,0,0,think product save day case need someth,8,0.4404,1,4.243802359377671,3.9822618791657707,0.24380235937767125,0.01773812083422932
2,1.0,1,1,1,1,4,1,New era for batteries,lithium batteri new introduc market averag dev...,15,...,0,0,lithium batteri new introduc market averag dev...,37,0.0,0,4.243802359377671,3.9822618791657707,0.24380235937767125,0.01773812083422932
3,2.0,2,2,2,2,3,0,doesn't swing very well.,purchas swing babi 6 month pretti grow -pron- ...,7,...,0,2,purcha swing babi 6 month pretti grow -pron- l...,24,-0.2572,0,4.243802359377671,3.9822618791657707,1.2438023593776713,0.9822618791657707
4,3.0,3,3,3,3,4,0,Great computing!,look inexpen desk calcolatur be work ne issu t...,8,...,0,0,look inexpen desk calcolatur work ne issu tilt...,17,0.6588,1,4.243802359377671,3.9822618791657707,0.24380235937767125,0.01773812083422932
