<a href="https://colab.research.google.com/github/hemanthReddy365/fake_review_prediction/blob/main/fake_review_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install nltk

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


# Fetch the NLTK data packages for this notebook. Tokenisation, stop-word
# removal and lemmatisation in preprocess_text need the tokenizer models,
# the stop-word corpus and WordNet respectively.
# NOTE(review): 'averaged_perceptron_tagger' does not appear to be used by the
# visible code -- confirm before dropping it.
for nltk_resource in (
    'punkt',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'punkt_tab',
):
    nltk.download(nltk_resource)

# Load the labelled reviews; later code reads the 'text' and 'AI_generated'
# columns from this frame.
data = pd.read_csv('/content/ai generated vs human written reviews dataset.csv')

# Compiled once: matches every character that is neither an ASCII letter nor
# whitespace (digits, punctuation, accented letters, ...).
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')

# Lazily-filled cache so the stop-word list and lemmatizer are built once per
# session instead of once per review.
_NLP_RESOURCES = {}


def _get_nlp_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, creating it on first use."""
    if not _NLP_RESOURCES:
        _NLP_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _NLP_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _NLP_RESOURCES['stop_words'], _NLP_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise a raw review into a space-separated string of lemmatised tokens.

    Steps: lowercase, drop every character that is not an ASCII letter or
    whitespace, tokenise with ``word_tokenize``, remove English stop words,
    lemmatise each remaining token with ``WordNetLemmatizer.lemmatize``
    (called with its default arguments).

    Args:
        text: The raw review. Anything that is not a ``str`` (for example the
            float NaN pandas uses for an empty CSV cell) is treated as an
            empty review.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` when the input is
        not a string or contains no letters.
    """
    if not isinstance(text, str):
        # Missing cells reach us as NaN/None; calling .lower() on them would raise.
        return ''

    text = _NON_LETTER_RE.sub('', text.lower())
    if not text.strip():
        # Nothing left to tokenise -- same result the tokenizer would give,
        # without touching the NLTK resources.
        return ''

    stop_words, lemmatizer = _get_nlp_resources()
    tokens = word_tokenize(text)
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )


# Clean every review once up front; the model is trained on the cleaned text.
data['processed_text'] = data['text'].apply(preprocess_text)


# Features are the cleaned reviews; the target is 1 for 'Yes' (AI-generated)
# and 0 for 'No'. Series.map leaves any label other than 'Yes'/'No' as NaN.
X = data['processed_text']
y = data['AI_generated'].map({'Yes': 1, 'No': 0})


# Hold out half of the rows for testing, keeping the class ratio the same in
# both halves (stratify) and fixing the shuffle with random_state.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42, stratify=y)

# TF-IDF over unigrams and bigrams (at most 5000 features) feeding a linear SVM.
# probability=True is what makes pipeline.predict_proba available to predict_text.
# NOTE(review): per the scikit-learn docs, SVC's probability estimates come from
# an internal cross-validated calibration and may disagree with predict() --
# confirm this is acceptable for the reported "confidence".
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
    ('svm', SVC(kernel='linear', probability=True, random_state=42))
])


# Fit the vectoriser and classifier on the training half only.
pipeline.fit(X_train, y_train)


# Evaluate on the held-out half.
y_pred = pipeline.predict(X_test)
print("Model Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


# 2-fold cross-validation over the full dataset as a second estimate.
# NOTE(review): cross_val_score is expected to fit clones of the estimator, so
# `pipeline` should remain the model fitted on X_train above -- confirm.
cv_scores = cross_val_score(pipeline, X, y, cv=2)
print("\nCross-validation scores:", cv_scores)
print("Average CV score:", cv_scores.mean())


def predict_text(text):
    """Classify a single raw review with the module-level ``pipeline``.

    Args:
        text: The review exactly as typed; it is cleaned with
            ``preprocess_text`` before being scored.

    Returns:
        A ``(prediction, probability)`` tuple: the first element of
        ``pipeline.predict`` and the first row of ``pipeline.predict_proba``
        for the cleaned review.
    """
    # The pipeline scores collections of documents, so wrap the one cleaned
    # review in a list and reuse that list for both calls.
    batch = [preprocess_text(text)]
    label = pipeline.predict(batch)[0]
    class_probabilities = pipeline.predict_proba(batch)[0]
    return label, class_probabilities


def get_user_prediction():
    """Interactively classify reviews typed on standard input.

    Repeatedly prompts for a review, passes each non-blank entry to
    ``predict_text`` and prints the predicted class together with the
    probability reported for that class. The loop ends when the user enters
    ``quit`` (case-insensitive, surrounding whitespace ignored) or when the
    input stream has no more data.

    Returns:
        None. All results are written with ``print``.
    """
    while True:
        print("\nEnter a review to classify (or 'quit' to exit):")
        try:
            user_input = input().strip()
        except EOFError:
            # stdin is exhausted/closed (e.g. piped input): stop cleanly
            # instead of surfacing a traceback.
            break

        if user_input.lower() == 'quit':
            break
        if not user_input:
            # Nothing was typed; prompt again rather than scoring an empty review.
            continue

        prediction, probability = predict_text(user_input)
        is_ai_generated = prediction == 1
        # Report the probability of whichever class was predicted. This indexes
        # by label, which assumes the classifier's classes are ordered [0, 1].
        confidence = probability[1] if is_ai_generated else probability[0]

        print("\nResult:")
        print("Classification:", "AI-generated" if is_ai_generated else "Human-written")
        print(f"Confidence: {confidence:.2%}")


# Training and evaluation are done: announce readiness and start the
# interactive loop, which keeps prompting until the user enters 'quit'.
print("\nModel is ready for predictions!")
get_user_prediction()



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Model Accuracy: 0.6666666666666666

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.50      1.00      0.67         1

    accuracy                           0.67         3
   macro avg       0.75      0.75      0.67         3
weighted avg       0.83      0.67      0.67         3


Cross-validation scores: [0.33333333 0.5       ]
Average CV score: 0.41666666666666663

Model is ready for predictions!

Enter a review to classify (or 'quit' to exit):
what  is ai

Result:
Classification: AI-generated
Confidence: 50.00%

Enter a review to classify (or 'quit' to exit):
quit
