In [None]:
from google.colab import files
uploaded = files.upload()

Saving preprocessed fake review dataset.csv to preprocessed fake review dataset (1).csv


In [None]:


import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    RobertaModel,
    get_linear_schedule_with_warmup
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import warnings
import json
from tqdm import tqdm
import os
from torch.optim import AdamW
warnings.filterwarnings('ignore')

class ReviewDataset(Dataset):
    """Custom Dataset with optional label smoothing"""
    def __init__(self, texts, labels, tokenizer, max_length=512, label_smoothing=0.0):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label_smoothing = label_smoothing

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

class ImprovedFakeReviewDetector:
    """Enhanced detector with multiple strategies for better accuracy"""

    def __init__(self, model_name='roberta-base', num_labels=2, device=None, dropout_rate=0.3):
        self.device = device if device else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.num_labels = num_labels

        print(f"Using device: {self.device}")
        print(f"Loading {model_name}...")

        self.tokenizer = RobertaTokenizer.from_pretrained(model_name)

        # Model with configurable dropout
        self.model = RobertaForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            hidden_dropout_prob=dropout_rate,
            attention_probs_dropout_prob=dropout_rate
        )
        self.base_model = RobertaModel.from_pretrained(model_name)

        self.model.to(self.device)
        self.base_model.to(self.device)

        print("Model loaded successfully!")

    def prepare_data(self, df, text_column='clean_text', label_column='label',
                     test_size=0.2, val_size=0.1, random_state=42):
        """Prepare data with class balance analysis"""
        print("\n=== Data Preparation ===")
        print(f"Total samples: {len(df)}")

        df = df.dropna(subset=[text_column, label_column])

        texts = df[text_column].values
        labels = df[label_column].values

        # Analyze class distribution
        unique_labels, counts = np.unique(labels, return_counts=True)
        print(f"\nLabel distribution:")
        for label, count in zip(unique_labels, counts):
            label_name = "Real (OR)" if label == 0 else "Fake (CG)"
            print(f"  {label_name}: {count} samples ({count/len(labels)*100:.1f}%)")

        # Calculate class weights for imbalanced data
        class_weights = compute_class_weight(
            class_weight='balanced',
            classes=np.unique(labels),
            y=labels
        )
        self.class_weights = torch.FloatTensor(class_weights).to(self.device)
        print(f"\nClass weights (for imbalance): {class_weights}")

        # Split data
        X_temp, X_test, y_temp, y_test = train_test_split(
            texts, labels, test_size=test_size, random_state=random_state, stratify=labels
        )

        val_ratio = val_size / (1 - test_size)
        X_train, X_val, y_train, y_val = train_test_split(
            X_temp, y_temp, test_size=val_ratio, random_state=random_state, stratify=y_temp
        )

        print(f"\nTrain samples: {len(X_train)}")
        print(f"Validation samples: {len(X_val)}")
        print(f"Test samples: {len(X_test)}")
        print(f"Train distribution: Real={np.sum(y_train==0)}, Fake={np.sum(y_train==1)}")

        return X_train, X_val, X_test, y_train, y_val, y_test

    def create_data_loaders(self, X_train, X_val, X_test, y_train, y_val, y_test,
                           batch_size=16, max_length=256, use_weighted_sampler=True):
        """Create DataLoaders with optional weighted sampling"""
        train_dataset = ReviewDataset(X_train, y_train, self.tokenizer, max_length)
        val_dataset = ReviewDataset(X_val, y_val, self.tokenizer, max_length)
        test_dataset = ReviewDataset(X_test, y_test, self.tokenizer, max_length)

        # Weighted sampler for imbalanced classes
        if use_weighted_sampler:
            class_sample_count = np.array([len(np.where(y_train == t)[0]) for t in np.unique(y_train)])
            weight = 1. / class_sample_count
            samples_weight = np.array([weight[t] for t in y_train])
            samples_weight = torch.from_numpy(samples_weight)
            sampler = WeightedRandomSampler(samples_weight.type('torch.DoubleTensor'), len(samples_weight))

            train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=sampler)
            print("Using weighted sampler for balanced training")
        else:
            train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

        return train_loader, val_loader, test_loader

    def train(self, train_loader, val_loader, epochs=8, learning_rate=5e-6,
              warmup_steps=200, weight_decay=0.01, use_class_weights=True,
              patience=3, min_delta=0.001):



        print("\n=== IMPROVED Training Configuration ===")
        print(f"Epochs: {epochs} (INCREASED)")
        print(f"Learning rate: {learning_rate} (SLOWER)")
        print(f"Warmup steps: {warmup_steps} (MORE)")
        print(f"Weight decay: {weight_decay}")
        print(f"Using class weights: {use_class_weights}")
        print(f"Early stopping patience: {patience}")

        optimizer = AdamW(
            self.model.parameters(),
            lr=learning_rate,
            eps=1e-8,
            weight_decay=weight_decay
        )

        total_steps = len(train_loader) * epochs
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=total_steps
        )

        # Loss function with class weights
        if use_class_weights:
            criterion = nn.CrossEntropyLoss(weight=self.class_weights)
            print(f"Using weighted loss with weights: {self.class_weights}")
        else:
            criterion = nn.CrossEntropyLoss()

        best_val_accuracy = 0
        best_model_state = None
        patience_counter = 0
        history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

        for epoch in range(epochs):
            print(f"\n{'='*60}")
            print(f"Epoch {epoch + 1}/{epochs}")
            print('='*60)

            # Training phase
            self.model.train()
            total_train_loss = 0
            train_predictions = []
            train_labels = []

            progress_bar = tqdm(train_loader, desc=f"Training")
            for step, batch in enumerate(progress_bar):
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['labels'].to(self.device)

                optimizer.zero_grad()

                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                )

                logits = outputs.logits

                # Use custom criterion with class weights
                loss = criterion(logits, labels)
                total_train_loss += loss.item()

                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()

                predictions = torch.argmax(logits, dim=-1)
                train_predictions.extend(predictions.cpu().numpy())
                train_labels.extend(labels.cpu().numpy())

                progress_bar.set_postfix({'loss': loss.item()})

            avg_train_loss = total_train_loss / len(train_loader)
            train_accuracy = accuracy_score(train_labels, train_predictions)

            # Validation phase
            val_loss, val_accuracy = self.evaluate(val_loader, criterion)

            history['train_loss'].append(avg_train_loss)
            history['train_acc'].append(train_accuracy)
            history['val_loss'].append(val_loss)
            history['val_acc'].append(val_accuracy)

            print(f"\nEpoch {epoch + 1} Summary:")
            print(f"  Train Loss: {avg_train_loss:.4f} | Train Accuracy: {train_accuracy:.4f}")
            print(f"  Val Loss: {val_loss:.4f} | Val Accuracy: {val_accuracy:.4f}")

            # Save best model and early stopping
            if val_accuracy > best_val_accuracy + min_delta:
                best_val_accuracy = val_accuracy
                best_model_state = self.model.state_dict().copy()
                patience_counter = 0
                print(f"  ✓ New best validation accuracy: {best_val_accuracy:.4f}")
            else:
                patience_counter += 1
                print(f"  No improvement (patience: {patience_counter}/{patience})")

                if patience_counter >= patience:
                    print(f"\n⚠️  Early stopping triggered after {epoch + 1} epochs")
                    break

        # Load best model
        if best_model_state is not None:
            self.model.load_state_dict(best_model_state)
            print(f"\n{'='*60}")
            print(f"Loaded best model with validation accuracy: {best_val_accuracy:.4f}")
            print('='*60)

        return history

    def evaluate(self, data_loader, criterion=None):
        """Evaluate the model"""
        self.model.eval()
        total_loss = 0
        predictions = []
        true_labels = []

        if criterion is None:
            criterion = nn.CrossEntropyLoss()

        with torch.no_grad():
            for batch in data_loader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['labels'].to(self.device)

                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                )

                logits = outputs.logits
                loss = criterion(logits, labels)
                total_loss += loss.item()

                batch_predictions = torch.argmax(logits, dim=-1)
                predictions.extend(batch_predictions.cpu().numpy())
                true_labels.extend(labels.cpu().numpy())

        avg_loss = total_loss / len(data_loader)
        accuracy = accuracy_score(true_labels, predictions)

        return avg_loss, accuracy

    def predict_single_review(self, review_text, max_length=256):
        """Predict single review"""
        self.model.eval()

        encoding = self.tokenizer(
            review_text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)

        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            probs = torch.softmax(logits, dim=-1)

            prediction = torch.argmax(logits, dim=-1).item()
            confidence = torch.max(probs, dim=-1)[0].item()

            real_prob = probs[0][0].item()
            fake_prob = probs[0][1].item()

        return {
            'prediction': prediction,
            'label': 'Real (OR)' if prediction == 0 else 'Fake (CG)',
            'confidence': confidence,
            'real_probability': real_prob,
            'fake_probability': fake_prob
        }

    def get_detailed_report(self, data_loader):
        """Get detailed classification report"""
        self.model.eval()
        predictions = []
        true_labels = []
        confidences = []

        with torch.no_grad():
            for batch in data_loader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['labels'].to(self.device)

                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                probs = torch.softmax(logits, dim=-1)

                batch_predictions = torch.argmax(logits, dim=-1).cpu().numpy()
                batch_confidences = torch.max(probs, dim=-1)[0].cpu().numpy()

                predictions.extend(batch_predictions)
                true_labels.extend(labels.cpu().numpy())
                confidences.extend(batch_confidences)

        accuracy = accuracy_score(true_labels, predictions)
        precision, recall, f1, _ = precision_recall_fscore_support(
            true_labels, predictions, average='weighted'
        )

        print(f"\n{'='*60}")
        print("FINAL TEST RESULTS")
        print('='*60)
        print(f"Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1-Score: {f1:.4f}")

        print("\n" + "="*60)
        print("Classification Report:")
        print("="*60)
        print(classification_report(
            true_labels, predictions,
            target_names=['Real (OR)', 'Fake (CG)']
        ))

        print("\nConfusion Matrix:")
        cm = confusion_matrix(true_labels, predictions)
        print(cm)
        print("\nInterpretation:")
        print(f"  True Real: {cm[0][0]}")
        print(f"  False Fake: {cm[0][1]}")
        print(f"  False Real: {cm[1][0]}")
        print(f"  True Fake: {cm[1][1]}")

        return {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'predictions': predictions,
            'true_labels': true_labels,
            'confidences': confidences
        }

    def save_model(self, output_dir):
        """Save model and tokenizer"""
        os.makedirs(output_dir, exist_ok=True)
        self.model.save_pretrained(output_dir)
        self.tokenizer.save_pretrained(output_dir)
        print(f"\n✓ Model saved to {output_dir}")

    def load_model(self, model_dir):
        """Load saved model"""
        self.model = RobertaForSequenceClassification.from_pretrained(model_dir)
        self.tokenizer = RobertaTokenizer.from_pretrained(model_dir)
        self.model.to(self.device)
        print(f"✓ Model loaded from {model_dir}")


def main():

    print("="*60)
    print("="*60)

    # Load dataset
    file_path = 'preprocessed fake review dataset.csv'

    try:
        if file_path.endswith('.xlsx'):
            df = pd.read_excel(file_path)
        else:
            df = pd.read_csv(file_path)
        print(f"✓ Dataset loaded: {df.shape}")
    except:
        import glob
        files = glob.glob('/mnt/user-data/uploads/*.csv')
        if files:
            file_path = files[0]
            df = pd.read_csv(file_path) if file_path.endswith('.csv') else pd.read_excel(file_path)
        else:
            raise Exception("No dataset file found")

    # Map labels
    label_mapping = {'OR': 0, 'CG': 1}
    df['label'] = df['ABC'].map(label_mapping)
    df = df.dropna(subset=['label'])

    print(f"Labels mapped:")
    print(f"  OR (Real): {(df['label']==0).sum()}")
    print(f"  CG (Fake): {(df['label']==1).sum()}")

    # Initialize detector
    detector = ImprovedFakeReviewDetector(
        model_name='roberta-base',
        num_labels=2,
        dropout_rate=0.2  # Reduced dropout for better learning
    )

    # Prepare data
    X_train, X_val, X_test, y_train, y_val, y_test = detector.prepare_data(
        df,
        text_column='clean_text',
        label_column='label',
        test_size=0.2,
        val_size=0.1
    )

    # Create data loaders with weighted sampling
    train_loader, val_loader, test_loader = detector.create_data_loaders(
        X_train, X_val, X_test, y_train, y_val, y_test,
        batch_size=16,
        max_length=256,
        use_weighted_sampler=True  # Handle class imbalance
    )

    # IMPROVED TRAINING
    history = detector.train(
        train_loader,
        val_loader,
        epochs=8,
        learning_rate=5e-6,
        warmup_steps=200,
        weight_decay=0.01,
        use_class_weights=True,  # Handle imbalance

    )

    # Evaluate
    results = detector.get_detailed_report(test_loader)

    # Save model
    detector.save_model('/mnt/user-data/outputs/improved_fake_review_model')

    # Save results
    results_summary = {
        'task': 'Fake Review Detection (OR vs CG)',
        'final_accuracy': float(results['accuracy']),
        'precision': float(results['precision']),
        'recall': float(results['recall']),
        'f1_score': float(results['f1']),
        'training_config': {
            'epochs': 8,
            'learning_rate': 5e-6,
            'warmup_steps': 200,
            'class_weights': 'enabled',
            'weighted_sampler': 'enabled',
            'early_stopping': 'enabled'
        },
        'training_history': {
            'train_accuracy': [float(x) for x in history['train_acc']],
            'val_accuracy': [float(x) for x in history['val_acc']],
            'train_loss': [float(x) for x in history['train_loss']],
            'val_loss': [float(x) for x in history['val_loss']]
        }
    }

    with open('/mnt/user-data/outputs/improved_training_results.json', 'w') as f:
        json.dump(results_summary, f, indent=2)

    print(f"\n{'='*60}")
    print("FINAL SUMMARY")
    print('='*60)
    print(f"✓Model saved to: /mnt/user-data/outputs/improved_fake_review_model")
    print(f"Final Test Accuracy: {results['accuracy']:.4f} ({results['accuracy']*100:.2f}%)")
    print(f" Final Validation Accuracy: {history['val_acc'][-1]:.4f} ({history['val_acc'][-1]*100:.2f}%)")

    if results['accuracy'] >= 0.85:
        print("\n SUCCESS! Achieved 85%+ accuracy!")
    elif results['accuracy'] >= 0.75:
        print(f"\n💡 Good progress! {results['accuracy']*100:.1f}% accuracy")
        print("   Try: More epochs or different hyperparameters")
    else:
        print(f"\n  {results['accuracy']*100:.1f}% accuracy")
        print("   Dataset might be very difficult")
        print("   Run diagnose_dataset.py to analyze the problem")

    return detector, results


if __name__ == "__main__":
    detector, results = main()


✓ Dataset loaded: (40401, 6)
Labels mapped:
  OR (Real): 20215
  CG (Fake): 20186
Using device: cuda
Loading roberta-base...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]



vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

RobertaForSequenceClassification LOAD REPORT from: roberta-base
Key                             | Status     | 
--------------------------------+------------+-
lm_head.layer_norm.bias         | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.out_proj.bias        | MISSING    | 
classifier.dense.weight         | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

RobertaModel LOAD REPORT from: roberta-base
Key                             | Status     | 
--------------------------------+------------+-
lm_head.layer_norm.bias         | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
pooler.dense.bias               | MISSING    | 
pooler.dense.weight             | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Model loaded successfully!

=== Data Preparation ===
Total samples: 40401

Label distribution:
  Real (OR): 20215 samples (50.0%)
  Fake (CG): 20185 samples (50.0%)

Class weights (for imbalance): [0.99925798 1.00074313]

Train samples: 28280
Validation samples: 4040
Test samples: 8080
Train distribution: Real=14150, Fake=14130
Using weighted sampler for balanced training

=== IMPROVED Training Configuration ===
Epochs: 8 (INCREASED)
Learning rate: 5e-06 (SLOWER)
Warmup steps: 200 (MORE)
Weight decay: 0.01
Using class weights: True
Early stopping patience: 3
Using weighted loss with weights: tensor([0.9993, 1.0007], device='cuda:0')

Epoch 1/8


Training: 100%|██████████| 1768/1768 [21:46<00:00,  1.35it/s, loss=0.346]



Epoch 1 Summary:
  Train Loss: 0.4403 | Train Accuracy: 0.7748
  Val Loss: 0.8675 | Val Accuracy: 0.7176
  ✓ New best validation accuracy: 0.7176

Epoch 2/8


Training: 100%|██████████| 1768/1768 [21:47<00:00,  1.35it/s, loss=0.148]



Epoch 2 Summary:
  Train Loss: 0.3003 | Train Accuracy: 0.8658
  Val Loss: 0.4085 | Val Accuracy: 0.8448
  ✓ New best validation accuracy: 0.8448

Epoch 3/8


Training: 100%|██████████| 1768/1768 [21:55<00:00,  1.34it/s, loss=0.26]



Epoch 3 Summary:
  Train Loss: 0.2697 | Train Accuracy: 0.8812
  Val Loss: 0.5468 | Val Accuracy: 0.8282
  No improvement (patience: 1/3)

Epoch 4/8


Training: 100%|██████████| 1768/1768 [21:54<00:00,  1.35it/s, loss=0.279]



Epoch 4 Summary:
  Train Loss: 0.2452 | Train Accuracy: 0.8974
  Val Loss: 0.5859 | Val Accuracy: 0.8054
  No improvement (patience: 2/3)

Epoch 5/8


Training: 100%|██████████| 1768/1768 [21:54<00:00,  1.35it/s, loss=0.103]



Epoch 5 Summary:
  Train Loss: 0.2293 | Train Accuracy: 0.9038
  Val Loss: 0.8025 | Val Accuracy: 0.8002
  No improvement (patience: 3/3)

⚠️  Early stopping triggered after 5 epochs

Loaded best model with validation accuracy: 0.8448

FINAL TEST RESULTS
Accuracy: 0.8054 (80.54%)
Precision: 0.8502
Recall: 0.8054
F1-Score: 0.7991

Classification Report:
              precision    recall  f1-score   support

   Real (OR)       0.98      0.63      0.76      4043
   Fake (CG)       0.72      0.98      0.83      4037

    accuracy                           0.81      8080
   macro avg       0.85      0.81      0.80      8080
weighted avg       0.85      0.81      0.80      8080


Confusion Matrix:
[[2535 1508]
 [  64 3973]]

Interpretation:
  True Real: 2535
  False Fake: 1508
  False Real: 64
  True Fake: 3973


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


✓ Model saved to /mnt/user-data/outputs/improved_fake_review_model

FINAL SUMMARY
✓Model saved to: /mnt/user-data/outputs/improved_fake_review_model
Final Test Accuracy: 0.8054 (80.54%)
 Final Validation Accuracy: 0.8002 (80.02%)

💡 Good progress! 80.5% accuracy
   Try: More epochs or different hyperparameters
