<a href="https://colab.research.google.com/github/karthikeyagade2/Mini_Project/blob/main/Text_Analysis_Tool.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gradio
!pip install requests



In [None]:
!pip install datasets



In [None]:
import gradio as gr
import requests
import json

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import torch
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
# Mount Google Drive at /content/drive; the dataset CSVs read later in this
# notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def load_and_combine_datasets(
    data_dir="/content/drive/MyDrive/Data_mp/Extra-Datasets-Used",
):
    """Load the six essay CSV files and merge them into one training frame.

    Args:
        data_dir: Directory containing the CSV files. Defaults to the Google
            Drive folder this notebook was written against, so existing
            zero-argument calls keep working.

    Returns:
        pandas.DataFrame with exactly two columns, ``text`` and ``generated``,
        with exact duplicate rows removed and a fresh 0..n-1 index.
    """
    # Local import: the notebook has no file-level pathlib import.
    from pathlib import Path

    base = Path(data_dir)

    def read(name):
        return pd.read_csv(base / name)

    # Already uses the target column names; select them explicitly so stray
    # extra columns cannot leak into the combined frame.
    essays = read("Training_Essay_Data.csv")[['text', 'generated']]

    # These two sources call the target column 'label'.
    drcat = read("train_drcat_01.csv")[['text', 'label']].rename(
        columns={'label': 'generated'}
    )
    rdizzl3 = read("train_essays_RDizzl3_seven_v1.csv")[['text', 'label']].rename(
        columns={'label': 'generated'}
    )

    # Falcon / Llama outputs carry no label column: every row is marked as
    # generated (1). The rename is positional, so these files must contain
    # exactly two columns with the essay text first.
    llm_only = pd.concat(
        [read("falcon_180b_v1.csv"), read("llama_70b_v1.csv")],
        axis=0,
        ignore_index=True,
    )
    llm_only['generated'] = 1
    llm_only.columns = ['text', 'writing_prompt', 'generated']
    llm_only = llm_only[['text', 'generated']]

    # The PaLM file's label column is cast to int to match the other sources.
    palm = read("LLM_generated_essay_PaLM.csv")[['text', 'generated']].astype(
        {'generated': int}
    )

    combined = pd.concat(
        [rdizzl3, drcat, essays, llm_only, palm],
        axis=0,
        ignore_index=True,
    )
    return combined.drop_duplicates(ignore_index=True)

In [None]:
def balance_labels(df, label_column='generated', random_state=42):
    """Oversample every class up to the size of the largest class.

    Each class keeps all of its original rows; smaller classes are topped up
    with rows drawn from that class with replacement.

    Args:
        df: Input frame containing ``label_column``.
        label_column: Name of the column holding the class label.
        random_state: Seed used for the oversampling draws.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. Classes appear in
        ``value_counts`` order, each class's original rows followed by its
        oversampled rows. Rows whose label is missing are not included
        (``value_counts`` skips them). If there are no labelled rows, an empty
        frame with the input's columns is returned.
    """
    label_counts = df[label_column].value_counts()
    if label_counts.empty:
        # Nothing to balance: keep the schema instead of returning a bare
        # column-less DataFrame.
        return df.iloc[0:0].reset_index(drop=True)

    target = label_counts.max()
    parts = []
    for label, count in label_counts.items():
        subset = df[df[label_column] == label]
        parts.append(subset)
        deficit = target - count
        if deficit > 0:
            parts.append(
                subset.sample(n=deficit, replace=True, random_state=random_state)
            )

    # Single concat instead of growing a frame inside the loop.
    return pd.concat(parts, ignore_index=True)

In [None]:
def train_llm_detector():
    """Fine-tune ``roberta-base`` as a binary human-vs-generated text classifier.

    Loads the combined essay data, holds out 20% for evaluation, oversamples
    the training portion only, fine-tunes for two epochs and prints the final
    evaluation metrics.

    Returns:
        tuple: ``(model, tokenizer)`` - the fine-tuned
        ``RobertaForSequenceClassification`` and its ``RobertaTokenizer``.
    """
    # Load data. Rows without text cannot be tokenized and rows without a
    # label cannot be trained on, so drop them; labels are cast to int
    # because the classification head expects integer class ids.
    all_data = load_and_combine_datasets()
    all_data = all_data.dropna(subset=['text', 'generated'])
    all_data = all_data.assign(generated=all_data['generated'].astype(int))

    # Split BEFORE oversampling. Oversampling duplicates minority-class rows;
    # doing it first would put copies of the same essay in both the training
    # and the evaluation set and inflate every reported metric.
    x_train, x_test, y_train, y_test = train_test_split(
        all_data.text,
        all_data.generated,
        test_size=0.2,
        shuffle=True,
        stratify=all_data.generated,  # keep the class ratio in the held-out set
        random_state=42
    )

    # Balance the training portion only, then shuffle because balance_labels
    # returns rows grouped by class.
    train_df = balance_labels(
        pd.DataFrame({'text': x_train, 'labels': y_train}),
        label_column='labels',
    )
    train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
    # Reset the shuffled index so it is not carried into the Dataset as an
    # extra column.
    test_df = pd.DataFrame({'text': x_test, 'labels': y_test}).reset_index(drop=True)

    # Tokenizer and Dataset Preparation
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    def tokenize_function(examples):
        return tokenizer(
            examples['text'],
            padding="max_length",
            truncation=True,
            max_length=256  # Reduced max length to speed up processing
        )

    train_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)

    train_dataset = train_dataset.map(tokenize_function, batched=True)
    test_dataset = test_dataset.map(tokenize_function, batched=True)

    # Model initialization: two output classes (human / generated)
    model = RobertaForSequenceClassification.from_pretrained(
        'roberta-base',
        num_labels=2
    )

    # Metrics Computation
    def compute_metrics(p):
        predictions, labels = p
        preds = np.argmax(predictions, axis=1)
        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
        acc = accuracy_score(labels, preds)
        return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

    # Training Arguments (Optimized for Faster Training)
    training_args = TrainingArguments(
        output_dir="./results",
        eval_strategy="epoch",
        per_device_train_batch_size=32,  # Increased batch size
        per_device_eval_batch_size=32,
        num_train_epochs=2,  # Reduced epochs
        learning_rate=2e-5,  # Slightly adjusted learning rate
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,  # More frequent logging
        save_total_limit=1,  # Limit checkpoint saves
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
    )

    # Train and Evaluate
    trainer.train()
    eval_results = trainer.evaluate()
    print(f"Evaluation results: {eval_results}")

    return model, tokenizer

In [None]:
# Run the training pipeline and keep the returned model and tokenizer.
trained_model, trained_tokenizer = train_llm_detector()

Map:   0%|          | 0/64297 [00:00<?, ? examples/s]

In [None]:
import gradio as gr
import requests
import json

def validate_text_length(text):
    """Check that *text* is between 300 and 300,000 characters long.

    Returns a ``(is_valid, message)`` pair; ``message`` is an empty string
    when the length is acceptable and a user-facing error otherwise.
    """
    char_count = len(text)

    if char_count > 300000:
        verdict = (False, "Error: Text cannot exceed 300,000 characters.")
    elif char_count < 300:
        verdict = (False, "Error: Text must be at least 300 characters long.")
    else:
        verdict = (True, "")

    return verdict

def detect_ai_text(input_text):
    """Send *input_text* to the Winston AI content-detection endpoint.

    The API key is read from the ``WINSTON_API_KEY`` environment variable.
    Credentials must not live in source; the key that was previously
    committed in this function should be treated as compromised and rotated.

    Args:
        input_text: Text to analyse. ``None``, empty and whitespace-only
            values are rejected before any request is made.

    Returns:
        str: ``"Human Score: <score>%"`` on success, otherwise a
        human-readable message describing why no score is available. This
        function never raises for network or API failures because its return
        value is shown directly in the UI.
    """
    # Local import: the notebook has no file-level os import.
    import os

    if not input_text or not input_text.strip():
        return "Please enter some text to analyze."

    # Validate text length
    is_valid, error_message = validate_text_length(input_text)
    if not is_valid:
        return error_message

    api_key = os.environ.get("WINSTON_API_KEY")
    if not api_key:
        return "Error: WINSTON_API_KEY environment variable is not set."

    url = "https://api.gowinston.ai/v2/ai-content-detection"
    payload = {"text": input_text}
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    try:
        # Without a timeout a stalled connection would hang the UI forever.
        response = requests.post(url, json=payload, headers=headers, timeout=60)
        result = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a response body that is not valid JSON.
        return f"Error: {str(e)}"

    # Extract score from response
    if isinstance(result, dict) and 'score' in result:
        score = result['score']
        return f"Human Score: {score}%"
    if not response.ok:
        return f"Error: API request failed with HTTP status {response.status_code}."
    return "Unable to get score from response"

def check_plagiarism(input_text):
    """Send *input_text* to the Winston AI plagiarism endpoint.

    The API key is read from the ``WINSTON_API_KEY`` environment variable.
    Credentials must not live in source; the key that was previously
    committed in this function should be treated as compromised and rotated.

    Args:
        input_text: Text to check. ``None``, empty and whitespace-only values
            are rejected before any request is made.

    Returns:
        str: ``"Plagiarism Score: <score>%"`` on success, otherwise a
        human-readable message describing why no score is available. This
        function never raises for network or API failures because its return
        value is shown directly in the UI.
    """
    # Local import: the notebook has no file-level os import.
    import os

    if not input_text or not input_text.strip():
        return "Please enter some text to analyze."

    # Validate text length
    is_valid, error_message = validate_text_length(input_text)
    if not is_valid:
        return error_message

    api_key = os.environ.get("WINSTON_API_KEY")
    if not api_key:
        return "Error: WINSTON_API_KEY environment variable is not set."

    url = "https://api.gowinston.ai/v2/plagiarism"
    payload = {"text": input_text}
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    try:
        # Without a timeout a stalled connection would hang the UI forever.
        response = requests.post(url, json=payload, headers=headers, timeout=60)
        result = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a response body that is not valid JSON.
        return f"Error: {str(e)}"

    # The score is nested under the top-level 'result' object.
    details = result.get('result') if isinstance(result, dict) else None
    if isinstance(details, dict) and 'score' in details:
        score = details['score']
        return f"Plagiarism Score: {score}%"
    if not response.ok:
        return f"Error: API request failed with HTTP status {response.status_code}."
    return "Unable to get score from response"

# Create the Gradio interface
def create_interface():
    """Assemble the Gradio Blocks UI for the text analysis tool.

    The layout is a heading and a length note, one input textbox, a row with
    two action buttons, and a single-line result textbox. Both buttons read
    from the same input and write to the same result box.

    Returns:
        The constructed ``gr.Blocks`` object (not yet launched).
    """
    with gr.Blocks(title="Text Analysis Tool") as app:
        gr.Markdown("# Text Analysis Tool")
        gr.Markdown("Note: Text must be between 300 and 300,000 characters.")

        source_box = gr.Textbox(
            label="Input Text",
            placeholder="Enter your text here (minimum 300 characters)...",
            lines=5,
        )

        with gr.Row():
            detector_btn = gr.Button("AI Text Detector")
            plagiarism_btn = gr.Button("Plagiarism Checker")

        result_box = gr.Textbox(label="Result", lines=1)

        # Wire each button to its handler; both share input and output boxes.
        for trigger, handler in (
            (detector_btn, detect_ai_text),
            (plagiarism_btn, check_plagiarism),
        ):
            trigger.click(fn=handler, inputs=source_box, outputs=result_box)

    return app

# Launch the app
if __name__ == "__main__":
    # Build the UI and start the Gradio server.
    demo = create_interface()
    demo.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://a2f30d788fdb81ce7b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
