In [None]:
!pip install Levenshtein

Collecting Levenshtein
  Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (161 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m48.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, Levenshtein
Successfully installed Levenshtein-0.27.1 rapidfuzz-3.13.0


In [None]:
# -*- coding: utf-8 -*-
"""TextBaseLineedited.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1AWwopP91Py_f6pwfZqvNsXWi94g5EwzF
"""

import getpass
import json
import os

import Levenshtein
import nltk
import pandas as pd
import requests
from google.colab import drive
from nltk.tokenize import sent_tokenize, word_tokenize

# Download NLTK resources
nltk.download('punkt')

# Mount Google Drive
drive.mount('/content/drive')


def _require_api_key(env_name):
    """Return the API key held in environment variable ``env_name``.

    If the variable is unset or empty, the key is requested interactively
    (input is not echoed) and stored in ``os.environ`` for the rest of the
    session, so the secret never has to be written into the notebook.
    """
    key = os.environ.get(env_name)
    if not key:
        key = getpass.getpass(f"Enter the API key for '{env_name}': ")
        os.environ[env_name] = key
    return key


# Set API keys
# SECURITY: keys must never be hardcoded in the notebook. Any key that was
# previously committed here should be treated as leaked and revoked.
together_apikey = _require_api_key('togetherapi')
firework_apikey = _require_api_key('fireapi')

# Define models
together_ai_models = [
    "Qwen/Qwen2-72B-Instruct",
    # "google/gemma-2-9b-it",
    "mistralai/Mistral-7B-Instruct-v0.1",
    "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"
]

class Score:
    """Result of one model's regeneration of a sentence.

    Attributes are set directly from the constructor arguments:
    ``edit_score`` (the score assigned to the regeneration), ``new_text``
    (the regenerated text) and ``model`` (the label of the producing model).
    """

    def __init__(self, edit_score, new_text, model):
        # Store all three fields in one assignment; order matches the signature.
        self.edit_score, self.new_text, self.model = edit_score, new_text, model

def togetherai(question, model, api_key=together_apikey, timeout=120):
    """Ask a Together AI chat model to regenerate ``question``.

    Args:
        question: Text to regenerate; embedded verbatim in the prompt.
        model: Model identifier sent in the request payload.
        api_key: Bearer token for the API. Defaults to the module-level
            ``together_apikey``.
        timeout: Value passed to ``requests.post`` as its ``timeout``
            (seconds).

    Returns:
        The ``content`` of the first choice's message, or ``None`` when the
        request fails, the status is not 200, or the response body does not
        have the expected structure. Failures are reported with ``print``.
    """
    url = "https://api.together.xyz/v1/chat/completions"
    formatted_prompt = f"Regenerate provided text: TEXT = {question}"
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": formatted_prompt}],
        "max_tokens": 1024,
        "temperature": 0.5,
    }

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    try:
        # requests never times out on its own; an explicit timeout keeps a
        # stalled connection from blocking the whole run.
        response = requests.post(
            url, headers=headers, data=json.dumps(payload), timeout=timeout
        )
    except requests.RequestException as exc:
        print(f"TogetherAI Error: request failed: {exc}")
        return None

    if response.status_code != 200:
        print(f"TogetherAI Error: {response.status_code}, {response.text}")
        return None

    try:
        return response.json()['choices'][0]['message']['content']
    except (KeyError, IndexError, TypeError, ValueError) as exc:
        # A 200 response with a non-JSON or unexpectedly shaped body is
        # treated like any other failed call.
        print(f"TogetherAI Error: unexpected response format: {exc!r}")
        return None

def fireworks(question, api_key=firework_apikey, model="accounts/yi-01-ai/models/yi-large", timeout=120):
    """Ask a Fireworks chat model to regenerate ``question``.

    Args:
        question: Text to regenerate; embedded verbatim in the prompt.
        api_key: Bearer token for the API. Defaults to the module-level
            ``firework_apikey``.
        model: Model identifier sent in the request payload.
        timeout: Value passed to ``requests.post`` as its ``timeout``
            (seconds).

    Returns:
        The ``content`` of the first choice's message, or ``None`` when the
        request fails, the status is not 200, or the response body does not
        have the expected structure. Failures are reported with ``print``.
    """
    url = "https://api.fireworks.ai/inference/v1/chat/completions"
    formatted_prompt = f"Regenerate the text: TEXT={question}\n"
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": formatted_prompt}],
        "max_tokens": 1024,
        "temperature": 0.5,
    }

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    try:
        # requests never times out on its own; an explicit timeout keeps a
        # stalled connection from blocking the whole run.
        response = requests.post(
            url, headers=headers, data=json.dumps(payload), timeout=timeout
        )
    except requests.RequestException as exc:
        print(f"Fireworks Error: request failed: {exc}")
        return None

    if response.status_code != 200:
        print(f"Fireworks Error: {response.status_code}, {response.text}")
        return None

    try:
        return response.json()['choices'][0]['message']['content']
    except (KeyError, IndexError, TypeError, ValueError) as exc:
        # A 200 response with a non-JSON or unexpectedly shaped body is
        # treated like any other failed call.
        print(f"Fireworks Error: unexpected response format: {exc!r}")
        return None

def get_edit_distance(text1, text2):
    """Return the Levenshtein distance between two texts after tokenization.

    Each text is split with ``word_tokenize`` and its tokens are re-joined
    with single spaces; the distance is computed on those joined strings.
    """
    normalised = [" ".join(word_tokenize(raw)) for raw in (text1, text2)]
    return Levenshtein.distance(*normalised)

def detect_text(sentence):
    """Regenerate ``sentence`` with every configured model and score each result.

    Every entry of ``together_ai_models`` is queried in order via
    ``togetherai``, followed by one ``fireworks`` call. Each non-empty reply
    is scored against ``sentence`` with ``get_edit_distance``.

    Returns:
        A list of ``Score`` objects, one per model that produced a reply;
        the Fireworks result is labelled "Yi-Large".
    """
    scored = []

    def _record(label, regenerated):
        # A failed call yields None and an empty reply is falsy; skip both.
        if not regenerated:
            return
        distance = get_edit_distance(sentence, regenerated)
        scored.append(Score(distance, regenerated, label))

    # Together AI models first, in their configured order.
    for model_name in together_ai_models:
        _record(model_name, togetherai(sentence, model_name))

    # Then the single Fireworks-hosted model.
    _record("Yi-Large", fireworks(sentence))

    return scored

# Load CSV data
df = pd.read_csv('/content/updated_test_data 2.csv')
# Keep missing cells as NaN (instead of casting them to the string "nan") so
# they can be skipped below rather than sent to the models as real text.
texts = df["Text"].tolist()

# Prepare output data
output_data = []

# Fetch the sentence-tokenizer data once, not once per text.
nltk.download('punkt_tab')

# Process each text and split into sentences
for text_idx, text in enumerate(texts):
    if pd.isna(text):
        # Numbering still follows the row position, so later texts keep
        # the same sentence numbers they would have had otherwise.
        print(f"\nSkipping text {text_idx+1}: missing value.")
        continue
    sentences = sent_tokenize(str(text))
    for sent_idx, sentence in enumerate(sentences):
        print(f"\nProcessing Sentence {text_idx+1}.{sent_idx+1}: {sentence[:50]}...")
        try:
            results = detect_text(sentence)
            if not results:
                print("No valid results for this sentence.")
                continue
            # The model whose regeneration is closest to the input sentence
            # (smallest edit score) is recorded for this sentence.
            best_result = min(results, key=lambda x: x.edit_score)
            output_data.append({
                'sentence_number': f"{text_idx+1}.{sent_idx+1}",
                'llm': best_result.model
            })
        except Exception as e:
            # Best-effort: report the failure and move on to the next sentence.
            print(f"Error processing sentence {text_idx+1}.{sent_idx+1}: {e}")

# Export to CSV
# Explicit columns keep the header row in the file even when no sentence
# produced a result.
results_df = pd.DataFrame(output_data, columns=['sentence_number', 'llm'])
results_df.to_csv('/content/drive/MyDrive/llm_detection_results.csv', index=False)
print("Results saved to llm_detection_results.csv")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


KeyboardInterrupt: 