<a href="https://colab.research.google.com/github/comanchegenerate/ComancheSynthetic/blob/main/Comanche_Lang_ID.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Original Zero-Shot Tests

In [None]:
import pandas as pd
import openai
import random
from collections import defaultdict

In [None]:
# Read the sentence table from disk. "dataset" is a placeholder: replace it
# with the actual dataset file name.
df = pd.read_csv("dataset")


# Map each language label to its list of sentences; rows whose "Comanche"
# cell is empty are dropped before the column is turned into a list.
language_data = defaultdict(list, Comanche=df["Comanche"].dropna().tolist())

print(language_data)

In [None]:
# Build the zero-shot test set: every available sentence for each language.
# The slice bound used to be len(language_data), which is the number of
# languages (1 here), so only the first sentence was ever evaluated.
zero_shot_samples = {lang: list(sentences) for lang, sentences in language_data.items()}

print(zero_shot_samples)

In [None]:
# Initialize OpenAI client.
# No key is hardcoded in the notebook: when api_key is omitted, the SDK reads
# it from the OPENAI_API_KEY environment variable. Set that variable before
# running this cell, and never commit a real key to the repository.
client = openai.OpenAI()

In [None]:
def query_gpt(prompt, model="gpt-4o"):
    """Send one user message to a chat model and return the reply text.

    Args:
        prompt: Text sent as the sole user message.
        model: Chat-completions model name. Defaults to "gpt-4o", the model
            this function previously hard-coded.

    Returns:
        The first choice's message content with surrounding whitespace
        stripped, or "" when the response carries no text content.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    # The SDK types `content` as optional, so guard before calling .strip().
    content = response.choices[0].message.content
    return content.strip() if content else ""

In [None]:
# Zero-shot evaluation: ask the model to name the language of each sentence
# without showing it any labelled examples first.
ZERO_SHOT_PROMPT = (
    "You are a linguistics expert who knows every single language that exists in this world. "
    "What language is this sentence in?\n\n"
    "Sentence: {sentence}. Reply with only the language itself and nothing else."
)

zero_shot_results = []
for lang in zero_shot_samples:
    for sentence in zero_shot_samples[lang]:
        predicted_lang = query_gpt(ZERO_SHOT_PROMPT.format(sentence=sentence))
        # One row per sentence: true label, input text, model's answer.
        zero_shot_results.append([lang, sentence, predicted_lang])

In [None]:
# Tabulate the zero-shot rows: one row per evaluated sentence.
result_columns = ["True Language", "Sentence", "Predicted Language"]
zero_shot_df = pd.DataFrame(data=zero_shot_results, columns=result_columns)

# Persist the table as CSV, leaving out the DataFrame index column.
zero_shot_df.to_csv(path_or_buf="comanche_id_zero_shot.csv", index=False)

print("Experiment complete! Results saved.")

# Iterate over an increasing number of few-shot examples, chart the resulting success rates, and compare the performance of GPT-4o versus GPT-4o-mini.

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from collections import defaultdict
import openai

import os  # cell-local import: only needed for the API-key lookup at the end

# Load your dataset
df = pd.read_csv("dataset.csv")  # Replace with name of actual dataset

# Group sentences by language
language_data = defaultdict(list)
language_data["Comanche"] = df["Comanche"].dropna().tolist()

# Set few-shot and validation samples: the first FEW_SHOT_FRACTION of the
# sentences become in-prompt examples, the remainder is the validation set.
FEW_SHOT_FRACTION = 0.03  # 3% for few-shot examples
data = language_data["Comanche"]
few_shot_size = int(len(data) * FEW_SHOT_FRACTION)
few_shot_samples = data[:few_shot_size]
validation_samples = data[few_shot_size:]

print("Few-shot samples:", len(few_shot_samples))
print("Validation samples:", len(validation_samples))

# OpenAI API key setup: read the key from the OPENAI_API_KEY environment
# variable instead of hardcoding it in the notebook. A missing variable fails
# here with a KeyError rather than later, in the middle of the experiment.
openai.api_key = os.environ["OPENAI_API_KEY"]

# Function to query GPT models
def query_gpt(prompt, model):
    """Send a prompt to the given chat model and return the reply text.

    Args:
        prompt: Text sent as the user message, after a fixed system message.
        model: Chat-completions model name (e.g. "gpt-4o").

    Returns:
        The first choice's message content with surrounding whitespace
        stripped, or "" when the response carries no text content.
    """
    response = openai.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a language identification expert."},
            {"role": "user", "content": prompt},
        ],
        temperature=0.0,
    )
    # The response message is an object, not a dict: read `.content` as an
    # attribute (subscripting it with ['content'] raises TypeError). The SDK
    # types the field as optional, so guard before calling .strip().
    content = response.choices[0].message.content
    return content.strip() if content else ""

# Per-model tallies: results[model][n] is how many validation sentences were
# labelled "Comanche" when the prompt carried n few-shot examples.
results = {"gpt-4o": {}, "gpt-4o-mini": {}}

# n runs from 0 (no examples) up to and including len(few_shot_samples).
x = len(few_shot_samples) + 1

# Prompt skeleton shared by every request; the example list and the sentence
# under test are filled in per call.
FEW_SHOT_PROMPT = """
Here are examples of sentences and their languages:
{examples}

Now, what language is this sentence in?

Sentence: {sentence}
Reply with only the language itself.
"""

# Run tests for each model
for model in results:
    print(f"\nRunning tests for model: {model}")
    for n in range(x):
        # Numbered list of the first n few-shot sentences, all labelled Comanche.
        example_lines = []
        for position, sent in enumerate(few_shot_samples[:n], start=1):
            example_lines.append(f"Example {position}: {sent} (Language: Comanche)")
        examples = "\n".join(example_lines)

        # Query every validation sentence, then count the replies that
        # mention "comanche" (case-insensitive).
        predictions = [
            query_gpt(FEW_SHOT_PROMPT.format(examples=examples, sentence=sentence), model=model)
            for sentence in validation_samples
        ]
        count_comanche = sum("comanche" in reply.lower() for reply in predictions)

        results[model][n] = count_comanche
        print(f"{model}, Few-shot examples: {n}, 'Comanche' predictions: {count_comanche}")

# Save results to CSV
# Flatten results[model][n] -> count into one row per (model, n) pair.
# The per-model dict is named `counts_by_n` rather than `data`: a loop variable
# called `data` would overwrite the module-level `data` sentence list and break
# any later re-run of the few-shot/validation split.
final_results = [
    {
        "Model": model_name,
        "Few-Shot Count": n_shots,
        "Comanche Predictions": count,
    }
    for model_name, counts_by_n in results.items()
    for n_shots, count in counts_by_n.items()
]

results_df = pd.DataFrame(final_results)
results_df.to_csv("comanche_comparison_results.csv", index=False)
print("Results saved to CSV")

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8, 5))

# few_shot_samples is a plain list of sentences (not a dict keyed by
# language), so the x axis is simply 0..len(few_shot_samples) examples.
shot_counts = list(range(len(few_shot_samples) + 1))

# `results` stores raw counts of "Comanche" predictions. Convert them to a
# percentage of the validation set so the values match the "(%)" axis label
# and the fixed 0-105 y range. An empty validation set plots as 0.
n_validation = len(validation_samples)
percent_per_prediction = 100.0 / n_validation if n_validation else 0.0

series = (
    ("gpt-4o", "GPT-4o", "#0000FE"),
    ("gpt-4o-mini", "GPT-4o-mini", "#C80000"),
)
for model_name, label, color in series:
    percentages = [
        results[model_name].get(n, 0) * percent_per_prediction for n in shot_counts
    ]
    ax.plot(shot_counts, percentages,
            marker='o', linestyle='-', linewidth=2, label=label, color=color)

ax.set_xlabel('Number of Few-Shot Training Examples (n)', fontsize=12)
ax.set_ylabel('Number of "Comanche" Predictions (%)', fontsize=10)
ax.set_title('Effect of Few-Shot Examples on "Comanche" Predictions', fontsize=14)
ax.set_xticks(shot_counts)
ax.set_ylim(0, 105)  # headroom above 100% so the top markers are not clipped
ax.grid(True, which='both', linestyle='--', linewidth=0.5)
ax.legend(title="OpenAI Model")
plt.show()
