<a href="https://colab.research.google.com/github/comanchegenerate/ComancheSynthetic/blob/main/language_identification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Original Zero-Shot Tests

In [None]:
import pandas as pd
import openai
import random
from collections import defaultdict

In [None]:
# Read the CSV into a DataFrame
df = pd.read_csv("dataset")  # Replace with actual dataset name

# Bucket the non-null sentences of each language column under its language label
language_data = defaultdict(list, {"Comanche": df["Comanche"].dropna().tolist()})

print(language_data)

In [None]:
# Build the zero-shot test set: every sentence of every language.
# The cut-off must be the sentence count; len(language_data) is the number of
# languages (1 here) and would keep only a single sentence per language.
zero_shot_samples = {lang: list(sentences) for lang, sentences in language_data.items()}

# Report how many sentences were selected per language rather than dumping them all
print({lang: len(sentences) for lang, sentences in zero_shot_samples.items()})

In [None]:
# Initialize OpenAI client.
# No key is passed so that no secret is stored in the notebook: the client
# falls back to the OPENAI_API_KEY environment variable, which must be set
# before this cell runs.
client = openai.OpenAI()

In [None]:
def query_gpt(prompt, model="gpt-4o"):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Text placed in the only ``user`` message of the request.
        model: Model name passed to the chat completions call
            (defaults to "gpt-4o").

    Returns:
        The content of the first choice with surrounding whitespace removed,
        or an empty string when the response carries no text content.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    # `content` can be None, and None has no .strip(); treat it as an empty reply
    content = response.choices[0].message.content
    return (content or "").strip()

In [None]:
# Zero-Shot Evaluation: ask the model to name the language of each sample and
# keep one [true language, sentence, model answer] row per sentence.
zero_shot_results = []
for lang, test_sentences in zero_shot_samples.items():
    # extend() consumes the generator row by row, so rows gathered before a
    # failed request remain in the list
    zero_shot_results.extend(
        [
            lang,
            sentence,
            query_gpt(
                f"You are a linguistics expert who knows every single language that exists in this world. What language is this sentence in?\n\nSentence: {sentence}. Reply with only the language itself and nothing else."
            ),
        ]
        for sentence in test_sentences
    )

In [None]:
# Tabulate the collected rows: one row per evaluated sentence
zero_shot_df = pd.DataFrame(
    data=zero_shot_results,
    columns=["True Language", "Sentence", "Predicted Language"],
)

# Write the table to disk without the index column
zero_shot_df.to_csv("comanche_id_zero_shot.csv", index=False)

print("Experiment complete! Results saved.")

# Few-Shot Testing with Comanche and English

In [None]:
import random
import matplotlib.pyplot as plt
import pandas as pd
from collections import defaultdict
import openai

# Load your dataset
df = pd.read_csv("dataset")  # Replace with actual dataset name

# Group sentences by language (null entries are dropped per column)
language_data = defaultdict(list)
language_data["Comanche"] = df["Comanche"].dropna().tolist()
language_data["English"] = df["English"].dropna().tolist()

# Organize and shuffle the data
comanche_data = language_data["Comanche"]
eng_data = language_data["English"]

# Seed the generator so the few-shot / validation split is identical on every run
random_seed = 42
random.seed(random_seed)
random.shuffle(comanche_data)
random.shuffle(eng_data)

# Set few-shot and validation sample sizes
few_shot_size = 5
validation_size = 100
split_end = few_shot_size + validation_size  # 105

# The first `few_shot_size` shuffled sentences serve as few-shot examples; the
# next `validation_size` are held out for validation, so the two never overlap.
comanche_few_shot_samples = comanche_data[:few_shot_size]
comanche_validation_samples = comanche_data[few_shot_size:split_end]

eng_few_shot_samples = eng_data[:few_shot_size]
eng_validation_samples = eng_data[few_shot_size:split_end]

# Initialize OpenAI client.
# No key is passed so that no secret is stored in the notebook: the client
# falls back to the OPENAI_API_KEY environment variable, which must be set
# before this cell runs.
client = openai.OpenAI()

# Function to query GPT models
def query_gpt(prompt, model="gpt-4o"):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Text placed in the only ``user`` message of the request.
        model: Model name passed to the chat completions call
            (defaults to "gpt-4o").

    Returns:
        The content of the first choice with surrounding whitespace removed,
        or an empty string when the response carries no text content.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    # `content` can be None, and None has no .strip(); treat it as an empty reply
    content = response.choices[0].message.content
    return (content or "").strip()

# Results container: model name -> language -> {few-shot count: correct answers}.
# Each language gets its own counter whose missing keys read as 0.
model_name = "gpt-4o"
results = {
    model_name: {language: defaultdict(int) for language in ("Comanche", "English")}
}

def _build_few_shot_prompt(comanche_examples, eng_examples):
    """Return the labelled example block: Comanche examples first, then English.

    Each example is rendered as "Sentence: ...\\nLanguage: <name>". With no
    examples at all the result is an empty string.
    """
    comanche_block = "\n".join(
        f"Sentence: {sent}\nLanguage: Comanche" for sent in comanche_examples
    )
    eng_block = "\n".join(
        f"Sentence: {sent}\nLanguage: English" for sent in eng_examples
    )
    return "\n".join([comanche_block, eng_block]).strip()


def _count_correct(sentences, few_shot_prompt, expected_lang):
    """Count the sentences for which the model's answer equals ``expected_lang``.

    Args:
        sentences: Validation sentences to classify, one request each.
        few_shot_prompt: Example block placed before every question.
        expected_lang: Lower-case language name that counts as correct.
    """
    correct = 0
    for sent in sentences:
        prompt = (
            f"\n{few_shot_prompt}\n\n"
            f"Sentence: {sent}\n"
            "What language is this sentence in? Reply with only the language name.\n"
        )
        # Compare case-insensitively and ignore a trailing full stop, so that
        # an answer such as "Comanche." is not scored as a miss.
        predicted_lang = query_gpt(prompt).lower().strip().rstrip(".").strip()
        if predicted_lang == expected_lang:
            correct += 1
    return correct


# Iterate over number of few-shot examples per language
for n in range(few_shot_size + 1):  # 0 to few_shot_size

    # Prepare few-shot examples: the first n samples of each language
    few_shot_prompt = _build_few_shot_prompt(
        comanche_few_shot_samples[:n], eng_few_shot_samples[:n]
    )

    # Evaluate Comanche validation sentences
    correct_comanche = _count_correct(comanche_validation_samples, few_shot_prompt, "comanche")
    results[model_name]["Comanche"][n] = correct_comanche

    # Evaluate English validation sentences
    correct_english = _count_correct(eng_validation_samples, few_shot_prompt, "english")
    results[model_name]["English"][n] = correct_english

    print(f"n={n}: Comanche Correct={correct_comanche}, English Correct={correct_english}")

# Flatten the per-language counters into one record per few-shot count
scores_by_language = results[model_name]
results_list = [
    {
        "Few-Shot Count": shots,
        "Comanche Correct": scores_by_language["Comanche"][shots],
        "English Correct": scores_by_language["English"][shots],
    }
    for shots in range(few_shot_size + 1)
]

# Persist the table (index column omitted)
results_df = pd.DataFrame(results_list)
results_df.to_csv("comanche_classification_results.csv", index=False)
print("Results saved to CSV.")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the results file written by the few-shot evaluation cell
results_df = pd.read_csv("comanche_classification_results.csv")
shot_counts = results_df['Few-Shot Count']

# Set up plot (explicit figure/axes instead of the pyplot state machine)
fig, ax = plt.subplots(figsize=(10, 6))

# One line per language: (data column, legend label, colour, marker)
series_styles = [
    ('Comanche Correct', 'Comanche', '#0000FE', 'o'),
    ('English Correct', 'English', '#C80000', 's'),
]
for column, label, color, marker in series_styles:
    ax.plot(shot_counts,
            results_df[column],
            color=color,
            marker=marker,
            markersize=8,
            markeredgecolor='w',
            linewidth=2.5,
            label=label)

# Configure axes
ax.set_title('Effect of Few-Shot Examples on Comanche vs English Predictions', fontsize=14)
ax.set_xlabel('Number of Few-Shot Examples', fontsize=12)
ax.set_ylabel('Correct Predictions', fontsize=12)
ax.set_xticks(shot_counts)
# The evaluation cell holds out at most 100 sentences per language; 105 leaves headroom
ax.set_ylim(0, 105)
# Pad the x-range slightly so the first and last markers are not clipped
ax.set_xlim(shot_counts.min() - 0.1, shot_counts.max() + 0.1)

# Add light grid
ax.grid(True,
        linestyle='--',
        linewidth=0.7,
        alpha=0.4)

# Add legend
ax.legend(loc='lower right', frameon=True)

# Save and show
fig.tight_layout()
fig.savefig('clean_marker_plot.png', dpi=300, transparent=False)
plt.show()