<a href="https://colab.research.google.com/github/jessieicesk8s/CLPS0950FinalProject/blob/main/CLPS0950FinalProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Hello! Welcome to Jessica's Language Detection Tool!

Run the cell below, and you will be prompted to input some text. Write a few sentences' worth of text in one of the following languages: English, Spanish, French, or Brazilian Portuguese. The code will then reveal which of the four languages your text was written in!

In [None]:
#Import the counter tool to count frequency of trigrams
from collections import Counter

#One short reference sentence per supported language
#Each sentence is later turned into a ranked trigram profile that input text is compared against
samples = {
    "English":
        "This is a simple piece of English text to create a basic profile for language detection.",
    "Español/Spanish":
        "Este es un fragmento simple de texto en español para crear un perfil básico para la detección del idioma.",
    "Français/French":
        "Il s'agit d'un simple extrait de texte français permettant de créer un profil de base pour la détection de la langue.",
    "Português/Portuguese":
        "Este é um texto simples em português para criar um perfil básico para detecção de idioma.",
}

#Normalise the text, then slide a three-character window across it
def get_trigrams(input_text):
  """Return every overlapping three-character sequence in the text.

  The text is lowercased and its space characters are removed first, so
  trigrams can span word boundaries. Other characters (punctuation, tabs,
  digits) are kept as they are. Text shorter than three characters after
  normalisation yields an empty list.
  """
  squeezed = input_text.lower().replace(" ", "")
  #Zipping the string with itself shifted by one and by two pairs up each window of three characters
  return ["".join(window) for window in zip(squeezed, squeezed[1:], squeezed[2:])]

#Rank the trigrams of a text from most to least frequent
def build_profile(input_text, top_n=100):
  """Build a ranked trigram profile for the text.

  Returns a dict mapping each of the `top_n` most frequent trigrams to its
  rank, where rank 0 is the most frequent trigram. The ordering comes from
  `Counter.most_common`.
  """
  ranked = Counter(get_trigrams(input_text)).most_common(top_n)
  profile = {}
  for position, entry in enumerate(ranked):
    #Each entry is a (trigram, count) pair; only the trigram is kept
    profile[entry[0]] = position
  return profile

#Measure how far an input profile is from a language profile; smaller means a closer match
def compare_profiles(input_profile, lang_profile, penalty=200):
  """Return the total rank distance between two trigram profiles.

  For each trigram in `input_profile`: if it also appears in `lang_profile`,
  the absolute difference of the two ranks is added; otherwise `penalty` is
  added. An empty `input_profile` gives a distance of 0.
  """
  return sum(
    abs(position - lang_profile[gram]) if gram in lang_profile else penalty
    for gram, position in input_profile.items()
  )

#Score the input text against every language profile and pick the closest one
def detect_language(input_text, profiles):
  """Return the best-matching language and the score for every language.

  `profiles` maps a language name to its ranked trigram profile. The input
  text is turned into a profile with `build_profile` and compared to each
  language profile with `compare_profiles`; the language with the lowest
  score is the best match.

  Returns a tuple `(best_match, scores)`, where `scores` maps each language
  name to its distance. If several languages share the lowest score, the
  first one in `profiles` is returned.

  Raises ValueError if `profiles` is empty.
  """
  if not profiles:
    raise ValueError("profiles must contain at least one language profile")
  input_profile = build_profile(input_text)
  #Use the profiles passed in by the caller rather than a module-level global
  scores = {
    lang: compare_profiles(input_profile, profile)
    for lang, profile in profiles.items()
  }
  best_match = min(scores, key=scores.get)
  return best_match, scores

#Build one ranked trigram profile per language from the sample texts
language_profiles = {lang: build_profile(text) for lang, text in samples.items()}
#Instruct user to input text
input_text = input("Enter text: ")
#Text with fewer than three non-space characters has no trigrams; every language would then score 0 and the first language would be reported as a false match
if get_trigrams(input_text):
  #Analyze the text profiles and match up the closest scores
  detected, all_scores = detect_language(input_text, language_profiles)
  #Print the detected language for the user to see
  print(f"\nDetected Language: {detected}")
else:
  #Keep both names defined so later code that reads them does not hit a NameError
  detected, all_scores = None, {}
  print("\nPlease enter at least three non-space characters so a language can be detected.")