In [None]:
# Emit a marker line so the output shows that the n-gram approach below is the one being run.
print("Using Ngram")

def extract_ngrams(text:str, n:int)-> list:
    """
    Return every contiguous character n-gram of ``text``, in order.

    The n-grams are overlapping substrings of length ``n`` taken with a
    sliding window of step 1, so a text of length ``L >= n`` yields
    ``L - n + 1`` n-grams. Duplicates are kept.

    Args:
        text: The input text as a string.
        n: The length of each n-gram; must be at least 1.

    Returns:
        A list of all n-grams in the text. The list is empty when the text
        is shorter than ``n``.

    Raises:
        ValueError: If ``n`` is smaller than 1. Without this check a zero or
            negative ``n`` would silently produce empty or arbitrary slices.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    # range() is empty when len(text) < n, which yields an empty list.
    return [text[start:start + n] for start in range(len(text) - n + 1)]


# Create a frequency table for each language.
# freq_tables maps each language label to a dict of {n-gram: relative frequency},
# where the frequencies of one language sum to 1 (or the dict is empty when the
# language has no text of at least n characters).
n = 3  # The length of the n-grams
freq_tables = {}
for lang in df['Language'].unique():
    texts = df.loc[df['Language'] == lang, 'Text']

    # First accumulate raw n-gram counts over ALL texts of this language.
    counts = {}
    for text in texts:
        for ngram in extract_ngrams(text, n):
            counts[ngram] = counts.get(ngram, 0) + 1

    # Normalize exactly once, after counting is finished. Normalizing inside
    # the per-text loop would mix raw counts from the next text with already
    # normalized frequencies and re-divide them repeatedly, so earlier texts
    # would shrink towards zero and the last text would dominate the table.
    total_ngrams = sum(counts.values())
    # An empty `counts` gives an empty table, so no division by zero occurs.
    freq_table = {ngram: count / total_ngrams for ngram, count in counts.items()}

    # Add the frequency table for the language to the dictionary
    freq_tables[lang] = freq_table


# Make predictions for each text in the dataset
import math

# Probability assigned to an n-gram that a language's table has never seen
# (or stores as 0.0). A plain product of probabilities has two problems:
#   * one unseen n-gram zeroes the whole score for that language, and
#   * multiplying hundreds of small numbers underflows to 0.0 for every
#     language, after which max() just returns the first language.
# Summing log-probabilities with a small floor avoids both.
UNSEEN_NGRAM_PROB = 1e-10

predictions = []
for text in df['Text']:
    # Extract all n-grams from the text
    ngrams = extract_ngrams(text, n)

    # Compute the log-likelihood of the text under each language's table
    log_probs = {}
    for lang, table in freq_tables.items():
        log_probs[lang] = sum(
            # max() also guards math.log against a stored frequency of 0.0
            math.log(max(table.get(ngram, 0.0), UNSEEN_NGRAM_PROB))
            for ngram in ngrams
        )

    # Choose the language with the highest log-likelihood as the prediction.
    # Ties (e.g. a text shorter than n, where every score is 0) resolve to the
    # first language in freq_tables.
    lang_pred = max(log_probs, key=log_probs.get)
    predictions.append(lang_pred)

# Add the predictions to the dataframe
df['Prediction'] = predictions
multiclass_report(df['Language'], df['Prediction'], df['Language'].unique())