In [1]:
import pandas as pd

train_df = pd.read_csv('spanish_portugese_train.csv')
test_df = pd.read_csv('spanish_portugese_test.csv')


def ngrams(df, n):
    ngram_counts = {'spa': {}, 'por': {}}
    for i, data in df.iterrows():
        lan = data['lan_code']
        sentence = (data['sentence'])
        for i in range(len(sentence) - n + 1):
            ngram = sentence[i:i + n]

            if ngram in ngram_counts[lan]:
                ngram_counts[lan][ngram] += 1

            else:
                ngram_counts[lan][ngram] = 1
    return ngram_counts['spa'], ngram_counts['por']


In [2]:

def classify(df, n, ngrams_spa, ngrams_por):
    df['predicted_lang'] = ['not-predicted' for i in range(0, len(df['lan_code']))]
    df['spa_count'] = ['not-predicted' for i in range(0, len(df['lan_code']))]
    df['por_count'] = ['not-predicted' for i in range(0, len(df['lan_code']))]

    for index, data in df.iterrows():
        lan = data['lan_code']
        sentence = (data['sentence'])
        count_spa = 0

        count_por = 0
        for i in range(len(sentence) - n + 1):
            ngram = sentence[i:i + n]
            if ngram not in ngrams_spa:
                count_por += 1
            elif ngram not in ngrams_por:
                count_spa += 1
            elif ngrams_spa[ngram] < ngrams_por[ngram]:
                count_por += 1
            else:
                count_spa += 1
        if count_spa >= count_por:
            df.at[index, 'predicted_lang'] = 'spa'
            df.at[index, 'spa_count'] = count_spa
            df.at[index, 'por_count'] = count_por

        else:
            df.at[index, 'predicted_lang'] = 'por'
            df.at[index, 'spa_count'] = count_spa
            df.at[index, 'por_count'] = count_por


    return df


In [3]:
import json
spaish_ngrams, czech_ngrams = [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]
for i in range(1, 5):
    spaish_ngrams[i], czech_ngrams[i] = ngrams(train_df, i)


    ngramsp = json.dumps(spaish_ngrams[i])
    ngramsc = json.dumps(czech_ngrams[i])


    with open(f'spaish_{i}grams.json', 'w') as json_file:
        json.dump(ngramsp, json_file)
    with open(f'czech_{i}grams.json', 'w') as json_file:
        json.dump(ngramsc, json_file)


In [4]:

for i in range(1, 5):
    with open(f'spaish_{i}grams.json', 'r') as json_file:
        spaish_ngramss = json.load(ngramsp, json_file)
    with open(f'czech_{i}grams.json', 'r') as json_file:
        czech_ngramss = json.load(ngramsc, json_file)
    spclassified_df = classify(test_df, i, spaish_ngrams[i], czech_ngrams[i])
    # spclassified_df.to_csv(f'spclassified_without_normalisation_using_{i}_gram.csv', index=False)

In [5]:
count_spa = (train_df['lan_code'] == 'spa').sum()
count_por = (train_df['lan_code'] == 'por').sum()
print(count_por, count_spa)

318760 295536


In [6]:
for i in range(1,5):
    total_sump = sum(spaish_ngrams[i].values())
    total_sumc= sum(czech_ngrams[i].values())

    print("The total ngrams is:", total_sump, total_sumc)


The total ngrams is: 11789589 12344487
The total ngrams is: 11494053 12025727
The total ngrams is: 11198517 11706968
The total ngrams is: 10902981 11388212


In [7]:
spaish_ngrams_normalised = [0, 0, 0, 0, 0]
for i in range(1,5):
    spaish_ngrams_normalised[i] = spaish_ngrams[i].copy()
    sum_dict1 = sum(spaish_ngrams[i].values())
    sum_dict2 = sum(czech_ngrams[i].values())

    scaling_factor = sum_dict1 / sum_dict2

    for key in spaish_ngrams[i]:
        spaish_ngrams_normalised[i][key] /= scaling_factor



In [8]:


for i in range(1, 5):
    spclassified_df = classify(test_df, i, spaish_ngrams_normalised[i], czech_ngrams[i])
    spclassified_df.to_csv(f'spclassified_using_{i}_gram.csv', index=False)
    

In [9]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score
result = [0,0,0,0,0]
precision_scores = [0,0,0,0,0]
for i in range(1,5):
    spclassified_df = pd.read_csv(f'spclassified_without_normalisation_using_{i}_gram.csv')

    true = (list(spclassified_df['lan_code']))
    pred = (list(spclassified_df['predicted_lang']))


   
    print(f"{i}-gram &",precision_score(true, pred, average='weighted'), end=" & ")
    print(recall_score(true, pred, average='weighted'),end=" \\\\ \n")


1-gram & 0.7005732637213983 & 0.5213187128029484 \\ 
2-gram & 0.876277375334656 & 0.8761118418482295 \\ 
3-gram & 0.9511803643601784 & 0.9509943089325016 \\ 
4-gram & 0.9761608256581925 & 0.9761027257218019 \\ 


In [10]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score
result = [0,0,0,0,0]
precision_scores = [0,0,0,0,0]
precision_spa = [0,0,0,0,0]
recall_spa = [0,0,0,0,0]
precision_por = [0,0,0,0,0]
recall_por = [0,0,0,0,0]
for i in range(1,5):
    spclassified_df = pd.read_csv(f'spclassified_using_{i}_gram.csv')

    true = (list(spclassified_df['lan_code']))
    pred = (list(spclassified_df['predicted_lang']))
    # TP = len(spclassified_df[(spclassified_df['lan_code'] == 'spa') & (spclassified_df['predicted_lang'] == 'spa')])
    # TN = len(spclassified_df[(spclassified_df['lan_code'] != 'spa') & (spclassified_df['predicted_lang'] != 'spa')])
    # FP = len(spclassified_df[(spclassified_df['lan_code'] != 'spa') & (spclassified_df['predicted_lang'] == 'spa')])
    # FN = len(spclassified_df[(spclassified_df['lan_code'] == 'spa') & (spclassified_df['predicted_lang'] != 'spa')])

    # # Calculate precision and recall
    # precision_spa[i] = TP / (TP + FP) 
    # recall_spa[i] = TP / (TP + FN) 

    # print(TP,FP, TN,FN, precision_spa[i], recall_spa[i])
    # TP = len(spclassified_df[(spclassified_df['lan_code'] == 'por') & (spclassified_df['predicted_lang'] == 'por')])
    # TN = len(spclassified_df[(spclassified_df['lan_code'] != 'por') & (spclassified_df['predicted_lang'] != 'por')])
    # FP = len(spclassified_df[(spclassified_df['lan_code'] != 'por') & (spclassified_df['predicted_lang'] == 'por')])
    # FN = len(spclassified_df[(spclassified_df['lan_code'] == 'por') & (spclassified_df['predicted_lang'] != 'por')])

    # precision_por[i] = TP / (TP + FP) 
    # recall_por[i] = TP / (TP + FN) 
    # print((TP+TN)/(TP+TN+FP+FN))
    print(f"{i}-gram &",precision_score(true, pred, average='weighted'), end=" & ")
    print(recall_score(true, pred, average='weighted'),end=" \\\\ \n")

    # result[i] = spclassified_df[(spclassified_df['lan_code'] != spclassified_df['predicted_lang'])]
    # compare(result[i],i, spaish_ngrams_normalised[i], czech_ngrams[i]


1-gram & 0.6988153199542558 & 0.6986924870095198 \\ 
2-gram & 0.8721797790770908 & 0.869919387396304 \\ 
3-gram & 0.9468517655450945 & 0.9460325315483089 \\ 
4-gram & 0.9748565177203427 & 0.9748199565030539 \\ 


In [11]:

def classify_zero(df, n, ngrams_spa, ngrams_por):
    df['predicted_lang'] = ['not-predicted' for i in range(0, len(df['lan_code']))]
    df['spa_count'] = ['not-predicted' for i in range(0, len(df['lan_code']))]
    df['por_count'] = ['not-predicted' for i in range(0, len(df['lan_code']))]

    for index, data in df.iterrows():
        lan = data['lan_code']
        sentence = (data['sentence'])
        count_spa = 0
        zero_por = 0
        zero_spa = 0

        count_por = 0
        for i in range(len(sentence) - n + 1):
            ngram = sentence[i:i + n]
            if ngram not in ngrams_spa:
                zero_por+=1
                count_por += 1
            elif ngram not in ngrams_por:
                zero_spa+=1
                count_spa += 1
            elif ngrams_spa[ngram] < ngrams_por[ngram]:
                count_por += 1
            else:
                count_spa += 1
        if zero_por > 0:
            df.at[index, 'predicted_lang'] = 'por'
            df.at[index, 'spa_count'] = count_spa
            df.at[index, 'por_count'] = count_por

        elif zero_spa > 0:
            
            df.at[index, 'predicted_lang'] = 'spa'
            df.at[index, 'spa_count'] = count_spa
            df.at[index, 'por_count'] = count_por
        elif count_spa >= count_por:
            df.at[index, 'predicted_lang'] = 'spa'
            df.at[index, 'spa_count'] = count_spa
            df.at[index, 'por_count'] = count_por

        else:
            df.at[index, 'predicted_lang'] = 'por'
            df.at[index, 'spa_count'] = count_spa
            df.at[index, 'por_count'] = count_por


    return df


In [12]:


for i in range(1, 5):
    spclassified_df = classify_zero(test_df, i, spaish_ngrams_normalised[i], czech_ngrams[i])
    spclassified_df.to_csv(f'spclassified_using_{i}_gram_zero_handled.csv', index=False)
    

In [13]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score
result = [0,0,0,0,0]
precision_scores = [0,0,0,0,0]
for i in range(1,5):
    spclassified_df = pd.read_csv(f'spclassified_using_{i}_gram_zero_handled.csv')

    true = (list(spclassified_df['lan_code']))
    pred = (list(spclassified_df['predicted_lang']))


    print(f"{i}-gram &",precision_score(true, pred, average='weighted'), end=" & ")
    print(recall_score(true, pred, average='weighted'),end=" \\\\ \n")


1-gram & 0.6987838597721998 & 0.6986599294151353 \\ 
2-gram & 0.8931312694687771 & 0.8916418143696199 \\ 
3-gram & 0.9653095864938794 & 0.9653066274239129 \\ 
4-gram & 0.9732172228147676 & 0.9724041829997265 \\ 
