In [1]:
import pandas as pd

train_df = pd.read_csv('polish_czech_train.csv')
test_df = pd.read_csv('polish_czech_test.csv')


def ngrams(df, n):
    ngram_counts = {'pol': {}, 'ces': {}}
    for i, data in df.iterrows():
        lan = data['lan_code']
        sentence = (data['sentence'])
        for i in range(len(sentence) - n + 1):
            ngram = sentence[i:i + n]

            if ngram in ngram_counts[lan]:
                ngram_counts[lan][ngram] += 1

            else:
                ngram_counts[lan][ngram] = 1
    return ngram_counts['pol'], ngram_counts['ces']


In [2]:

def classify(df, n, ngrams_pol, ngrams_ces):
    df['predicted_lang'] = ['not-predicted' for i in range(0, len(df['lan_code']))]
    df['pol_count'] = ['not-predicted' for i in range(0, len(df['lan_code']))]
    df['ces_count'] = ['not-predicted' for i in range(0, len(df['lan_code']))]

    for index, data in df.iterrows():
        lan = data['lan_code']
        sentence = (data['sentence'])
        count_pol = 0
        zero_ces = 0
        zero_pol = 0

        count_ces = 0
        for i in range(len(sentence) - n + 1):
            ngram = sentence[i:i + n]
            if ngram not in ngrams_pol:
                count_ces += 1
            elif ngram not in ngrams_ces:
                count_pol += 1
            elif ngrams_pol[ngram] < ngrams_ces[ngram]:
                count_ces += 1
            else:
                count_pol += 1
        if count_pol >= count_ces:
            df.at[index, 'predicted_lang'] = 'pol'
            df.at[index, 'pol_count'] = count_pol
            df.at[index, 'ces_count'] = count_ces

        else:
            df.at[index, 'predicted_lang'] = 'ces'
            df.at[index, 'pol_count'] = count_pol
            df.at[index, 'ces_count'] = count_ces


    return df


In [3]:
import json
polish_ngrams, czech_ngrams = [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]
for i in range(1, 5):
    polish_ngrams[i], czech_ngrams[i] = ngrams(train_df, i)


# Convert the dictionary to a JSON string
    ngramsp = json.dumps(polish_ngrams[i])
    ngramsc = json.dumps(czech_ngrams[i])


    # You can also save the JSON to a file
    with open(f'polish_{i}grams.json', 'w') as json_file:
        json.dump(ngramsp, json_file)
    with open(f'czech_{i}grams.json', 'w') as json_file:
        json.dump(ngramsc, json_file)


In [4]:

for i in range(1, 5):
    classified_df = classify(test_df, i, polish_ngrams[i], czech_ngrams[i])
    classified_df.to_csv(f'classified_without_normalisation_using_{i}_gram.csv', index=False)

In [5]:
count_pol = (train_df['lan_code'] == 'pol').sum()
count_ces = (train_df['lan_code'] == 'ces').sum()
print(count_ces, count_pol)

50932 93437


In [6]:
for i in range(1,5):
    total_sump = sum(polish_ngrams[i].values())
    total_sumc= sum(czech_ngrams[i].values())

    print("The total ngrams is:", total_sump, total_sumc)


The total ngrams is: 3084568 1430566
The total ngrams is: 2991131 1379634
The total ngrams is: 2897694 1328702
The total ngrams is: 2804257 1277770


In [7]:
polish_ngrams_normalised = [0, 0, 0, 0, 0]
for i in range(1,5):
    polish_ngrams_normalised[i] = polish_ngrams[i].copy()
    sum_dict1 = sum(polish_ngrams[i].values())
    sum_dict2 = sum(czech_ngrams[i].values())

    scaling_factor = sum_dict1 / sum_dict2

    for key in polish_ngrams[i]:
        polish_ngrams_normalised[i][key] /= scaling_factor



In [8]:


for i in range(1, 5):
    classified_df = classify(test_df, i, polish_ngrams_normalised[i], czech_ngrams[i])
    classified_df.to_csv(f'classified_using_{i}_gram.csv', index=False)
    

In [9]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score
result = [0,0,0,0,0]
precision_scores = [0,0,0,0,0]
for i in range(1,5):
    classified_df = pd.read_csv(f'classified_without_normalisation_using_{i}_gram.csv')

    true = (list(classified_df['lan_code']))
    pred = (list(classified_df['predicted_lang']))


    print(f"{i}gram &",precision_score(true, pred, average='weighted'), end=" & ")
    print(recall_score(true, pred, average='weighted'),end=" \\\\ \n")
    


1gram & 0.7700569739789548 & 0.6415094339622641 \\ 
2gram & 0.7998873629680612 & 0.7091125703044912 \\ 
3gram & 0.9474497866365013 & 0.9427866899398776 \\ 
4gram & 0.987616706384878 & 0.9874213836477987 \\ 


In [10]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score
result = [0,0,0,0,0]
precision_scores = [0,0,0,0,0]
precision_pol = [0,0,0,0,0]
recall_pol = [0,0,0,0,0]
precision_ces = [0,0,0,0,0]
recall_ces = [0,0,0,0,0]
for i in range(1,5):
    classified_df = pd.read_csv(f'classified_using_{i}_gram.csv')

    true = (list(classified_df['lan_code']))
    pred = (list(classified_df['predicted_lang']))
    # TP = len(classified_df[(classified_df['lan_code'] == 'pol') & (classified_df['predicted_lang'] == 'pol')])
    # TN = len(classified_df[(classified_df['lan_code'] != 'pol') & (classified_df['predicted_lang'] != 'pol')])
    # FP = len(classified_df[(classified_df['lan_code'] != 'pol') & (classified_df['predicted_lang'] == 'pol')])
    # FN = len(classified_df[(classified_df['lan_code'] == 'pol') & (classified_df['predicted_lang'] != 'pol')])

    # # Calculate precision and recall
    # precision_pol[i] = TP / (TP + FP) 
    # recall_pol[i] = TP / (TP + FN) 

    # print(TP,FP, TN,FN, precision_pol[i], recall_pol[i])
    # TP = len(classified_df[(classified_df['lan_code'] == 'ces') & (classified_df['predicted_lang'] == 'ces')])
    # TN = len(classified_df[(classified_df['lan_code'] != 'ces') & (classified_df['predicted_lang'] != 'ces')])
    # FP = len(classified_df[(classified_df['lan_code'] != 'ces') & (classified_df['predicted_lang'] == 'ces')])
    # FN = len(classified_df[(classified_df['lan_code'] == 'ces') & (classified_df['predicted_lang'] != 'ces')])

    # precision_ces[i] = TP / (TP + FP) 
    # recall_ces[i] = TP / (TP + FN) 
    # print((TP+TN)/(TP+TN+FP+FN))
    print(f"{i}gram &",precision_score(true, pred, average='weighted'), end=" & ")
    print(recall_score(true, pred, average='weighted'),end=" \\\\ \n")

    result[i] = classified_df[(classified_df['lan_code'] != classified_df['predicted_lang'])]
    # compare(result[i],i, polish_ngrams_normalised[i], czech_ngrams[i]
    # print(result[i]['sentence'])


1gram & 0.7959109657652166 & 0.5372509904967723 \\ 
2gram & 0.9794780573595491 & 0.9793865846563046 \\ 
3gram & 0.9925201476894107 & 0.9925193250768847 \\ 
4gram & 0.996046220186964 & 0.9960380129110908 \\ 


In [11]:

def classify_zero(df, n, ngrams_pol, ngrams_ces):
    df['predicted_lang'] = ['not-predicted' for i in range(0, len(df['lan_code']))]
    df['pol_count'] = ['not-predicted' for i in range(0, len(df['lan_code']))]
    df['ces_count'] = ['not-predicted' for i in range(0, len(df['lan_code']))]

    for index, data in df.iterrows():
        lan = data['lan_code']
        sentence = (data['sentence'])
        count_pol = 0
        zero_ces = 0
        zero_pol = 0

        count_ces = 0
        for i in range(len(sentence) - n + 1):
            ngram = sentence[i:i + n]
            if ngram not in ngrams_pol:
                zero_ces+=1
                count_ces += 1
            elif ngram not in ngrams_ces:
                zero_pol+=1
                count_pol += 1
            elif ngrams_pol[ngram] < ngrams_ces[ngram]:
                count_ces += 1
            else:
                count_pol += 1
        if zero_ces > 0:
            df.at[index, 'predicted_lang'] = 'ces'
            df.at[index, 'pol_count'] = count_pol
            df.at[index, 'ces_count'] = count_ces

        elif zero_pol > 0:
            
            df.at[index, 'predicted_lang'] = 'pol'
            df.at[index, 'pol_count'] = count_pol
            df.at[index, 'ces_count'] = count_ces
        elif count_pol >= count_ces:
            df.at[index, 'predicted_lang'] = 'pol'
            df.at[index, 'pol_count'] = count_pol
            df.at[index, 'ces_count'] = count_ces

        else:
            df.at[index, 'predicted_lang'] = 'ces'
            df.at[index, 'pol_count'] = count_pol
            df.at[index, 'ces_count'] = count_ces


    return df


In [12]:


for i in range(1, 5):
    classified_df = classify_zero(test_df, i, polish_ngrams_normalised[i], czech_ngrams[i])
    classified_df.to_csv(f'classified_using_{i}_gram_zero_handled.csv', index=False)
    

In [13]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score
result = [0,0,0,0,0]
precision_scores = [0,0,0,0,0]
for i in range(1,5):
    classified_df = pd.read_csv(f'classified_using_{i}_gram_zero_handled.csv')

    true = (list(classified_df['lan_code']))
    pred = (list(classified_df['predicted_lang']))


    print(f"{i}gram &",precision_score(true, pred, average='weighted'), end=" & ")
    print(recall_score(true, pred, average='weighted'),end=" \\\\ \n")



1gram & 0.9354931908409589 & 0.92170226913806 \\ 
2gram & 0.9888574712390337 & 0.9887512814119082 \\ 
3gram & 0.9801764721324643 & 0.9792757598426287 \\ 
4gram & 0.9362192278119122 & 0.922699692461142 \\ 
