In [None]:
!pip install indic-nlp-library

In [12]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
from indicnlp.normalize.indic_normalize import DevanagariNormalizer

In [13]:
# read csv

column_names = ['pred', 'actual']
df = pd.read_csv('hindi_data.csv', header=None, names=column_names)
df.head()

Unnamed: 0,pred,actual
0,अनाथों,अनाथों
1,इज्ज्त,इज्ज्त
2,देखना,देखना
3,मृतका,मृतका
4,ऊर्वगामी,ऊर्ध्वगामी


In [14]:
# get wrong preds
wrong_df = df.loc[df['pred'] != df['actual']]
wrong_df.head()

Unnamed: 0,pred,actual
4,ऊर्वगामी,ऊर्ध्वगामी
8,उबालों,उबालें
12,पढ़ना,पढ़ना
27,भ्याराश्ताचार,भ्याराश्ताचारियो
31,निष्प्रण,निमन्त्रण


In [15]:
## Get unicode-escape representation

# Convert pred and actual columns to unicode escape characters
wrong_df['pred_esc'] = wrong_df['pred'].apply(lambda x: x.encode('unicode_escape').decode('utf-8'))
wrong_df['actual_esc'] = wrong_df['actual'].apply(lambda x: x.encode('unicode_escape').decode('utf-8'))

# Display the new DataFrame
wrong_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wrong_df['pred_esc'] = wrong_df['pred'].apply(lambda x: x.encode('unicode_escape').decode('utf-8'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wrong_df['actual_esc'] = wrong_df['actual'].apply(lambda x: x.encode('unicode_escape').decode('utf-8'))


Unnamed: 0,pred,actual,pred_esc,actual_esc
4,ऊर्वगामी,ऊर्ध्वगामी,\u090a\u0930\u094d\u0935\u0917\u093e\u092e\u0940,\u090a\u0930\u094d\u0927\u094d\u0935\u0917\u093e\u092e\u0940
8,उबालों,उबालें,\u0909\u092c\u093e\u0932\u094b\u0902,\u0909\u092c\u093e\u0932\u0947\u0902
12,पढ़ना,पढ़ना,\u092a\u0922\u093c\u0928\u093e,\u092a\u095d\u0928\u093e
27,भ्याराश्ताचार,भ्याराश्ताचारियो,\u092d\u094d\u092f\u093e\u0930\u093e\u0936\u094d\u0924\u093e\u091a\u093e\u0930,\u092d\u094d\u092f\u093e\u0930\u093e\u0936\u094d\u0924\u093e\u091a\u093e\u0930\u093f\u092f\u094b
31,निष्प्रण,निमन्त्रण,\u0928\u093f\u0937\u094d\u092a\u094d\u0930\u0923,\u0928\u093f\u092e\u0928\u094d\u0924\u094d\u0930\u0923


In [16]:
## testing

normalizer = DevanagariNormalizer(lang="hi", remove_nuktas=True)

sample_text_literal = wrong_df['pred_esc'].iloc[2]

# Convert the string literal to Unicode characters
sample_text = sample_text_literal.encode('utf-8').decode('unicode-escape')

print(f"Before normalization: {sample_text}")
print(f"After normalization: {normalizer.normalize(sample_text)}")

Before normalization: पढ़ना
After normalization: पढना


In [17]:
# Normalize
normalizer = DevanagariNormalizer(lang="hi", remove_nuktas=True)

wrong_df['pred_esc_normalized'] = wrong_df['pred_esc'].apply(lambda x: x.encode('utf-8').decode('unicode-escape'))
wrong_df['actual_esc_normalized'] = wrong_df['actual_esc'].apply(lambda x: x.encode('utf-8').decode('unicode-escape'))

# Now, apply the normalizer to the normalized columns
wrong_df['pred_esc_normalized'] = wrong_df['pred_esc_normalized'].apply(lambda x: normalizer.normalize(x))
wrong_df['actual_esc_normalized'] = wrong_df['actual_esc_normalized'].apply(lambda x: normalizer.normalize(x))


# Create a new column for the length
wrong_df['pred_esc_length'] = wrong_df['pred_esc'].apply(lambda x: len(x))
wrong_df['actual_esc_length'] = wrong_df['actual_esc'].apply(lambda x: len(x))
wrong_df['pred_esc_normalized_length'] = wrong_df['pred_esc_normalized'].apply(lambda x: len(x))
wrong_df['actual_esc_normalized_length'] = wrong_df['actual_esc_normalized'].apply(lambda x: len(x))

# Display the updated DataFrame
wrong_df.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wrong_df['pred_esc_normalized'] = wrong_df['pred_esc'].apply(lambda x: x.encode('utf-8').decode('unicode-escape'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wrong_df['actual_esc_normalized'] = wrong_df['actual_esc'].apply(lambda x: x.encode('utf-8').decode('unicode-escape'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#retu

Unnamed: 0,pred,actual,pred_esc,actual_esc,pred_esc_normalized,actual_esc_normalized,pred_esc_length,actual_esc_length,pred_esc_normalized_length,actual_esc_normalized_length
4,ऊर्वगामी,ऊर्ध्वगामी,\u090a\u0930\u094d\u0935\u0917\u093e\u092e\u0940,\u090a\u0930\u094d\u0927\u094d\u0935\u0917\u093e\u092e\u0940,ऊर्वगामी,ऊर्ध्वगामी,48,60,8,10
8,उबालों,उबालें,\u0909\u092c\u093e\u0932\u094b\u0902,\u0909\u092c\u093e\u0932\u0947\u0902,उबालों,उबालें,36,36,6,6
12,पढ़ना,पढ़ना,\u092a\u0922\u093c\u0928\u093e,\u092a\u095d\u0928\u093e,पढना,पढना,30,24,4,4
27,भ्याराश्ताचार,भ्याराश्ताचारियो,\u092d\u094d\u092f\u093e\u0930\u093e\u0936\u094d\u0924\u093e\u091a\u093e\u0930,\u092d\u094d\u092f\u093e\u0930\u093e\u0936\u094d\u0924\u093e\u091a\u093e\u0930\u093f\u092f\u094b,भ्याराश्ताचार,भ्याराश्ताचारियो,78,96,13,16
31,निष्प्रण,निमन्त्रण,\u0928\u093f\u0937\u094d\u092a\u094d\u0930\u0923,\u0928\u093f\u092e\u0928\u094d\u0924\u094d\u0930\u0923,निष्प्रण,निमन्त्रण,48,54,8,9
33,అర్ధ,अर्ध,\u0c05\u0c30\u0c4d\u0c27,\u0905\u0930\u094d\u0927,అర్ధ,अर्ध,24,24,4,4
35,एकसक्लूसिव,एक्सक्लूसिव,\u090f\u0915\u0938\u0915\u094d\u0932\u0942\u0938\u093f\u0935,\u090f\u0915\u094d\u0938\u0915\u094d\u0932\u0942\u0938\u093f\u0935,एकसक्लूसिव,एक्सक्लूसिव,60,66,10,11
40,वीज्ञान,विज्ञान,\u0935\u0940\u091c\u094d\u091e\u093e\u0928,\u0935\u093f\u091c\u094d\u091e\u093e\u0928,वीज्ञान,विज्ञान,42,42,7,7
43,घडियाल,घडि़याल,\u0918\u0921\u093f\u092f\u093e\u0932,\u0918\u0921\u093f\u093c\u092f\u093e\u0932,घडियाल,घडियाल,36,42,6,6
47,पदेगा।,पढ़ेगा।,\u092a\u0926\u0947\u0917\u093e\u0964,\u092a\u095d\u0947\u0917\u093e\u0964,पदेगा।,पढेगा।,36,36,6,6


In [18]:
# Filter rows where 'pred_esc_normalized' is equal to 'actual_esc_normalized'
filtered_df = wrong_df[wrong_df['pred_esc_normalized'] == wrong_df['actual_esc_normalized']]

In [19]:
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 231 entries, 12 to 12787
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   pred                          231 non-null    object
 1   actual                        231 non-null    object
 2   pred_esc                      231 non-null    object
 3   actual_esc                    231 non-null    object
 4   pred_esc_normalized           231 non-null    object
 5   actual_esc_normalized         231 non-null    object
 6   pred_esc_length               231 non-null    int64 
 7   actual_esc_length             231 non-null    int64 
 8   pred_esc_normalized_length    231 non-null    int64 
 9   actual_esc_normalized_length  231 non-null    int64 
dtypes: int64(4), object(6)
memory usage: 19.9+ KB


In [None]:
# from google.colab import files

# # Save the DataFrame to a CSV file
# wrong_df.to_csv('wrong_df_hindi.csv', index=False)

# # Download the CSV file
# files.download('wrong_df_hindi.csv')