# Check N1 CSV, remove duplicates, remove guid column

In [1]:
import pandas as pd

In [2]:
# Load the raw N1 vocabulary deck. read_csv already returns a DataFrame, so
# wrapping it in pd.DataFrame(...) again adds nothing; an explicit copy keeps
# the raw load (n1_data) independent of the working frame (n1).
n1_data = pd.read_csv("n1.csv")
n1 = n1_data.copy()

n1.head()

Unnamed: 0,expression,reading,meaning,tags,guid
0,現像,げんぞう,developing (film),JLPT_1 JLPT,v9bha+<cj}
1,原則,げんそく,"principle, general rule",JLPT_1 JLPT,dVC[`t${#`
2,見地,けんち,point of view,JLPT_1 JLPT,wk)a!>hpi:
3,現地,げんち,"actual place, local",JLPT_1 JLPT,G$>SwAUY5:
4,限定,げんてい,"limit, restriction",JLPT_1 JLPT,o%F/8(![P3


In [4]:
# Summarize every column of the loaded frame (count, unique, top, freq).
n1.describe()

Unnamed: 0,expression,reading,meaning,tags,guid
count,2699,2699,2699,2699,2699
unique,2685,2626,2652,25,2699
top,早急,たえる,business,JLPT_1 JLPT,v9bha+<cj}
freq,2,4,2,2585,1


In [26]:
# Flag rows whose expression already appeared earlier in the frame
# (the first occurrence is not flagged), then count the flagged rows per column.
is_repeat = n1.duplicated(subset=["expression"])
n1.loc[is_repeat].count()

expression    14
reading       14
meaning       14
tags          14
guid          14
dtype: int64

In [25]:
# Inspect every row whose expression contains the kanji 主.
has_kanji = n1["expression"].str.contains('主')
n1.loc[has_kanji]

Unnamed: 0,expression,reading,meaning,tags,guid
477,主,ぬし,"owner, master, god",JLPT_1 JLPT Intermediate_Japanese_Ln.13 Interm...,f$iPQS#{]6
1089,君主,くんしゅ,"ruler, monarch",JLPT_1 JLPT,stPJuiua;x
1172,自主,じしゅ,"independence, autonomy",JLPT_1 JLPT,ARYI~qJT^l
1299,主導,しゅどう,main leadership,JLPT_1 JLPT,"d-6;%xC,V("
1300,主任,しゅにん,"person in charge, responsible official",JLPT_1 JLPT Intermediate_Japanese_Ln.8 Interme...,P`pi2j!/Zn
1955,地主,じぬし,landlord,JLPT_1 JLPT,P7lP?kfIYU
1982,主,しゅ,"owner, master, god",JLPT_1 JLPT,tZY2oe0@.s
2009,主演,しゅえん,"starring, playing the leading part",JLPT_1 JLPT,n]D<c|Caow
2010,主観,しゅかん,"subjectivity, subject, ego",JLPT_1 JLPT,lW5DbL!{A5
2016,主権,しゅけん,sovereignty,JLPT_1 JLPT,bc]8:XVC+v


In [27]:
# Keep only the first row for each distinct expression, then confirm the
# number of rows left in every column.
# NOTE(review): rows that share an expression but differ in reading
# (e.g. 主 read as ぬし and しゅ) collapse to a single row - confirm that is intended.
clean_n1 = n1.drop_duplicates(subset=['expression'], keep='first')
clean_n1.count()

expression    2685
reading       2685
meaning       2685
tags          2685
guid          2685
dtype: int64

In [28]:
# Show the column labels of the de-duplicated frame.
clean_n1.columns

Index(['expression', 'reading', 'meaning', 'tags', 'guid'], dtype='object')

In [29]:
# Keep every column except guid, then show the resulting column labels.
kept_columns = ['expression', 'reading', 'meaning', 'tags']
clean_n1 = clean_n1.loc[:, kept_columns]
clean_n1.columns

Index(['expression', 'reading', 'meaning', 'tags'], dtype='object')

In [30]:
# Preview the first rows of the cleaned frame.
clean_n1.head()

Unnamed: 0,expression,reading,meaning,tags
0,現像,げんぞう,developing (film),JLPT_1 JLPT
1,原則,げんそく,"principle, general rule",JLPT_1 JLPT
2,見地,けんち,point of view,JLPT_1 JLPT
3,現地,げんち,"actual place, local",JLPT_1 JLPT
4,限定,げんてい,"limit, restriction",JLPT_1 JLPT


In [32]:
# Replace the original tag strings with one level label for every row.
# assign() builds a new frame, which avoids setting a column through
# attribute access on a frame that was derived from a selection (that
# pattern can trigger SettingWithCopyWarning).
clean_n1 = clean_n1.assign(tags='n1')
clean_n1.head()

Unnamed: 0,expression,reading,meaning,tags
0,現像,げんぞう,developing (film),n1
1,原則,げんそく,"principle, general rule",n1
2,見地,けんち,point of view,n1
3,現地,げんち,"actual place, local",n1
4,限定,げんてい,"limit, restriction",n1


# n1 level check test
- randomly select 10 rows
- show kanji and ask for user input
- check if input is in meaning column
- score +1 if correct
- same for 9 more kanji
- gives user score


In [48]:
import random  # NOTE(review): not referenced in this cell; the sampling below goes through pandas

# choose 10 rows randomly
# No random_state is passed, so the selection is not reproducible across runs.
test = clean_n1.sample(n=10)
test

Unnamed: 0,expression,reading,meaning,tags
1548,装備,そうび,equipment,n1
1323,昇進,しょうしん,promotion,n1
1773,尽きる,つきる,"to be used up, to be run out",n1
1028,郷里,きょうり,"birth-place, home town",n1
391,副,とりわけ,"especially, above all",n1
761,解除,かいじょ,"cancellation, release, cancel",n1
933,効き目,ききめ,"effect, virtue, efficacy",n1
605,打ち込む,うちこむ,"to devote oneself to, to shoot into",n1
2539,余程,よほど,"very, much, to a large extent, quite",n1
1706,知的,ちてき,intellectual,n1


In [43]:
# Quiz, exact-match version: show each sampled expression, read an answer,
# and count it as correct when it appears inside the stored meaning text.
score = 0
for index, row in test.iterrows():
    answer = input(f"{row['expression']} (type 'exit' to quit): ")
    # Normalise once so stray spaces or capital letters do not affect the
    # exit check or the match.
    guess = answer.strip().lower()

    # Check if the user wants to exit
    if guess == 'exit':
        break
    # An empty string is a substring of every string, so a blank answer has
    # to be rejected explicitly instead of being scored as correct.
    if guess and guess in row['meaning'].lower():
        print("Correct!")
        score += 1
    else:
        print(f"{row['expression']} means {row['meaning']}")
# Report against the real number of sampled rows rather than a fixed 10.
print(f"You got {score}/{len(test)} correct")

開発 means development, exploitation
手元 means (money) on hand or at home, one's purse; usual skill
～なんか means in the least ~
導入 means introduction, bringing in, leading in
Correct!
Correct!
Correct!
復旧 means restoration, restitution, rehabilitation
Correct!
Correct!
5


# Using NLP and word similarity to allow flexibility from user answers

In [44]:
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

In [46]:
# Define a function to compute WordNet-based similarity
def wordnet_similarity(user_input, correct_answer):
    """Score how closely a typed answer matches a reference meaning via WordNet.

    Both strings are tokenized. For every token of ``user_input`` the best
    Wu-Palmer similarity against any token of ``correct_answer`` is taken
    (0 when either token has no synsets or no pair is comparable), and the
    per-token scores are averaged.

    Args:
        user_input: Answer typed by the user.
        correct_answer: Reference meaning text to compare against.

    Returns:
        The mean of the per-token best similarities, or 0 when
        ``user_input`` yields no tokens.
    """
    user_tokens = word_tokenize(user_input)
    answer_tokens = word_tokenize(correct_answer)

    # Look up the answer synsets once instead of once per user token, and
    # drop answer tokens that WordNet does not know.
    answer_synset_groups = [
        group
        for group in (wordnet.synsets(answer_word) for answer_word in answer_tokens)
        if group
    ]

    # One best score per user token.
    word_similarities = []

    for user_word in user_tokens:
        max_similarity = 0  # stays 0 when nothing comparable is found
        user_synsets = wordnet.synsets(user_word)

        for answer_synsets in answer_synset_groups:
            for s1 in user_synsets:
                for s2 in answer_synsets:
                    similarity = s1.wup_similarity(s2)
                    # wup_similarity can return None when no connecting path
                    # exists; comparing None with a number would raise
                    # TypeError, so such pairs are skipped.
                    if similarity is not None and similarity > max_similarity:
                        max_similarity = similarity

        word_similarities.append(max_similarity)

    # Average across all user tokens; no tokens means no evidence of a match.
    if not word_similarities:
        return 0
    return sum(word_similarities) / len(word_similarities)


In [52]:
# Quiz, similarity version: same flow as the exact-match quiz, but an answer
# is graded by its WordNet similarity to the stored meaning.
PASS_THRESHOLD = 0.8  # minimum average similarity counted as correct; adjust as needed
score = 0

for index, row in test.iterrows():
    # Trim surrounding whitespace so "exit " still quits and the echoed
    # answer is clean.
    answer = input(f"{row['expression']} (type 'exit' to quit): ").strip()

    # Check if the user wants to exit
    if answer.lower() == 'exit':
        break

    # Compute WordNet-based similarity between user input and correct answer
    similarity = wordnet_similarity(answer, row['meaning'])

    if similarity > PASS_THRESHOLD:
        print(f"{answer} is correct. {row['expression']} means {row['meaning']}")
        score += 1
    else:
        print(f"{answer} is incorrect. {row['expression']} means {row['meaning']}")
# Report against the real number of sampled rows rather than a fixed 10.
print(f"You got {score}/{len(test)} correct")

apparatus is correct. 装備 means equipment
promotion is correct. 昇進 means promotion
to run out of is incorrect. 尽きる means to be used up, to be run out
birthplace is correct. 郷里 means birth-place, home town
above all is correct. 副 means especially, above all
cancel is correct. 解除 means cancellation, release, cancel
effect is correct. 効き目 means effect, virtue, efficacy
idk is incorrect. 打ち込む means to devote oneself to, to shoot into
extra is incorrect. 余程 means very, much, to a large extent, quite
academic is incorrect. 知的 means intellectual
You got 6/10 correct
