In [6]:
import pandas as pd
import json

# --- Input 1: kanji-with-radicals CSV; only the two columns used later are read ---
kanji_radicals_df = pd.read_csv(
    'joyo_kanji_with_radicals.csv',
    usecols=['kanji', 'radical'],
)
print("kanji_radicals_df columns:", kanji_radicals_df.columns)

# --- Input 2: tab-separated frequency report without a header row ---
# Columns are labelled positionally with the names below; only 'kanji' and
# 'frequency' are kept, reordered so that the merge key comes first.
freq_report_columns = ['frequency', 'kanji', 'col3', 'col4', 'col5', 'col6']
kanji_freq_df = pd.read_csv(
    'kanji_freq_report.txt',
    sep='\t',
    header=None,
    names=freq_report_columns,
)[['kanji', 'frequency']]
print("kanji_freq_df columns:", kanji_freq_df.columns)

# Load the JSON file containing kanji with JLPT levels.
# The encoding is pinned to UTF-8 so the kanji decode the same way on every
# platform; without it open() falls back to the locale's default encoding.
with open('kanji_output.json', encoding='utf-8') as f:
    kanji_jlpt = json.load(f)

# Flatten the {level: [kanji, ...]} mapping into one record per kanji.
# Level keys go through int(), so a non-numeric key raises ValueError.
jlpt_data = [
    {'kanji': kanji, 'JLPT_level': int(level)}
    for level, kanjis in kanji_jlpt.items()
    for kanji in kanjis
]

# Naming the columns explicitly keeps 'kanji' and 'JLPT_level' present even
# when the JSON mapping is empty, so a later merge on 'kanji' still works.
kanji_jlpt_df = pd.DataFrame(jlpt_data, columns=['kanji', 'JLPT_level'])
print("kanji_jlpt_df columns:", kanji_jlpt_df.columns)

# Merge the dataframes on the 'kanji' column.
# Both joins are left joins, so every row of kanji_radicals_df is kept; kanji
# absent from the frequency or JLPT tables get missing values, not dropped.
merged_df = pd.merge(kanji_radicals_df, kanji_freq_df, on='kanji', how='left')
print("After first merge:", merged_df.columns)

merged_df = pd.merge(merged_df, kanji_jlpt_df, on='kanji', how='left')
print("After second merge:", merged_df.columns)

# When any kanji has no JLPT level, the left join upcasts the integer level
# column to float and the CSV would contain "5.0" instead of "5".  The
# nullable Int64 dtype keeps whole numbers while still allowing missing values
# (written to the CSV as empty fields).
merged_df['JLPT_level'] = merged_df['JLPT_level'].astype('Int64')

# Reorder the columns
merged_df = merged_df[['kanji', 'radical', 'frequency', 'JLPT_level']]

# Save the merged dataframe to a new CSV file (row index is not written)
merged_df.to_csv('combined_kanji_data.csv', index=False)


kanji_radicals_df columns: Index(['kanji', 'radical'], dtype='object')
kanji_freq_df columns: Index(['kanji', 'frequency'], dtype='object')
kanji_jlpt_df columns: Index(['kanji', 'JLPT_level'], dtype='object')
After first merge: Index(['kanji', 'radical', 'frequency'], dtype='object')
After second merge: Index(['kanji', 'radical', 'frequency', 'JLPT_level'], dtype='object')
