In [1]:
# analysis.ipynb cell (Final version)
import pandas as pd
import sys
import os

# Expose the parent of the current working directory on sys.path so the
# `lib` package imported further down in this cell can be resolved.
module_path = os.path.abspath('..')
already_on_path = module_path in sys.path
if not already_on_path:
    sys.path.append(module_path)

from lib.data_loader import populate_database
from lib.analysis import find_all_false_friends
from lib.config import DB_CONFIG

# Raise pandas' display limits (row count, per-column width, total width)
# for the tables rendered later in this notebook.
display_settings = {
    'display.max_rows': 200,
    'display.max_colwidth': 200,
    'display.width': 1000,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

print("Setup complete. Ready to load data.")




Setup complete. Ready to load data.


In [2]:
# Run the project's loader; it prints its own per-language progress.
populate_database()

# Report the configured database name that now holds the data.
db_name = DB_CONFIG['dbname']
print(f"\nDatabase population complete. Data is stored in '{db_name}'.")


Initializing database 'lang_db'...
Database initialized successfully.

--- Processing: Japanese ---
File 'JMdict_e' already exists, skipping download...
Inserted 1832 entries for Japanese in 4.17 seconds.

--- Processing: Chinese ---
File 'cedict_1_0_ts_utf-8_mdbg.txt' already exists, skipping download...
Inserted 1010 entries for Chinese in 1.30 seconds.

Database population complete. Data is stored in 'lang_db'.


In [3]:
# --- Analysis parameters (edit these to change the comparison) ---
BASE_LANGUAGE = 'Japanese'
TARGET_LANGUAGE = 'Chinese'
SIMILARITY_THRESHOLD = 0.1  # Lower value = more results
# NOTE(review): the saved output lists rows with Similarity close to 0.98 at
# this threshold -- confirm how find_all_false_friends applies the value.

# Ask the analysis helper for candidate false friends between the two languages.
analysis_params = {
    'base_language': BASE_LANGUAGE,
    'target_language': TARGET_LANGUAGE,
    'similarity_threshold': SIMILARITY_THRESHOLD,
}
results_df = find_all_false_friends(**analysis_params)

# Render the candidates ordered by ascending similarity, or say so when
# nothing came back.
if results_df.empty:
    print("No results found for the given parameters.")
else:
    # Fixed column order: the word, its score, then one definition per language.
    display_columns = [
        'Word',
        'Similarity',
        f'{BASE_LANGUAGE} Definition',
        f'{TARGET_LANGUAGE} Definition',
    ]
    ranked_results = results_df[display_columns].sort_values(by='Similarity')
    display(ranked_results)



Finding false friends between 'Japanese' and 'Chinese'...
Found 14364 common words to analyze.
Calculating semantic similarity using spaCy (this may take a moment)...


  df = pd.read_sql_query(query, conn, params=params)
  return doc1.similarity(doc2)


Analysis complete. Found 13575 potential false friends.


Unnamed: 0,Word,Similarity,Japanese Definition,Chinese Definition
5715,刀子,-0.226191,small knife that is largely ornamental or used to cut open letters and the like,knife; CL:把[ba3]
4052,疑似,-0.218098,"pseudo; quasi; false; para-; mock; sham; suspected (case, e.g. of disease)",to be suspected to be
10578,唯,-0.149372,ordinary; common; usual; free of charge; unaffected; as is; safe; only; merely; just; simply; but; however; nevertheless,yes
11364,斑斑,-0.133899,irresistibly; suddenly; to be turned on; to be horny,full of stains or spots
13391,色子,-0.123282,kabuki actor who is also a male prostitute,dice (used in gambling)
...,...,...,...,...
11230,世,0.978963,world; society; public; life; lifetime; age; era; period; epoch; generation; reign; rule; the times; world (of existence),life; age; generation; era; world; lifetime; epoch; descendant; noble
975,公国,0.979886,dukedom; duchy; principality,duchy; dukedom; principality
7310,水利部,0.980147,Ministry of Water Resources (China),Ministry of Water Resources (PRC)
7747,各国,0.981416,each country; every country; various countries; all countries,each country; every country; various countries
