In [1]:
# analysis.ipynb cell (Final version)
# Notebook bootstrap: make the project package importable, pull in the
# pipeline entry points, and widen pandas' display limits.
import os
import sys

import pandas as pd

# The parent of the current working directory is treated as the project root;
# it has to be on sys.path before the `lib` imports below can resolve.
module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.append(module_path)

# Project-local imports (must come after the sys.path adjustment above).
from lib.analysis import find_all_false_friends
from lib.config import DB_CONFIG
from lib.data_loader import populate_database

# Let pandas render long tables and long definition strings without truncation.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.width', 1000)

print("Setup complete. Ready to load data.")




Setup complete. Ready to load data.


In [2]:
# Run the loader. Per the log captured below, this initializes the database,
# fetches the frequency lists and inserts the Japanese and Chinese entries.
populate_database()

# Confirm completion and name the database that now holds the data.
print(
    f"\nDatabase population complete. Data is stored in '{DB_CONFIG['dbname']}'.",
)


Initializing database 'lang_db'...
Database initialized successfully.
Downloading to 'jp_freq.txt'...
Loaded 120488 frequency ranks for Japanese.
Downloading to 'zh_freq.txt'...
Loaded 50000 frequency ranks for Chinese.

--- Processing: Japanese ---
File 'JMdict_e' already exists, skipping download...
Inserted 1641 entries for Japanese in 4.52 seconds.

--- Processing: Chinese ---
File 'cedict_1_0_ts_utf-8_mdbg.txt' already exists, skipping download...
Inserted 1104 entries for Chinese in 1.45 seconds.

Database population complete. Data is stored in 'lang_db'.


In [3]:
# --- Parameters for Analysis ---
# Language pair to compare. These names are also used to build the
# "<language> Definition" column labels selected for display further down.
BASE_LANGUAGE = 'Japanese'
TARGET_LANGUAGE = 'Chinese'

# Similarity cut-off handed to the analysis (adjusted for better results).
# NOTE(review): presumably word pairs scoring below this value are the ones
# reported — confirm against lib.analysis.
SIMILARITY_THRESHOLD = 0.4

# Frequency-based filtering: when enabled, only words ranked within the
# top 10,000 most common are included.
USE_FREQUENCY_FILTER = True
FREQUENCY_THRESHOLD = 10000

# Query the database for candidate false friends between the two languages.
results_df = find_all_false_friends(
    base_language=BASE_LANGUAGE,
    target_language=TARGET_LANGUAGE,
    similarity_threshold=SIMILARITY_THRESHOLD,
    use_frequency_filter=USE_FREQUENCY_FILTER,
    freq_rank_threshold=FREQUENCY_THRESHOLD,
)

# Show the outcome: a short notice when nothing matched, otherwise the table
# with columns reordered for clarity and rows sorted by ascending similarity.
if results_df.empty:
    print("No results found for the given parameters.")
else:
    display(
        results_df[
            [
                'Word',
                'Similarity',
                f'{BASE_LANGUAGE} Definition',
                f'{TARGET_LANGUAGE} Definition',
            ]
        ].sort_values(by='Similarity')
    )



Finding false friends between 'Japanese' and 'Chinese'...
Found 867 common words to analyze.
Calculating semantic similarity using spaCy (this may take a moment)...


  df = pd.read_sql_query(query, conn, params=params)


Analysis complete. Found 22 potential false friends.


Unnamed: 0,Word,Similarity,Japanese Definition,Chinese Definition
125,机,0.009854,desk,(bound form) machine; mechanism; (bound form) aircraft; (bound form) an opportunity; (bound form) crucial point; pivot; (bound form) quick-witted; flexible; (bound form) organic
823,方正,0.127143,rectitude,"Fangzheng county in Harbin 哈爾濱|哈尔滨[Ha1 er3 bin1], Heilongjiang; clear and square; neat; square (person)"
138,救命,0.165618,lifesaving,to save sb's life; (interj.) Help!; Save me!
770,丈夫,0.175984,healthy; robust; strong; solid; durable; hero; manly person; warrior,husband
461,大使,0.280185,ambassador,"ambassador; envoy; CL:名[ming2],位[wei4]"
68,家具,0.287057,furniture,"furniture; CL:件[jian4],套[tao4]"
6,浪漫,0.29065,romance (e.g. Arthurian romances); heroic tale; (nigh) impossible dream; adventurous spirit; great undertaking; epic adventure; (full-length) novel; romance; love affair; romanticism,romantic
711,有罪,0.306252,guilt; culpability,guilty
334,招待,0.306633,invitation,to hold a reception; to offer hospitality; to entertain (guests); to serve (customers)
837,雅,0.308315,refinement; elegance; grace; festal song (genre of the Shi Jing),elegant
