In [21]:
cd ..

/home/hnakamura/nlp/nestedDependency


In [22]:
# -*- coding: utf-8 -*-
import spacy
import pandas as pd
import glob
import os
import re
from exJapaneseFeatures import JapaneseFeatureExtractor

from functools import lru_cache
from typing import Iterable, List, Tuple, Optional

Edge = Tuple[int, int]

# -----------------------
# ユーティリティ
# -----------------------
def _orientation(a: int, b: int) -> int:
    """Return the direction of the edge a -> b.

    +1 means the edge points right (a < b); anything else yields -1.
    Callers are expected to rule out a == b beforehand; such input
    falls into the -1 branch.
    """
    if a < b:
        return 1
    return -1

def _norm(a: int, b: int) -> Tuple[int, int]:
    """Order the two endpoints as (left, right) for containment checks."""
    if a < b:
        return (a, b)
    return (b, a)

def _strictly_contains_LR(L1: int, R1: int, L2: int, R2: int) -> bool:
    """Report whether the span (L2, R2) lies strictly inside (L1, R1).

    Only the two boundary comparisons L1 < L2 and R2 < R1 are evaluated,
    so crossing spans and spans sharing an endpoint are rejected.
    L2 < R2 is taken for granted and not checked here.
    """
    if L1 >= L2:
        return False
    return R2 < R1

def _contains(outer: Edge, inner: Edge, same_direction_only: bool) -> bool:
    """Decide whether ``inner`` is strictly nested inside ``outer``.

    Args:
        outer: Candidate enclosing edge as (source, target).
        inner: Candidate enclosed edge as (source, target).
        same_direction_only: When true, edges pointing in opposite
            directions never contain one another.

    Returns:
        True only if neither edge is degenerate, the direction constraint
        (if requested) holds, and the normalized span of ``inner`` lies
        strictly inside that of ``outer``.
    """
    outer_src, outer_dst = outer
    inner_src, inner_dst = inner

    # Zero-length edges never take part in a containment relation.
    if outer_src == outer_dst or inner_src == inner_dst:
        return False

    if same_direction_only:
        if _orientation(outer_src, outer_dst) != _orientation(inner_src, inner_dst):
            return False

    outer_lo, outer_hi = _norm(outer_src, outer_dst)
    inner_lo, inner_hi = _norm(inner_src, inner_dst)
    return _strictly_contains_LR(outer_lo, outer_hi, inner_lo, inner_hi)

def _validate_edges(arrows: Iterable[Edge], *, max_edges: Optional[int] = None) -> List[Edge]:
    """Validate edges, drop degenerate ones, and de-duplicate (order kept).

    Args:
        arrows: Iterable of edges; each must be a 2-tuple of ints.
            It is consumed exactly once.
        max_edges: Optional defensive cap. When given, at most
            ``max_edges`` unique edges are returned and iteration stops as
            soon as the cap is reached (later items are not inspected).

    Returns:
        The unique, non-degenerate edges in first-seen order.

    Raises:
        ValueError: If an element is not a ``tuple`` of two ``int`` values,
            or if iterating ``arrows`` fails. The underlying exception is
            attached as ``__cause__``.
    """
    unique = []
    seen = set()
    try:
        for e in arrows:
            # Check the cap BEFORE appending so the result never exceeds it
            # (the previous post-append ``>`` test returned max_edges + 1).
            if max_edges is not None and len(unique) >= max_edges:
                break
            if not (isinstance(e, tuple) and len(e) == 2 and all(isinstance(x, int) for x in e)):
                raise TypeError(f"Edge must be tuple(int,int), got: {e}")
            if e[0] == e[1]:
                # Degenerate edges are skipped silently.
                continue
            if e not in seen:
                seen.add(e)
                unique.append(e)
    except Exception as ex:
        raise ValueError(f"Invalid arrows input: {ex}") from ex
    return unique

def _extract_edges_from_doc(doc) -> List[Edge]:
    """Collect (token index, head index) for every token whose head is not itself.

    ``doc`` is iterated token by token; each token must expose ``.i`` and
    ``.head``. Tokens that are their own head are left out.
    """
    edges = []
    for tok in doc:
        head = tok.head
        if head != tok:
            edges.append((tok.i, head.i))
    return edges

def find_sentence_with_max_nesting_depth(
    sentences: List[str],
    nlp,
    same_direction_only: bool = True
):
    """Pick the sentence whose dependency edges nest most deeply.

    Each non-blank sentence is stripped, parsed with ``nlp``, turned into
    edges, and scored with ``max_interval_nesting_depth``. A sentence only
    replaces the current best when its depth is strictly greater, so the
    first sentence reaching the maximum wins and depth-0 sentences are
    never selected.

    Args:
        sentences: Candidate sentences; falsy or blank entries are skipped.
        nlp: Callable that parses a string into an iterable of tokens.
        same_direction_only: Forwarded to ``max_interval_nesting_depth``.

    Returns:
        A tuple ``(sentence, max_depth, edge_count)``, or ``(None, 0, 0)``
        when no sentence has a depth above zero.
    """
    winner = (None, 0, 0)

    for raw in sentences:
        text = (raw or "").strip()
        if not text:
            continue

        try:
            edges = _extract_edges_from_doc(nlp(text))
            if not edges:
                continue
            depth = max_interval_nesting_depth(edges, same_direction_only=same_direction_only)
        except Exception as e:
            # Best effort: report the failing sentence and keep going.
            print(f"[error] max-depth calc failed: {e} :: {text[:40]}...")
            continue

        if depth > winner[1]:
            winner = (text, depth, len(edges))

    return winner


"""
2023/08/15
https://note.com/npaka/n/n5c3e4ca67956
https://dev.classmethod.jp/articles/try-parsing-using-ginza/
https://megagonlabs.github.io/ginza/
を参照。
UNDERPIN fileに対してGINZA analysisを実行してデータフレームを出力

2024/01/11
GINZA解析はexJapaneseFeatures.pyから引っ張ってくるように変更
解析チェックの関数を追加。

2024/01/26
GINZA: max_bytes=49149
長文すぎるとGinzaは解析できない。
しかし、文が短すぎると構文解析できずにarrows＝[]となる。
そのため、以下のようにした。

Patient_linesを用いて、1応答ずつ解析。
arrowsが空の場合にはdepths = 0　とする。

2025/08/12
修正。
fpath変数の不適切な使用を修正。
ディレクトリエラーに対処。
"""

def analyze_sentence(sentence, nlp):
    """Compute nesting features of the dependency edges of one sentence.

    Edges are taken in both directions with no dependency-label filtering:
    one (token index, head index) pair per token that is not its own head.

    Args:
        sentence: Text to analyse. ``None``, empty, or whitespace-only
            input yields all-zero features without calling ``nlp``.
        nlp: Callable that parses a string into an iterable of tokens.

    Returns:
        A dict with:
          * ``"max_nesting_relation"``: the largest per-edge count returned
            by ``calculate_nesting_relations``.
          * ``"max_nesting_depth"``: the longest containment chain, as
            returned by ``max_interval_nesting_depth``.
        Both values are 0 when there are no edges or when any step raises
        (the error is printed, not propagated).
    """
    if not (sentence or "").strip():
        return {"max_nesting_relation": 0, "max_nesting_depth": 0}
    try:
        doc = nlp(sentence)
        arrows = _extract_edges_from_doc(doc)

        if not arrows:
            # No edges at all: report zeros.
            return {"max_nesting_relation": 0, "max_nesting_depth": 0}

        nesting_relations = calculate_nesting_relations(arrows)
        max_nesting_relation = max(nesting_relations) if nesting_relations else 0

        # One memoized pass over all start edges. This is the same value as
        # max(calculate_nesting_depth(arrows, s, t) for s, t in arrows), but
        # avoids re-validating the edges and rebuilding the memo table for
        # every single start edge.
        max_nesting_depth = max_interval_nesting_depth(arrows)
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return {"max_nesting_relation": 0, "max_nesting_depth": 0}

    return {
        "max_nesting_relation": max_nesting_relation,
        "max_nesting_depth": max_nesting_depth
    }


def analyze_text_and_get_max_features(sentence_list, nlp):
    """Aggregate per-sentence nesting features over a list of sentences.

    Only sentences that ``nlp`` splits into at least five tokens are passed
    to ``analyze_sentence``; shorter ones are ignored.

    Returns:
        A dict with the maximum ``max_nesting_relation``, the maximum
        ``max_nesting_depth``, and ``total_nesting_depth`` (the sum of the
        per-sentence ``max_nesting_depth`` values).
    """
    best_relation = 0
    best_depth = 0
    depth_sum = 0

    for sentence in sentence_list:
        token_count = len(list(nlp(sentence)))
        if token_count < 5:
            continue

        per_sentence = analyze_sentence(sentence, nlp)
        sentence_depth = per_sentence["max_nesting_depth"]
        sentence_relation = per_sentence["max_nesting_relation"]

        if sentence_depth > best_depth:
            best_depth = sentence_depth
        if sentence_relation > best_relation:
            best_relation = sentence_relation
        depth_sum += sentence_depth

    return {
        "max_nesting_relation": best_relation,
        "max_nesting_depth": best_depth,
        "total_nesting_depth": depth_sum,
    }

# -----------------------
# 包摂カウント
# -----------------------
def calculate_nesting_relations(arrows: Iterable[Edge], same_direction_only: bool = True) -> List[int]:
    """Count, for every edge, how many other edges contain it or are contained by it.

    Args:
        arrows: Edges as (source, target) int tuples. The iterable is
            consumed exactly once, so one-shot iterators and generators
            are handled correctly.
        same_direction_only: Forwarded to ``_contains``.

    Returns:
        One count per edge, aligned with the output order of
        ``_validate_edges(arrows)`` (first-seen order, without degenerate
        or duplicate edges). Empty list when no valid edge remains.

    Raises:
        ValueError: Propagated from ``_validate_edges`` on malformed input.
    """
    unique = _validate_edges(arrows)

    counts = []
    for i, cur in enumerate(unique):
        outers = 0
        inners = 0
        for j, other in enumerate(unique):
            if j == i:
                continue
            if _contains(other, cur, same_direction_only):
                outers += 1
            if _contains(cur, other, same_direction_only):
                inners += 1
        counts.append(outers + inners)

    # ``counts`` is already aligned with ``unique``. Re-validating ``arrows``
    # here (as before) only re-derived the same order, and produced an empty
    # result whenever ``arrows`` was an exhausted one-shot iterator.
    return counts

# -----------------------
# 包摂の最長鎖（最大深さ）
# -----------------------
def calculate_nesting_depth(
    arrows: Iterable[Edge],
    start: int,
    end: int,
    depth: int = 0,  # kept for backward compatibility; not used
    same_direction_only: bool = True
) -> int:
    """Length of the longest containment chain that starts at edge (start, end).

    From the starting edge, only edges it contains (per ``_contains``,
    optionally direction-matched) are followed, recursively.

    Args:
        arrows: Edges as (source, target) int tuples.
        start: Source index of the starting edge.
        end: Target index of the starting edge.
        depth: Ignored.
        same_direction_only: Forwarded to ``_contains``.

    Returns:
        The number of edges nested below the starting edge along the
        longest chain; 0 when ``arrows`` has no valid edge.

    Raises:
        ValueError: If ``start == end`` while valid edges exist, or if
            ``arrows`` fails validation.
    """
    edges = tuple(_validate_edges(arrows))
    if not edges:
        return 0
    if start == end:
        raise ValueError("start and end must differ (degenerate edge).")

    origin = (start, end)
    # A start edge missing from the edge set only triggers a warning.
    if origin not in edges:
        print(f"[warn] start edge {origin} not in arrows; proceeding anyway.")

    @lru_cache(maxsize=None)
    def _longest_from(src: int, dst: int) -> int:
        # Extend the chain through every edge lying inside (src, dst).
        longest = 0
        for candidate in edges:
            if not _contains((src, dst), candidate, same_direction_only):
                continue
            longest = max(longest, 1 + _longest_from(candidate[0], candidate[1]))
        return longest

    return _longest_from(start, end)

def max_interval_nesting_depth(arrows: Iterable[Edge], same_direction_only: bool = True) -> int:
    """Return the deepest containment chain found from any edge in ``arrows``.

    Every valid edge is tried as a starting point. Crossing edges and edges
    sharing an endpoint do not count as containment; ``same_direction_only``
    toggles the direction constraint.

    Returns:
        The longest chain length over all start edges; 0 when there is no
        valid edge.
    """
    edges = tuple(_validate_edges(arrows))
    if not edges:
        return 0

    @lru_cache(maxsize=None)
    def _chain_len(src: int, dst: int) -> int:
        longest = 0
        for candidate in edges:
            if not _contains((src, dst), candidate, same_direction_only):
                continue
            longest = max(longest, 1 + _chain_len(candidate[0], candidate[1]))
        return longest

    deepest = 0
    for edge in edges:
        length = _chain_len(edge[0], edge[1])
        if length > deepest:
            deepest = length
    return deepest

    

def readPatientPart(file):
    """Extract the patient's lines from a transcript file.

    The file is read as UTF-8 and every line is right-stripped. A line is
    kept when it contains the marker '●患者', or when it follows such a
    line with no line containing '●医師' in between. Lines containing
    '●医師' (and not '●患者') are dropped and end the current patient
    turn.

    For kept lines, every '●患者' is removed if the line starts with that
    marker, and any ASCII-parenthesised span "(...)" is deleted.

    Args:
        file: Path of the transcript to read.

    Returns:
        The cleaned patient lines, in file order.
    """
    # Raw string: "\(" in a normal literal is an invalid escape sequence.
    annotation = re.compile(r"\(.+?\)")
    patient_lines = []
    in_patient_turn = False

    with open(file, mode='r', encoding='utf-8') as f:
        for raw in f:
            line = raw.rstrip()
            if '●患者' in line:
                in_patient_turn = True
            elif '●医師' in line:
                in_patient_turn = False
                continue
            elif not in_patient_turn:
                continue

            if line.startswith('●患者'):
                line = line.replace('●患者', '')
            patient_lines.append(annotation.sub("", line))

    return patient_lines


def fileSortForAnalaysisNestedDependency(path, dataframe, feature_extractor, nlp):
    """Analyse every '.txt' transcript in ``path`` and collect one row per file.

    Side effects: changes the working directory to ``path`` and prints
    progress, the deepest-nesting sentence of each file, and any errors.

    Args:
        path: Directory holding the transcripts.
        dataframe: Existing results; each new row is concatenated in front
            of it.
        feature_extractor: Accepted for interface compatibility; not used
            in this function.
        nlp: Parser callable handed to the analysis helpers.

    Returns:
        ``dataframe`` with one additional row for each file that was
        processed without error.
    """
    os.chdir(path)
    file = [os.path.join(path, f) for f in os.listdir(path) if f.endswith('.txt')]
    print('file:',file)

    # Every entry already ends with '.txt', so no further name filter is needed.
    for fpath in file:
        try:
            # The first session code found in the path supplies ID / visit / part.
            session = re.findall(r'U[A-Z]{2}[0-9]{3}.[1-5].[1-3]', fpath)[0]
            subid = session[0:6]
            subvisit = session[7]
            subpart = session[9]
            print('file:', fpath)
            try:
                PATIENT_lines = readPatientPart(fpath)
                cleaned = apply_filter_to_patient_lines(PATIENT_lines, nlp)
                # Drop blank lines.
                cleaned = [s for s in cleaned if (s or "").strip()]

                # Print the single most deeply nested sentence of this file.
                max_sent, max_depth, max_edges = find_sentence_with_max_nesting_depth(
                    cleaned, nlp, same_direction_only=True
                )
                if max_sent is not None:
                    print(f"[MAX_NESTING_DEPTH] ID={subid} Visit={subvisit} Part={subpart} "
                          f"depth={max_depth} edges={max_edges}\n  SENT: {max_sent}")

                # Aggregate features for the whole file.
                features = analyze_text_and_get_max_features(cleaned, nlp)
                features_dependency = analyze_dependency(cleaned, nlp)

                dic = {'ID': subid, 'Visit': subvisit, 'Part': subpart}
                dic.update(**features, **features_dependency)
                print(dic)

                row = pd.DataFrame(data=dic, index=['val', ])
                dataframe = pd.concat([row, dataframe], axis=0)

            except Exception as e:
                print(f"[error] inner processing failed for {fpath}: {e}")

        except Exception as e:
            print(f"[error] outer loop failure at {fpath}: {e}")
    return dataframe


def checkFileAnalyzedOrNot(directory_path, dataframe):
    """Report which session codes in a directory have no row in ``dataframe``.

    Every directory entry whose path matches the session pattern
    ``U[A-Z]{2}[0-9]{3}.[1-5].[1-3]`` is compared with the analysed rows on
    the (ID, Visit, Part) triple: characters 0-5, 7 and 9 of the match.
    The two separator characters are ignored, because the pattern accepts
    any character there and the analysis step reads the same positions.

    Args:
        directory_path: Directory to scan (listed once).
        dataframe: Results table with 'ID', 'Visit' and 'Part' columns.

    Returns:
        The matched session codes (as found in the paths) that have no
        corresponding row, in directory-listing order.

    Also prints the '.txt' files, their count, the non-'.txt' entries, the
    number of rows in ``dataframe``, and the returned list.
    """
    # One (ID, Visit, Part) key per dataframe row; values are stringified so
    # that int and str cells compare equal to the characters of the match.
    analyzed_keys = [
        (str(row['ID']), str(row['Visit']), str(row['Part']))
        for _, row in dataframe.iterrows()
    ]
    analyzed_lookup = set(analyzed_keys)

    pattern = re.compile(r'U[A-Z]{2}[0-9]{3}.[1-5].[1-3]')

    entries = os.listdir(directory_path)
    all_path = [os.path.join(directory_path, f) for f in entries]
    all_txt_file = [os.path.join(directory_path, f) for f in entries if f.endswith('.txt')]
    # Entries that are not '.txt' files.
    rest_path_txt_all = [os.path.join(directory_path, f) for f in entries if not f.endswith('.txt')]

    # Session codes found in any entry of the directory.
    all_id_list = []
    for entry_path in all_path:
        match = pattern.search(entry_path)
        if match:
            all_id_list.append(match.group(0))

    # Codes whose (ID, Visit, Part) triple has no row in the dataframe.
    rest_path_analyzed_all = [
        code for code in all_id_list
        if (code[0:6], code[7], code[9]) not in analyzed_lookup
    ]

    print("TXT Files:", all_txt_file)
    print("txtファイル数:", len(all_txt_file))
    print("rest files(all-.txt):", rest_path_txt_all)
    print("実際に解析対象となったファイル数:", len(analyzed_keys))
    print("rest files(all-analyzed):", rest_path_analyzed_all)

    return rest_path_analyzed_all

def filtered_sentence(sentence, nlp):
    """Return ``sentence`` with fillers, interjections and 、/。 removed.

    The sentence is parsed with ``nlp``; tokens tagged "感動詞-フィラー" or
    with part of speech "INTJ" are dropped and the remaining token texts
    are joined without a separator. Finally every "、" and "。" is deleted.

    NOTE(review): an earlier comment also mentioned excluding "reparandum"
    dependencies, but no dependency label is inspected here.
    """
    kept = []
    for token in nlp(sentence):
        if token.tag_ == "感動詞-フィラー" or token.pos_ == "INTJ":
            continue
        kept.append(token.text)
    return ''.join(kept).replace("、", "").replace("。", "")

# 既存の関数とコードの後に追加
def apply_filter_to_patient_lines(patient_lines, nlp):
    """Run ``filtered_sentence`` on every line and return the results in order."""
    cleaned = []
    for line in patient_lines:
        cleaned.append(filtered_sentence(line, nlp))
    return cleaned

def analyze_dependency(patient_lines, nlp):
    """Aggregate dependency-tree statistics over a list of sentences.

    Blank sentences are skipped. A sentence whose parsing or tree walk
    raises is reported via ``print`` and skipped as well.

    Args:
        patient_lines: Sentences to analyse.
        nlp: Callable that parses a string into a sized iterable of tokens
            exposing ``.i`` and ``.head``. A token that is its own head is
            treated as a root.

    Returns:
        A dict with:
          * ``"max_tree_height"``: largest token-to-root path length seen
            in any sentence (0 if nothing was analysed).
          * ``"total_num_nodes"``: total number of tokens.
          * ``"total_num_leaves"``: total number of tokens that are not the
            head of any other token (a lone root counts as a leaf).
          * ``"total_distance"``: sum of token-to-root path lengths.
          * ``"total_tree_height"``: sum of the per-sentence tree heights.
    """
    max_tree_heights = []
    total_distances = []
    num_leaves_list = []
    num_nodes_list = []

    for sentence in patient_lines:
        if not sentence.strip():
            continue  # skip blank sentences

        try:
            doc = nlp(sentence)
        except Exception as e:
            print(f"nlp処理中にエラーが発生しました: {e}")
            continue

        max_depth = 0
        total_distance = 0
        child_counts = {token.i: 0 for token in doc}

        try:
            for token in doc:
                # Credit the token's DIRECT head with one child. Previously
                # the increment ran after the walk below, when ``current``
                # was already the root, so every token was credited to the
                # root and all non-root tokens were counted as leaves.
                if token.head != token:
                    child_counts[token.head.i] += 1

                # Walk up to the root to measure this token's depth.
                depth = 0
                current = token
                while current.head != current:
                    depth += 1
                    current = current.head
                total_distance += depth
                max_depth = max(max_depth, depth)

            num_leaves = sum(1 for count in child_counts.values() if count == 0)
        except Exception as e:
            print(f"依存関係の計算中にエラーが発生しました: {e}")
            continue

        max_tree_heights.append(max_depth)
        total_distances.append(total_distance)
        num_leaves_list.append(num_leaves)
        num_nodes_list.append(len(doc))

    return {
        "max_tree_height": max(max_tree_heights) if max_tree_heights else 0,
        "total_num_nodes": sum(num_nodes_list),
        "total_num_leaves": sum(num_leaves_list),
        "total_distance": sum(total_distances),
        "total_tree_height": sum(max_tree_heights)
    }

if __name__ == '__main__':
    # Build the feature extractor and choose which of its parsers to use.
    feature_extractor = JapaneseFeatureExtractor()
    #nlp = feature_extractor.nlp_speech  # spoken-language model
    nlp = feature_extractor.nlp_speech_CEJCminus  # CEJCminus
    #nlp = feature_extractor.nlp  # written-language model

    # Usage example on a single long sentence.
    sentence = "私はああそうだニュースは今日が徳川家康の誕生日だと伝えたようだね困ったものだなあと思ったものだよ私は彼が鹿だという話を彼から聞いた"
    features = analyze_sentence(sentence, nlp)
    print(features)

    # Analyse a short example sentence; its row seeds the result dataframe.
    text="これは例文です。"
    ex_features = analyze_sentence(text, nlp)
    ex_dic = {'ID': 'example', 'Visit': 0, 'Part': 0}
    #ex_dic1 = countMorpheme(text)
    #ex_dic2 = countEntity(text)
    #ex_dic3 = countSyntaxFeatures(text)
    print("ex_features:", ex_features)
    ex_dic.update(**ex_features)
    ex_df = pd.DataFrame(data=ex_dic, index=['val',])
    print(ex_df)

    # UNDERPIN analysis.
    PATH = '/home/hnakamura/nlp/nestedDependency/RecordVoice_yg_R6Nov22'
    # NOTE: the extractor instance must be created from under pythonProject.

    # Stop early when the data directory is missing. SystemExit is raised
    # directly because the ``exit`` helper is only provided by ``site``.
    if not os.path.isdir(PATH):
        print(f"Error: Directory {PATH} does not exist or is not accessible")
        raise SystemExit(1)

    # Move into the data directory; the CSV below is written relative to it.
    try:
        os.chdir(PATH)
        print(f"Successfully changedo directory: {PATH}")
    except Exception as e:
        print(f"Error changing to directory {PATH}: {e}")
        raise SystemExit(1)

    dataframe_answer = fileSortForAnalaysisNestedDependency(PATH, ex_df, feature_extractor,nlp)
    print('dataframe_answer:', dataframe_answer)
    dataframe_answer.to_csv('Dataframe_analyzeNestedDependency_integration_R7Aug12_RecordVoice_vf_CEJCminus.csv')

    # Run the coverage check once and reuse its result; calling it twice
    # rescanned the directory and printed its report twice.
    unanalyzed = checkFileAnalyzedOrNot(PATH, dataframe_answer)
    print(unanalyzed)
    print("解析から外れたファイル数:", len(unanalyzed))

{'max_nesting_relation': 15, 'max_nesting_depth': 3}
ex_features: {'max_nesting_relation': 0, 'max_nesting_depth': 0}
          ID  Visit  Part  max_nesting_relation  max_nesting_depth
val  example      0     0                     0                  0
Successfully changedo directory: /home/hnakamura/nlp/nestedDependency/RecordVoice_yg_R6Nov22
file: ['/home/hnakamura/nlp/nestedDependency/RecordVoice_yg_R6Nov22/UMM029-3_2Kana.txt', '/home/hnakamura/nlp/nestedDependency/RecordVoice_yg_R6Nov22/UKM057-1_3Kana.txt', '/home/hnakamura/nlp/nestedDependency/RecordVoice_yg_R6Nov22/UMM020-2_1Kana.txt', '/home/hnakamura/nlp/nestedDependency/RecordVoice_yg_R6Nov22/UTM018-1_1Kana.txt', '/home/hnakamura/nlp/nestedDependency/RecordVoice_yg_R6Nov22/UBM008-3_1Kana.txt', '/home/hnakamura/nlp/nestedDependency/RecordVoice_yg_R6Nov22/UAM024-4_1Kana.txt', '/home/hnakamura/nlp/nestedDependency/RecordVoice_yg_R6Nov22/UBM001-2_2Kana.txt', '/home/hnakamura/nlp/nestedDependency/RecordVoice_yg_R6Nov22/UTM020-1_3Kan