## Start Up

In [3]:
print(torch.__version__, torch.cuda.is_available())

2.1.0+cu118 True


In [1]:
import io
import os
import os.path
import re
import matplotlib.pyplot as plt
import cv2
import csv
import numpy as np
from PIL import Image
from io import BytesIO
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload
from googleapiclient.http import MediaIoBaseDownload
import torch
import torchvision

SCOPES = ['https://www.googleapis.com/auth/drive.file']
MIME_TYPE = 'application/vnd.google-apps.document'
APPLICATION_NAME = 'ipa-google-drive-api-client'

In [2]:
cd "C:\Users\covid\text_recognition"

C:\Users\covid\text_recognition


In [3]:
def get_service():

    # credentialの取得
    creds = None
    if os.path.exists('token.json'):
        creds = Credentials.from_authorized_user_file('token.json', SCOPES)
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                'google-drive-api.json', SCOPES)
            creds = flow.run_local_server(port=0)
        with open('token.json', 'w') as token:
            token.write(creds.to_json())
        
    # serviceの取得
    service = build('drive', 'v3', credentials=creds) 
    
    return service

def read_ocr(service, input_file, lang='jp'):
    # ファイルのアップロード

    # ローカルファイルの定義
    media_body = MediaFileUpload(input_file, mimetype=MIME_TYPE, resumable=True)

    # Google Drive上のファイル名
    newfile = 'output.pdf'

    body = {
        'name': newfile,
        'mimeType': MIME_TYPE
    }

    # 　creat関数でファイルアップロード実行
    # 同時にOCR読み取りも行う
    output = service.files().create(
        body=body,
        media_body=media_body,
        # ここで読み込み先言語の指定を行う
        ocrLanguage=lang,
    ).execute()

    # テキストファイルのダウンロード

    # リクエストオブジェクト生成
    request = service.files().export_media(
        fileId=output['id'],
        mimeType="text/plain"
    )
    output_path = 'output.txt'

    with open(output_path, 'a') as f:
        fh = io.FileIO(output_path, "wb")
        downloader = MediaIoBaseDownload(fh, request)
        done = False
        while done is False:
            status, done = downloader.next_chunk()

        service.files().delete(fileId=output['id']).execute()
    
        # テキストの取得
    with open(output_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # 読み取り結果のリストを返す
    return lines[1:]


service = get_service()

In [4]:
def process_text_file(text_file):
    output_dir = "C:/Users/covid/text_recognition/output"
    if os.path.exists(output_dir):
        file_list = [f for f in os.listdir(output_dir) if os.path.isfile(os.path.join(output_dir, f))]
        for file_name in file_list:
            file_path = os.path.join(output_dir, file_name)
            os.remove(file_path)
    else:
        os.makedirs(output_dir)

    with open(text_file, 'r') as file:
        lines = file.readlines()

        lines = sorted(lines, key=lambda line: float(line.split()[1]))

        for i, line in enumerate(lines):
            line = line.strip()
            values = line.split()

            if len(values) == 5:
                object_class = values[0]
                a = float(values[1])
                b = float(values[2])
                c = float(values[3])
                d = float(values[4])

                # Calculate coordinates and dimensions
                x_center = int(wid * a)
                y_center = int(hei * b)
                width = int(wid * c)
                height = int(hei * d)

                x_min = x_center - width // 2
                y_min = y_center - height // 2
                x_max = x_center + width // 2
                y_max = y_center + height // 2

                output_filename = os.path.join(output_dir, f'book{i+1}.jpg')
                index = 1
                while os.path.exists(output_filename):
                    output_filename = os.path.join(output_dir, f'book{i+1}_{index}.jpg')
                    index += 1

                # Crop and save the image
                cropped = image.crop((x_min, y_min, x_max, y_max))
                cropped.save(output_filename)


## roboflowを使用したモデルのデプロイ

In [5]:
import sys

# passの設定 (pip showで出てきた、LocationのPASSを以下に設定)
sys.path.append('c:/users/covid/anaconda3/lib/site-packages')

# passの設定はimportするモジュールより前に設定
import roboflow

In [36]:
#自身のモデル→versions→Export datasetにあるコードのコピペ
!pip install roboflow

from roboflow import Roboflow
rf = Roboflow(api_key="9RdognCaI8Nuh4bFkYHc")
project = rf.workspace("koteitan").project("oriented-books")
version = project.version(1)
dataset = version.download("yolov7")

Collecting roboflow
  Using cached roboflow-1.1.34-py3-none-any.whl.metadata (9.5 kB)
Collecting chardet==4.0.0 (from roboflow)
  Using cached chardet-4.0.0-py2.py3-none-any.whl.metadata (3.5 kB)
Collecting idna==3.7 (from roboflow)
  Using cached idna-3.7-py3-none-any.whl.metadata (9.9 kB)
Collecting opencv-python-headless==4.10.0.84 (from roboflow)
  Using cached opencv_python_headless-4.10.0.84-cp37-abi3-win_amd64.whl.metadata (20 kB)
Collecting requests-toolbelt (from roboflow)
  Using cached requests_toolbelt-1.0.0-py2.py3-none-any.whl.metadata (14 kB)
Using cached roboflow-1.1.34-py3-none-any.whl (76 kB)
Using cached chardet-4.0.0-py2.py3-none-any.whl (178 kB)
Using cached idna-3.7-py3-none-any.whl (66 kB)
Using cached opencv_python_headless-4.10.0.84-cp37-abi3-win_amd64.whl (38.8 MB)
Using cached requests_toolbelt-1.0.0-py2.py3-none-any.whl (54 kB)
Installing collected packages: opencv-python-headless, idna, chardet, requests-toolbelt, roboflow


ERROR: Could not install packages due to an OSError: [WinError 5] アクセスが拒否されました。: 'C:\\Users\\covid\\anaconda3\\envs\\localGPU\\Lib\\site-packages\\cv2\\cv2.pyd'
Consider using the `--user` option or check the permissions.



loading Roboflow workspace...
loading Roboflow project...


Downloading Dataset Version Zip in oriented-books-1 to yolov7pytorch:: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 27969/27969 [00:02<00:00, 10926.28it/s]





Extracting Dataset Version Zip to oriented-books-1 in yolov7pytorch:: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1334/1334 [00:00<00:00, 2644.03it/s]


今回作成したモデルはなぜか書籍検知行われず、一方で作成したモデルの使用方法は判明したので今後はroboflow側でモデルのチューニングを行って

In [9]:
import requests

url = 'https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7_training.pt'
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    with open('yolov7_training.pt', 'wb') as f:
        f.write(response.content)
    print('File downloaded successfully')
else:
    print(f'Failed to download file. Status code: {response.status_code}')


File downloaded successfully


In [12]:
cd yolov7

C:\Users\covid\text_recognition\yolov7


In [13]:
!python train.py --device 0 --batch 16 --epochs 55 --data data/coco.yaml --weights 'yolov7_training.pt'

[34m[1mwandb: [0mInstall Weights & Biases for YOLOR logging with 'pip install wandb' (recommended)


YOLOR  v0.1-126-g84932d7 torch 2.1.0+cu118 CUDA:0 (NVIDIA GeForce RTX 3090, 24575.5MB)

Namespace(weights="'yolov7_training.pt'", cfg='', data='data/coco.yaml', hyp='data/hyp.scratch.p5.yaml', epochs=55, batch_size=16, img_size=[640, 640], rect=False, resume=False, nosave=False, notest=False, noautoanchor=False, evolve=False, bucket='', cache_images=False, image_weights=False, device='0', multi_scale=False, single_cls=False, adam=False, sync_bn=False, local_rank=-1, workers=8, project='runs/train', entity=None, name='exp', exist_ok=False, quad=False, linear_lr=False, label_smoothing=0.0, upload_dataset=False, bbox_interval=-1, save_period=-1, artifact_alias='latest', freeze=[0], v5_metric=False, world_size=1, global_rank=-1, save_dir='runs\\train\\exp', total_batch_size=16)
[34m[1mtensorboard: [0mStart with 'tensorboard --logdir runs/train', view at http://localhost:6006/
[34m[1mhyperparameters: [0mlr0=0.01, lrf=0.1, momentum=0.937, weight_decay=0.0005, warmup_epochs=3.0, warmup_

## Execution

In [24]:
#Variable list
# ディレクトリのパス
directory_path = "C:/Users/covid/text_recognition/yolov7/runs/detect"
# 画像ファイルの相対パスを指定
image_relative_path = "input.png"
# テキストファイルの相対パスを指定
text_file_relative_path = "labels/input.txt"

out_path = 'C:/Users/covid/text_recognition/output'
output_file = "C:/Users/covid/text_recognition/output_results.txt"

In [25]:
cap = cv2.VideoCapture(0)

ret, frame = cap.read()
cv2.imwrite("C:/Users/covid/text_recognition/yolov7/input.png",frame)

cap.release()

In [26]:
cd "C:\Users\covid\text_recognition\yolov7"

C:\Users\covid\text_recognition\yolov7


In [27]:
# ディレクトリ内のサブディレクトリのリストを取得
subdirectories = [d for d in os.listdir(directory_path) if os.path.isdir(os.path.join(directory_path, d))]

# サブディレクトリの中で一番新しいものを取得
newest_subdirectory = max(subdirectories, key=lambda d: os.path.getctime(os.path.join(directory_path, d)))

# 最新のサブディレクトリのパスを作成
newest_subdirectory_path = os.path.join(directory_path, newest_subdirectory)

In [28]:
!python detect.py --source C:/Users/covid/text_recognition/yolov7/input.png --weights yolov7-e6e.pt --conf 0.25 --img-size 1280 --device 0 --save-txt

Namespace(weights=['yolov7-e6e.pt'], source='C:/Users/covid/text_recognition/yolov7/input.png', img_size=1280, conf_thres=0.25, iou_thres=0.45, device='0', view_img=False, save_txt=True, save_conf=False, nosave=False, classes=None, agnostic_nms=False, augment=False, update=False, project='runs/detect', name='exp', exist_ok=False, no_trace=False)
Fusing layers... 
 Convert model to Traced-model... 
 traced_script_module saved! 
 model is traced! 

1 person, 1 tv, 1 keyboard, Done. (27.0ms) Inference, (48.5ms) NMS
 The image with the result is saved in: runs\detect\exp70\input.png
Done. (0.458s)


YOLOR  v0.1-126-g84932d7 torch 2.1.0+cu118 CUDA:0 (NVIDIA GeForce RTX 3090, 24575.5MB)

Model Summary: 792 layers, 151687420 parameters, 817020 gradients
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


In [29]:
# ディレクトリ内のサブディレクトリのリストを取得
subdirectories = [d for d in os.listdir(directory_path) if os.path.isdir(os.path.join(directory_path, d))]

# サブディレクトリの中で一番新しいものを取得
newest_subdirectory = max(subdirectories, key=lambda d: os.path.getctime(os.path.join(directory_path, d)))

# 最新のサブディレクトリのパスを作成
newest_subdirectory_path = os.path.join(directory_path, newest_subdirectory)
# 新しいディレクトリに移動
os.chdir(newest_subdirectory_path)

# 画像ファイルの絶対パスを作成
image_absolute_path = os.path.join(newest_subdirectory_path, image_relative_path)
# テキストファイルの絶対パスを作成
text_file_absolute_path = os.path.join(newest_subdirectory_path, text_file_relative_path)

In [30]:
print()
print(image_absolute_path)


C:/Users/covid/text_recognition/yolov7/runs/detect\exp70\input.png


In [31]:
# 画像をImageクラスのインスタンスに読み込む
image = Image.open(image_absolute_path)
# テキストファイルを読み込む
with open(text_file_absolute_path, 'r') as file:
    text_content = file.read()

In [32]:
# Process the text file
wid,hei = image.size
process_text_file(text_file_absolute_path)

In [33]:
#output corresponding to list format
if __name__ == '__main__':
    output_list = []

    file_list = [filename for filename in os.listdir(out_path) if filename.endswith('.jpg')]
    file_list.sort(key=lambda x: int(''.join(filter(str.isdigit, x))))

    for filename in file_list:
        input_file = os.path.join(out_path, filename)
        output = read_ocr(service, input_file, 'ja')

        # 不要な文字（スペースとバックスラッシュ）を除去して一つの文字列に結合する
        cleaned_output = ''.join(line.strip().replace(' ', '').replace('/', '').replace('\n', '').replace('\\', '') for line in output)

        # 結果をリストに追加
        output_list.append(cleaned_output)

In [34]:
# Save the results to the output file
with open(output_file, 'w', encoding='utf-8') as file:
        for result in output_list:
            file.write(result + '\n')

print(f"Results saved to {output_file}")

Results saved to C:/Users/covid/text_recognition/output_results.txt


## Compare NLP Methods

### Different.SequenceMatcher

In [300]:
cd "C:\\Users\\covid\\text_recognition"

C:\Users\covid\text_recognition


In [301]:
import difflib

def main():
    file_a_path = "output_results.txt"
    file_b_path = "database.txt"

    with open(file_a_path, "r", encoding="utf-8") as file_a:
        lines_a = file_a.readlines()

    with open(file_b_path, "r", encoding="utf-8") as file_b:
        lines_b = file_b.readlines()

    for index_a, text_a in enumerate(lines_a):
        max_similarity = 0.0
        best_match = None
        best_match_text_b = None

        for index_b, text_b in enumerate(lines_b):
            similarity = difflib.SequenceMatcher(None, text_a, text_b).ratio()

            if similarity > max_similarity:
                max_similarity = similarity
                best_match = text_b
                best_match_text_b = text_b

        print(f"Text A (line {index_a + 1}): {text_a.strip()}")
        print(f"Best Match in Text B (line {lines_b.index(best_match) + 1}): {best_match.strip()}")
        print(f"Highest Similarity Ratio: {max_similarity:.4f}\n")

if __name__ == "__main__":
    main()


Text A (line 1): 新明解C言語入門編柴田
Best Match in Text B (line 31): 新・明解C言語入門編柴田望洋SBCreative
Highest Similarity Ratio: 0.6486

Text A (line 2): 解きながら学ぶC言明解きながら学ぶC言語柴田望
Best Match in Text B (line 32): 解きながら学ぶC言語柴田望洋[監修・著]赤尾浩・肘井信一・高木宏典[著]SBCreative
Highest Similarity Ratio: 0.3944

Text A (line 3): プログラミングROSPythonによるロボットアプリケーション開発MorganQuigleyBrianGerkeyWilliamD.Smart
Best Match in Text B (line 35): プログラミングROSPythonによるロボットアプリケーション開発MorganQuigleyBrianGerkeyWilliamD.Smart河田卓志監訳松田晃一、福地正樹、由谷哲夫訳O'REILLY
Highest Similarity Ratio: 0.8324

Text A (line 4): KINECTforWindowsSDKプログラミングC++編
Best Match in Text B (line 41): KINECTforWindowsSDKプログラミングC++編株式会社ゲッシュ中村薰齋藤俊太宮城英人〈著>システム
Highest Similarity Ratio: 0.7045

Text A (line 5): C++111410刷
Best Match in Text B (line 51): C++1114コア言語10刷突破!C++入門書のロングセラー!!~.マイナビ
Highest Similarity Ratio: 0.4400

Text A (line 6): CUDACCUDAの機能・手法をプロフェッショナルプログラミング網羅的に解説!
Best Match in Text B (line 73): CUDACプロフェッショナルプログラミングCUDAの機能・手法を網羅的に解説!
Highest Similarity Ratio: 0.7250

Tex

In [302]:
def main():
    file_a_path = "output_results.txt"
    file_b_path = "database.txt"
    output_file = "output_similarity.txt"

    with open(file_a_path, "r", encoding="utf-8") as file_a:
        lines_a = file_a.readlines()

    with open(file_b_path, "r", encoding="utf-8") as file_b:
        lines_b = file_b.readlines()

    with open(output_file, "w", encoding="utf-8") as output:
        for index_a, text_a in enumerate(lines_a):
            for index_b, text_b in enumerate(lines_b):
                similarity = difflib.SequenceMatcher(None, text_a, text_b).ratio()

                output.write(f"Text A (line {index_a + 1}): {text_a.strip()}\n")
                output.write(f"Text B (line {index_b + 1}): {text_b.strip()}\n")
                output.write(f"Similarity Ratio: {similarity:.4f}\n\n")

if __name__ == "__main__":
    main()


### Levenshtein distance

In [303]:
import Levenshtein

def main():
    file_a_path = "output_results.txt"
    file_b_path = "database.txt"

    with open(file_a_path, "r", encoding="utf-8") as file_a:
        lines_a = file_a.readlines()

    with open(file_b_path, "r", encoding="utf-8") as file_b:
        lines_b = file_b.readlines()

    for index_a, text_a in enumerate(lines_a):
        max_similarity = 0.0
        best_match = None

        for index_b, text_b in enumerate(lines_b):
            similarity = Levenshtein.ratio(text_a.strip(), text_b.strip())

            if similarity > max_similarity:
                max_similarity = similarity
                best_match = text_b

        print(f"Original Text A (line {index_a + 1}): {text_a.strip()}")
        print(f"Best Match for Text A (line {index_a + 1}): {best_match.strip()}")
        print(f"Highest Similarity Ratio: {max_similarity:.4f}\n")

if __name__ == "__main__":
    main()

Original Text A (line 1): 新明解C言語入門編柴田
Best Match for Text A (line 1): 新・明解C言語入門編柴田望洋SBCreative
Highest Similarity Ratio: 0.6286

Original Text A (line 2): 解きながら学ぶC言明解きながら学ぶC言語柴田望
Best Match for Text A (line 2): 解きながら学ぶC言語柴田望洋[監修・著]赤尾浩・肘井信一・高木宏典[著]SBCreative
Highest Similarity Ratio: 0.3768

Original Text A (line 3): プログラミングROSPythonによるロボットアプリケーション開発MorganQuigleyBrianGerkeyWilliamD.Smart
Best Match for Text A (line 3): プログラミングROSPythonによるロボットアプリケーション開発MorganQuigleyBrianGerkeyWilliamD.Smart河田卓志監訳松田晃一、福地正樹、由谷哲夫訳O'REILLY
Highest Similarity Ratio: 0.8304

Original Text A (line 4): KINECTforWindowsSDKプログラミングC++編
Best Match for Text A (line 4): KINECTforWindowsSDKプログラミングC++編株式会社ゲッシュ中村薰齋藤俊太宮城英人〈著>システム
Highest Similarity Ratio: 0.6977

Original Text A (line 5): C++111410刷
Best Match for Text A (line 5): C++1114コア言語10刷突破!C++入門書のロングセラー!!~.マイナビ
Highest Similarity Ratio: 0.4167

Original Text A (line 6): CUDACCUDAの機能・手法をプロフェッショナルプログラミング網羅的に解説!
Best Match for Text A (line 6): CUDACプロフェッショナルプログラミングCU

In [304]:
def main():
    file_a_path = "output_results.txt"
    file_b_path = "database.txt"
    output_file = "levenshtein_similarity.txt"

    with open(file_a_path, "r", encoding="utf-8") as file_a:
        lines_a = file_a.readlines()

    with open(file_b_path, "r", encoding="utf-8") as file_b:
        lines_b = file_b.readlines()

    with open(output_file, "w", encoding="utf-8") as output:
        for index_a, text_a in enumerate(lines_a):
            max_similarity = 0.0
            best_match = None

            for index_b, text_b in enumerate(lines_b):
                similarity = Levenshtein.ratio(text_a.strip(), text_b.strip())

                output.write(f"Text A (line {index_a + 1}): {text_a.strip()}\n")
                output.write(f"Text B (line {index_b + 1}): {text_b.strip()}\n")
                output.write(f"Similarity Ratio: {similarity:.4f}\n\n")

                if similarity > max_similarity:
                    max_similarity = similarity
                    best_match = text_b

            output.write(f"Best Match for Text A (line {index_a + 1}): {best_match.strip()}\n")
            output.write(f"Highest Similarity Ratio: {max_similarity:.4f}\n\n")

if __name__ == "__main__":
    main()


### Jaccard

In [305]:
def jaccard_coefficient(s1, s2):
    set1 = set(s1)
    set2 = set(s2)
    intersection = len(set1 & set2)
    union = len(set1 | set2)
    return intersection / union if union > 0 else 0.0

def main():
    file_a_path = "output_results.txt"
    file_b_path = "database.txt"

    with open(file_a_path, "r", encoding="utf-8") as file_a:
        lines_a = file_a.readlines()

    with open(file_b_path, "r", encoding="utf-8") as file_b:
        lines_b = file_b.readlines()

    for index_a, text_a in enumerate(lines_a):
        max_similarity = 0.0
        best_match = None

        for index_b, text_b in enumerate(lines_b):
            similarity = jaccard_coefficient(text_a.strip(), text_b.strip())

            if similarity > max_similarity:
                max_similarity = similarity
                best_match = text_b

        print(f"Original Text A (line {index_a + 1}): {text_a.strip()}")
        print(f"Best Match for Text A (line {index_a + 1}): {best_match.strip()}")
        print(f"Highest Jaccard Coefficient: {max_similarity:.4f}\n")

if __name__ == "__main__":
    main()


Original Text A (line 1): 新明解C言語入門編柴田
Best Match for Text A (line 1): 新・明解C言語入門編柴田望洋SBCreative
Highest Jaccard Coefficient: 0.5000

Original Text A (line 2): 解きながら学ぶC言明解きながら学ぶC言語柴田望
Best Match for Text A (line 2): 解きながら学ぶC言語柴田望洋[監修・著]赤尾浩・肘井信一・高木宏典[著]SBCreative
Highest Jaccard Coefficient: 0.3250

Original Text A (line 3): プログラミングROSPythonによるロボットアプリケーション開発MorganQuigleyBrianGerkeyWilliamD.Smart
Best Match for Text A (line 3): プログラミングROSPythonによるロボットアプリケーション開発MorganQuigleyBrianGerkeyWilliamD.Smart河田卓志監訳松田晃一、福地正樹、由谷哲夫訳O'REILLY
Highest Jaccard Coefficient: 0.6618

Original Text A (line 4): KINECTforWindowsSDKプログラミングC++編
Best Match for Text A (line 4): KINECTforWindowsSDKプログラミングC++編株式会社ゲッシュ中村薰齋藤俊太宮城英人〈著>システム
Highest Jaccard Coefficient: 0.5000

Original Text A (line 5): C++111410刷
Best Match for Text A (line 5): C++1114コア言語10刷突破!C++入門書のロングセラー!!~.マイナビ
Highest Jaccard Coefficient: 0.2069

Original Text A (line 6): CUDACCUDAの機能・手法をプロフェッショナルプログラミング網羅的に解説!
Best Match for Text A (line 6): CUDACプロフ

In [306]:
def main():
    file_a_path = "output_results.txt"
    file_b_path = "database.txt"
    output_file = "jaccard_similarity.txt"

    with open(file_a_path, "r", encoding="utf-8") as file_a:
        lines_a = file_a.readlines()

    with open(file_b_path, "r", encoding="utf-8") as file_b:
        lines_b = file_b.readlines()

    with open(output_file, "w", encoding="utf-8") as output:
        for index_a, text_a in enumerate(lines_a):
            for index_b, text_b in enumerate(lines_b):
                similarity = jaccard_coefficient(text_a.strip(), text_b.strip())

                output.write(f"Text A (line {index_a + 1}): {text_a.strip()}\n")
                output.write(f"Text B (line {index_b + 1}): {text_b.strip()}\n")
                output.write(f"Jaccard Coefficient: {similarity:.4f}\n\n")

if __name__ == "__main__":
    main()


In [307]:
def jaccard_coefficient(s1, s2):
    set1 = set(s1)
    set2 = set(s2)
    intersection = len(set1 & set2)
    union = len(set1 | set2)
    return intersection / union if union > 0 else 0.0

def main():
    file_a_path = "output_results.txt"
    file_b_path = "database.txt"

    with open(file_a_path, "r", encoding="utf-8") as file_a:
        lines_a = file_a.readlines()

    with open(file_b_path, "r", encoding="utf-8") as file_b:
        lines_b = file_b.readlines()

    for index_a, text_a in enumerate(lines_a):
        # 空白の行をスキップ
        if not text_a.strip():
            continue

        max_similarity = 0.0
        best_match = None

        for index_b, text_b in enumerate(lines_b):
            similarity = jaccard_coefficient(text_a.strip(), text_b.strip())

            if similarity > max_similarity:
                max_similarity = similarity
                best_match = text_b

        print(f"Original Text A (line {index_a + 1}): {text_a.strip()}")
        
        if best_match is not None:
            print(f"Best Match for Text A (line {index_a + 1}): {best_match.strip()}")
            print(f"Highest Jaccard Coefficient: {max_similarity:.4f}\n")
        else:
            print("No matching text found.\n")

if __name__ == "__main__":
    main()


Original Text A (line 1): 新明解C言語入門編柴田
Best Match for Text A (line 1): 新・明解C言語入門編柴田望洋SBCreative
Highest Jaccard Coefficient: 0.5000

Original Text A (line 2): 解きながら学ぶC言明解きながら学ぶC言語柴田望
Best Match for Text A (line 2): 解きながら学ぶC言語柴田望洋[監修・著]赤尾浩・肘井信一・高木宏典[著]SBCreative
Highest Jaccard Coefficient: 0.3250

Original Text A (line 3): プログラミングROSPythonによるロボットアプリケーション開発MorganQuigleyBrianGerkeyWilliamD.Smart
Best Match for Text A (line 3): プログラミングROSPythonによるロボットアプリケーション開発MorganQuigleyBrianGerkeyWilliamD.Smart河田卓志監訳松田晃一、福地正樹、由谷哲夫訳O'REILLY
Highest Jaccard Coefficient: 0.6618

Original Text A (line 4): KINECTforWindowsSDKプログラミングC++編
Best Match for Text A (line 4): KINECTforWindowsSDKプログラミングC++編株式会社ゲッシュ中村薰齋藤俊太宮城英人〈著>システム
Highest Jaccard Coefficient: 0.5000

Original Text A (line 5): C++111410刷
Best Match for Text A (line 5): C++1114コア言語10刷突破!C++入門書のロングセラー!!~.マイナビ
Highest Jaccard Coefficient: 0.2069

Original Text A (line 6): CUDACCUDAの機能・手法をプロフェッショナルプログラミング網羅的に解説!
Best Match for Text A (line 6): CUDACプロフ