In [1]:
import fitz  # PyMuPDF
import os
from tqdm import tqdm 
import openai


In [2]:
def get_filename_without_extension(file_path):
    return os.path.splitext(os.path.basename(file_path))[0]

In [3]:
#フォルダパス内のファイルパスを取得する関数
def get_file_paths(folder_path):
    file_paths = []
    for root, _, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            file_paths.append(file_path)
    return file_paths

In [4]:
#PDFからドキュメントを取得
def read_pdf(file_path):
    pdf_document = fitz.open(file_path)
    full_text = ""
    for page_num in range(pdf_document.page_count):
        page = pdf_document.load_page(page_num)
        full_text += page.get_text("text")
    return full_text


In [18]:
#set_path

pdf_folder_path = "pdf/"
text_folder_path = "text/"
pdf_text_folder_path = text_folder_path + "pdf_text/"
pdf_summarize_text_folder_path = text_folder_path + "pdf_summarize_text/"

### 1.abstract text from pdf
- https://qiita.com/akiraokusawa/items/ba83893669484d33067c

In [19]:
# get_path
pdf_paths = get_file_paths(pdf_folder_path)

In [7]:
# #save_text
# for pdf_path in tqdm(pdf_paths):
#     text = read_pdf(pdf_path)
#     file_name = get_filename_without_extension(pdf_path)
#     with open( pdf_text_folder_path+ f'{file_name}.txt', 'w', encoding='utf-8') as f:
#         f.write(text)
#         f.close()

### 2.summarize text with 4omini

In [21]:
import os
import base64
import numpy as np
from openai import AzureOpenAI
from dotenv import load_dotenv
load_dotenv()

import time

import re
from janome.tokenizer import Tokenizer
import unicodedata

In [22]:
#set env
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
API_VERSION = os.getenv("API_VERSION")
DEPLOYMENT_ID_FOR_CHAT_COMPLETION = os.getenv("DEPLOYMENT_ID_FOR_CHAT_COMPLETION")

In [23]:
def get_text_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read()

In [24]:

def generate_summary(text):
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": f"Summarize the following text : \"{text}\""}
        # {"role": "user", "content": f"Summarize the following text with emphasis on keywords like revenue, profit, sales performance, market trends, issues, improvements, and personnel changes: \"{text}\""}
    ]

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages
    )
    
    return response.choices[0].message["content"]

In [25]:
def summarize_text(text):
    # content =  [
    #     {"role": "system", "content": "You are a helpful assistant."},

    #     {"role": "user", "content": f"Summarize the following text : \"{text}\""}
    #     ]

    content =  [
        {"role": "system", "content": "あなたはテキストを要約してmarkdown形式で出力します。"},

        {"role": "user", "content": f"次の文章を要約してください。ただし特殊な記号や改行コードは削除してから要約してください。 : \"{text}\""}
        ]


    client = AzureOpenAI(
        api_key=AZURE_OPENAI_API_KEY,
        api_version=API_VERSION,
        azure_endpoint=AZURE_OPENAI_ENDPOINT)

    completion_resp = client.chat.completions.create(
        model=DEPLOYMENT_ID_FOR_CHAT_COMPLETION,
        temperature=0,
        seed=42,
        messages=content
        )
    
    return completion_resp.choices[0].message.content

In [33]:
# テキストの正規化
def normalize_text(text):
    # 全角文字を半角文字に変換
    text = unicodedata.normalize('NFKC', text)
    return text

# 特殊文字と改行コードの除去
def remove_special_characters(text):
    # 改行コード、特殊文字、数字を除去（必要に応じて正規表現を調整）
    text = re.sub(r'\n|\r', '', text)
    text = re.sub(r'[^ぁ-んァ-ン一-龥a-zA-Z0-9\s]', '', text)
    return text

def contains_hiragana(text):
    # ひらがなを判定するための正規表現
    hiragana_pattern = re.compile('[\u3040-\u309F]')
    
    # テキストにひらがなが含まれるかをチェック
    if hiragana_pattern.search(text):
        return True
    else:
        return False

def delete_few_info_lines(text):
    # 改行コードで分割してリストに格納
    lines = text.split('\n')
    # 10文字以下の行を削除
    filtered_lines = [line for line in lines if len(line) > 20]
    # フィルタリングされた行を再結合して1つの文字列にする
    result = '\n'.join(filtered_lines)
    return result

def delete_url_lines(text):
    # 改行コードで分割してリストに格納
    lines = text.split('\n')

    filtered_lines = [line for line in lines if not "http" in line]

    # フィルタリングされた行を再結合して1つの文字列にする
    result = '\n'.join(filtered_lines)
    return result

def delete_reference_lines(text):
    # 改行コードで分割してリストに格納
    lines = text.split('\n')

    filtered_lines = [line for line in lines if not line.startswith("※")]

    # フィルタリングされた行を再結合して1つの文字列にする
    result = '\n'.join(filtered_lines)
    return result

def delete_not_document_lines(text):
    #ひらがなが含まれない（文章ではない）行を削除
    lines = text.split('\n')

    filtered_lines = [line for line in lines if contains_hiragana(line)]

    # フィルタリングされた行を再結合して1つの文字列にする
    result = '\n'.join(filtered_lines)
    return result



# 前処理関数
def preprocess_japanese_text(text):
    text = normalize_text(text)
    text = delete_few_info_lines(text)
    text = delete_not_document_lines(text)
    text = delete_url_lines(text)
    text = delete_reference_lines(text)
    text = remove_special_characters(text)
    
    # tokenizer = Tokenizer()
    # tokens = tokenizer.tokenize(text, wakati=True)
    # processed_text = ' '.join(tokens)
    return text


In [34]:
import tiktoken


encoding_35 = tiktoken.encoding_for_model("gpt-3.5-turbo-0301")
encoding_4 = tiktoken.encoding_for_model("gpt-4-0314")

def calc_token(chat, encoding_name):
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(chat))
    return num_tokens




In [35]:
# get_path
text_paths = get_file_paths(pdf_text_folder_path)


In [44]:
#save_text
for text_path in tqdm(text_paths):
    file_name = get_filename_without_extension(text_path)
    text = get_text_from_file(text_path)
    text = text.replace(" ","")
    text = preprocess_japanese_text(text)


    while True:
        token = calc_token(text, encoding_4.name)
        print(file_name,len(text),token)
        if token <170000:
            break
        else:
            text = text[:-1000]

    calc_token(text, encoding_4.name)

    summary = summarize_text(text)
    with open( pdf_summarize_text_folder_path+ f'{file_name}.txt', 'w', encoding='utf-8') as f:
        f.write(summary)
        f.close()
    

  0%|          | 0/19 [00:00<?, ?it/s]

1 10411 11790


  5%|▌         | 1/19 [00:05<01:36,  5.34s/it]

10 166163 185877
10 165163 184816
10 164163 183701
10 163163 182584
10 162163 181519
10 161163 180469
10 160163 179371
10 159163 178474
10 158163 177423
10 157163 176276
10 156163 175143
10 155163 174019
10 154163 172850
10 153163 171614
10 152163 170454
10 151163 169304


 11%|█         | 2/19 [00:41<06:35, 23.27s/it]

11 52504 59891


 16%|█▌        | 3/19 [00:51<04:38, 17.40s/it]

12 40326 45840


 21%|██        | 4/19 [01:00<03:31, 14.08s/it]

13 38793 44129


 26%|██▋       | 5/19 [01:07<02:39, 11.43s/it]

14 26710 30953


 32%|███▏      | 6/19 [01:14<02:11, 10.08s/it]

15 35233 41948


 37%|███▋      | 7/19 [01:22<01:50,  9.22s/it]

16 52048 58367


 42%|████▏     | 8/19 [01:31<01:41,  9.23s/it]

17 55249 61399


 47%|████▋     | 9/19 [01:42<01:37,  9.79s/it]

18 46975 51860


 53%|█████▎    | 10/19 [01:55<01:36, 10.67s/it]

19 63901 71378


 58%|█████▊    | 11/19 [02:06<01:28, 11.03s/it]

2 70738 79481


 63%|██████▎   | 12/19 [02:20<01:22, 11.81s/it]

3 126192 140116


 68%|██████▊   | 13/19 [02:32<01:11, 11.91s/it]

4 60372 67921


 74%|███████▎  | 14/19 [02:40<00:53, 10.72s/it]

5 37690 42958


 79%|███████▉  | 15/19 [02:48<00:38,  9.71s/it]

6 52329 60180


 84%|████████▍ | 16/19 [02:58<00:29,  9.92s/it]

7 67461 76491


 89%|████████▉ | 17/19 [03:08<00:19,  9.93s/it]

8 148998 163103


 95%|█████████▍| 18/19 [03:35<00:15, 15.19s/it]

9 55850 63394


100%|██████████| 19/19 [03:46<00:00, 11.93s/it]
