In [None]:
import os
import shutil

def organize_files(input_folder, output_folder):
    """Copy every file under ``input_folder`` into a per-type folder.

    The input tree is walked recursively and flattened: each file is copied
    with ``shutil.copy2`` into one of four sub-folders of ``output_folder``,
    chosen by its lower-cased extension:

    * ``.xlsx``     -> ``Excels``
    * ``.pptx``     -> ``PPTXs``
    * ``.pdf``      -> ``PDFs``
    * anything else -> ``Others``

    The four destination folders are created if they do not exist. They are
    excluded from the walk, so calling this with an ``output_folder`` that
    lives inside (or equals) ``input_folder`` no longer tries to copy
    already-organised files onto themselves (``shutil.SameFileError``).

    Because the tree is flattened, files that share a name but live in
    different sub-folders map to the same destination; the one visited last
    overwrites the earlier copy.

    Args:
        input_folder: Root of the tree to read files from.
        output_folder: Folder in which the per-type sub-folders are created.

    Returns:
        None. One progress line is printed per copied file.
    """
    # extension -> (destination folder name, label used in the progress line)
    # NOTE(review): the JSON conversion step in this file also reads an
    # 'HTMLs' folder, which this function never fills (.html files land in
    # 'Others') -- confirm whether that is intended.
    routes = {
        '.xlsx': ('Excels', 'Excel'),
        '.pptx': ('PPTXs', 'PPTX'),
        '.pdf': ('PDFs', 'PDF'),
    }
    fallback = ('Others', 'Other')

    # Create the destination folders up front.
    destinations = {}
    for folder_name, _label in (*routes.values(), fallback):
        folder_path = os.path.join(output_folder, folder_name)
        os.makedirs(folder_path, exist_ok=True)
        destinations[folder_name] = folder_path

    def _canonical(path):
        # Resolve symlinks / relative parts so equal folders compare equal.
        return os.path.normcase(os.path.realpath(path))

    excluded = {_canonical(path) for path in destinations.values()}

    # Walk the input folder recursively.
    for root, dirs, files in os.walk(input_folder):
        # Prune destination folders in place so os.walk never descends into
        # them when the output lives inside the input tree.
        dirs[:] = [
            name for name in dirs
            if _canonical(os.path.join(root, name)) not in excluded
        ]

        for filename in files:
            input_path = os.path.join(root, filename)

            # Route on the lower-cased extension; unknown types go to Others.
            file_extension = os.path.splitext(filename)[1].lower()
            folder_name, label = routes.get(file_extension, fallback)

            shutil.copy2(input_path, os.path.join(destinations[folder_name], filename))
            print(f"Copied {filename} to {label} folder.")

# Source tree to organise, and the folder that receives the per-type folders
input_folder, output_folder = '../data/original_docs', '../data'

# Sort the files into their per-type folders
organize_files(input_folder=input_folder, output_folder=output_folder)


In [None]:
import os
import json
import pandas as pd
from PyPDF2 import PdfReader
from bs4 import BeautifulSoup
from langchain_community.document_loaders import UnstructuredPowerPointLoader
from dotenv import load_dotenv

# Load environment variables (python-dotenv)
load_dotenv()

# Conversion functions, one per file type
def _json_safe_value(value):
    """Return ``value`` in a form that ``json.dump`` can serialise.

    Missing values (``NaN``, ``NaT``, ``None``) become ``None``; date-like
    values exposing ``isoformat()`` (e.g. ``pandas.Timestamp``) become ISO
    strings; everything else is returned unchanged.
    """
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return None
    if hasattr(value, 'isoformat'):
        return value.isoformat()
    return value


def excel_to_json(file_path):
    """Read every sheet of an Excel workbook into a JSON-serialisable dict.

    Each sheet is loaded with ``pandas.read_excel(sheet_name=None)`` and
    turned into a list of row dicts. Cell values are normalised so the
    result can be written with ``json.dump``: empty cells become ``None``
    (instead of ``NaN``, which is not valid JSON) and date cells become ISO
    8601 strings (instead of ``Timestamp`` objects, which ``json`` rejects).

    Args:
        file_path: Path of the workbook to read.

    Returns:
        ``{"content": {sheet_name: [row_dict, ...]}, "metadata": {...}}``
        where the metadata holds the file name, path, size in bytes, number
        of sheets, and the raw ``os.path.getctime`` / ``os.path.getmtime``
        timestamps (floats). Note that ``getctime`` is the creation time on
        Windows but the last metadata change on Unix.
    """
    dfs = pd.read_excel(file_path, sheet_name=None)
    data = {
        sheet_name: [
            {column: _json_safe_value(value) for column, value in row.items()}
            for row in df.to_dict(orient='records')
        ]
        for sheet_name, df in dfs.items()
    }

    metadata = {
        "filename": os.path.basename(file_path),
        "file_path": file_path,
        "file_size": os.path.getsize(file_path),
        "sheet_count": len(dfs),
        "created_time": os.path.getctime(file_path),
        "modified_time": os.path.getmtime(file_path)
    }
    return {"content": data, "metadata": metadata}

def pdf_to_json(file_path):
    """Extract the text of every page of a PDF into a dict.

    Args:
        file_path: Path of the PDF to read with ``PdfReader``.

    Returns:
        ``{"content": {"page_1": text, "page_2": text, ...}, "metadata": {...}}``.
        Pages are numbered from 1 in reading order. The metadata holds the
        file name, path, size in bytes, page count, the raw
        ``os.path.getctime`` / ``os.path.getmtime`` values and a fixed list
        of tags.
    """
    pdf = PdfReader(file_path)
    text_by_page = {
        f'page_{number}': page.extract_text()
        for number, page in enumerate(pdf.pages, start=1)
    }

    # NOTE(review): the tags are hard-coded and identical for every PDF --
    # confirm that this is intended.
    details = dict(
        filename=os.path.basename(file_path),
        file_path=file_path,
        file_size=os.path.getsize(file_path),
        page_count=len(pdf.pages),
        created_time=os.path.getctime(file_path),
        modified_time=os.path.getmtime(file_path),
        tags=["Expanded Edition", "Product Specialist"],
    )
    return {"content": text_by_page, "metadata": details}

def pptx_to_json(file_path):
    """Load a PowerPoint file and return its content plus file metadata.

    Args:
        file_path: Path of the presentation, passed to
            ``UnstructuredPowerPointLoader``.

    Returns:
        ``{"content": {"slides": [...]}, "metadata": {...}}`` where each
        entry of ``slides`` carries the ``page_content`` and ``metadata`` of
        one document returned by the loader. The file metadata holds the
        file name, path, size in bytes, ``slide_count`` and the raw
        ``os.path.getctime`` / ``os.path.getmtime`` values.
    """
    loader = UnstructuredPowerPointLoader(file_path)
    documents = loader.load()

    slide_entries = []
    for document in documents:
        slide_entries.append({
            "page_content": document.page_content,
            "metadata": document.metadata,
        })

    # NOTE(review): slide_count is the number of documents the loader
    # returned, which may not equal the number of slides depending on the
    # loader's mode -- confirm.
    details = dict(
        filename=os.path.basename(file_path),
        file_path=file_path,
        file_size=os.path.getsize(file_path),
        slide_count=len(documents),
        created_time=os.path.getctime(file_path),
        modified_time=os.path.getmtime(file_path),
    )
    return {"content": {"slides": slide_entries}, "metadata": details}

def html_to_json(file_path):
    """Extract the visible text of an HTML file plus file metadata.

    The file is read as UTF-8. If it is not valid UTF-8 (for example a
    Shift_JIS or EUC-JP page), the raw bytes are handed to BeautifulSoup
    instead so that it can detect the encoding itself, rather than failing
    with ``UnicodeDecodeError``. ``<script>`` and ``<style>`` elements are
    removed before the text is extracted.

    Args:
        file_path: Path of the HTML file to read.

    Returns:
        ``{"content": text, "metadata": {...}}`` where ``text`` is the
        document text with nodes separated by newlines and surrounding
        whitespace stripped, and the metadata holds the file name, path,
        size in bytes and the raw ``os.path.getctime`` /
        ``os.path.getmtime`` values.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            html_content = file.read()
    except UnicodeDecodeError:
        # Not UTF-8: pass bytes so BeautifulSoup sniffs the real encoding
        # (BOM / <meta charset>) instead of aborting the conversion.
        with open(file_path, 'rb') as file:
            html_content = file.read()

    soup = BeautifulSoup(html_content, 'html.parser')
    # Drop non-visible content before collecting the text.
    for script in soup(['script', 'style']):
        script.decompose()
    text_content = soup.get_text(separator='\n').strip()

    metadata = {
        "filename": os.path.basename(file_path),
        "file_path": file_path,
        "file_size": os.path.getsize(file_path),
        "created_time": os.path.getctime(file_path),
        "modified_time": os.path.getmtime(file_path)
    }
    return {"content": text_content, "metadata": metadata}

# Convert the organised files to JSON and write them into the "all_JSONs" folder
def convert_files_to_json(input_folder, output_folder):
    """Convert every file in the per-type folders to a JSON file.

    For each of the ``Excels``, ``PPTXs``, ``PDFs`` and ``HTMLs`` folders
    found directly under ``input_folder``, every regular file is passed to
    the matching converter and the result is written to
    ``<output_folder>/all_JSONs/<original stem>.json`` (UTF-8, indented,
    non-ASCII characters kept as-is). Sub-directories are skipped, missing
    folders are ignored, and an existing JSON file with the same name is
    overwritten after a notice is printed.

    Args:
        input_folder: Folder containing the per-type sub-folders.
        output_folder: Folder in which ``all_JSONs`` is created.

    Returns:
        None. Progress is reported with ``print``.
    """
    dispatch = {
        "Excels": excel_to_json,
        "PPTXs": pptx_to_json,
        "PDFs": pdf_to_json,
        "HTMLs": html_to_json
    }

    json_dir = os.path.join(output_folder, 'all_JSONs')
    os.makedirs(json_dir, exist_ok=True)

    for subdir, to_json in dispatch.items():
        print(f"Processing folder: {subdir}")
        source_dir = os.path.join(input_folder, subdir)
        if not os.path.exists(source_dir):
            continue

        for entry in os.listdir(source_dir):
            source_path = os.path.join(source_dir, entry)
            if os.path.isdir(source_path):
                continue

            # The JSON file keeps the source file's name without its extension.
            stem, _ext = os.path.splitext(entry)
            target_path = os.path.join(json_dir, f"{stem}.json")
            if os.path.exists(target_path):
                print(f"JSON file already exists, overwriting: {target_path}")

            # Convert first, so the target is only opened once data is ready.
            payload = to_json(source_path)
            with open(target_path, 'w', encoding='utf-8') as out:
                json.dump(payload, out, ensure_ascii=False, indent=4)
            print(f"Converted and copied file: {source_path} to {target_path}")

# The per-type folders are read from, and all_JSONs is written to, the same folder
input_folder = output_folder = '../data'

# Convert the organised files to JSON
convert_files_to_json(input_folder=input_folder, output_folder=output_folder)
