# Задачи на проверку того, можно ли определить читаемость PDF

### Импорты

In [61]:
import os

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from supabase import create_client, Client
from dotenv import load_dotenv

import PyPDF2
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
from PIL import Image
from pdf2image import convert_from_path

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


### SNS settings

In [2]:
# Configure seaborn: apply the "whitegrid" style first, then switch the
# active color palette to "pastel".
sns.set(style="whitegrid")
sns.set_palette("pastel")

### Supabase settings

In [62]:
# load_dotenv() is called before the environment lookups below — presumably
# it populates os.environ from a local .env file (TODO confirm its location).
load_dotenv()

# NOTE: os.environ.get() returns None when a variable is not set, so both
# values may actually be None despite the `str` annotations.
url: str = os.environ.get("SUPABASE_URL")
key: str = os.environ.get("SUPABASE_KEY")
# Client object used by the later cells for every request to Supabase.
supabase: Client = create_client(url, key)

### Считываем пути к данным

In [50]:
# Walk ./data/readable recursively and remember the full path of every file
# whose name ends in ".pdf".
Readable_books_paths = []
for root, dirs, files in os.walk("./data/readable"):
    pdf_names = [name for name in files if name.endswith(".pdf")]
    Readable_books_paths.extend(os.path.join(root, name) for name in pdf_names)

# Single-column frame: one row per discovered PDF path.
df = pd.DataFrame(columns=["Readable_books"])
df["Readable_books"] = Readable_books_paths

In [51]:
# Preview the first rows of the collected book paths.
df.head()

Unnamed: 0,Readable_books
0,./data/readable\OOP_s_pom_ Python.pdf
1,./data/readable\PyCharm_Профессиональная_работ...
2,./data/readable\PyCharm_Профессиональная_работ...
3,./data/readable\Python 3. Самое необходимое. 2...
4,./data/readable\Python. Наиболее полное руково...


### Получить текст из PDF

In [52]:
def extract_text_from_pdf(pdf_path):
    """Return the text chunks found on the pages of a PDF.

    Every page produced by pdfminer's ``extract_pages`` is scanned and the
    text of each direct child element that is an ``LTTextContainer`` is
    collected, in page order. Elements nested inside other layout objects
    are not visited.

    Args:
        pdf_path: Location of the PDF, passed straight to ``extract_pages``.

    Returns:
        A list of strings, one per text container encountered.
    """
    # The imports are kept local to the function — presumably so it stays
    # self-contained when executed in pandarallel worker processes
    # (it is invoked via parallel_apply); confirm before hoisting them.
    from pdfminer.high_level import extract_pages
    from pdfminer.layout import LTTextContainer

    return [
        item.get_text()
        for page in extract_pages(pdf_path)
        for item in page
        if isinstance(item, LTTextContainer)
    ]

In [53]:
# Run extract_text_from_pdf on every book path and store the result in a new
# "text" column; parallel_apply is provided by pandarallel, which is
# initialized in the imports cell.
df["text"] = df["Readable_books"].parallel_apply(extract_text_from_pdf)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4), Label(value='0 / 4'))), HBox(c…

In [55]:
# Save the frame to books_text.xlsx using the openpyxl engine, leaving out
# the index column.
df.to_excel("books_text.xlsx", index=False, engine='openpyxl')

In [34]:
# Preview the first rows of df.
# NOTE(review): the output shown below belongs to an earlier execution
# (In [34]) and includes an Unreadable_books column that none of the visible
# cells create — re-run this cell to see the current frame.
df.head()

Unnamed: 0,Readable_books,Unreadable_books,text
0,./data/readable\OOP_s_pom_ Python.pdf,./data/unreadable\Язык программирования Python...,I R V K A L B\nOBJECT-ORIENTED \nPYTHON\nMAS...
1,./data/readable\PyCharm_Профессиональная_работ...,,Брюс М. Ван Хорн II\nКуан Нгуен\nPyCharm: \nпр...
2,./data/readable\PyCharm_Профессиональная_работ...,,Брюс М. Ван Хорн II\nКуан Нгуен\nPyCharm: \nпр...
3,./data/readable\Python 3. Самое необходимое. 2...,,4\nОглавление\n4.4. Функции rangeO и enumerate...
4,./data/readable\Python. Наиболее полное руково...,,(cid:1)(cid:3)(cid:4)(cid:5)(cid:6)(cid:7)(cid...


In [67]:
# Upload each book from df to the Supabase "books" table
# (columns: id, name, book_text). One insert request is sent per book.
for book_path, book_text in zip(df["Readable_books"], df["text"]):
    record = {"name": book_path, "book_text": book_text}
    supabase.table("books").insert([record]).execute()

2024-03-12 17:51:27,541:INFO - HTTP Request: POST https://kapxuliiifmyfljppsyp.supabase.co/rest/v1/books "HTTP/1.1 201 Created"
2024-03-12 17:51:30,164:INFO - HTTP Request: POST https://kapxuliiifmyfljppsyp.supabase.co/rest/v1/books "HTTP/1.1 201 Created"
2024-03-12 17:51:32,508:INFO - HTTP Request: POST https://kapxuliiifmyfljppsyp.supabase.co/rest/v1/books "HTTP/1.1 201 Created"
2024-03-12 17:51:34,763:INFO - HTTP Request: POST https://kapxuliiifmyfljppsyp.supabase.co/rest/v1/books "HTTP/1.1 201 Created"
2024-03-12 17:51:37,970:INFO - HTTP Request: POST https://kapxuliiifmyfljppsyp.supabase.co/rest/v1/books "HTTP/1.1 201 Created"
2024-03-12 17:51:41,285:INFO - HTTP Request: POST https://kapxuliiifmyfljppsyp.supabase.co/rest/v1/books "HTTP/1.1 201 Created"
2024-03-12 17:51:42,906:INFO - HTTP Request: POST https://kapxuliiifmyfljppsyp.supabase.co/rest/v1/books "HTTP/1.1 201 Created"
2024-03-12 17:51:44,315:INFO - HTTP Request: POST https://kapxuliiifmyfljppsyp.supabase.co/rest/v1/books

In [83]:
# Fetch the name and book_text columns for all rows of the books table
# (the query applies no filter, ordering or limit).
response = (
    supabase.table("books")
    .select("name, book_text")
    .execute()
)

2024-03-12 18:06:04,723:INFO - HTTP Request: GET https://kapxuliiifmyfljppsyp.supabase.co/rest/v1/books?select=name%2C%20book_text "HTTP/1.1 200 OK"


In [91]:
# Sanity check: print the length of book_text in the first fetched record and
# the length of the extracted text in the first row of df.
# NOTE(review): the select query specifies no ordering, so response.data[0]
# is not guaranteed to be the same book as df row 0 — confirm before relying
# on this comparison.
print(len(response.data[0].get("book_text")))
print(len(df["text"].iloc[0]))

6770
6770
