In [None]:
import csv
import json
import os
import tkinter
import warnings

import camelot.io as camelot
from camelot import plotting


# Suppress UserWarning messages coming from camelot's stream parser module.
warnings.filterwarnings(
    "ignore",
    category=UserWarning,
    module="camelot.parsers.stream",
)

# Extract tables from page 2 through the last page using the 'stream' flavor.
tables = camelot.read_pdf(
    '2024-TR_UNIT_PRICES_ENG.pdf',
    pages="2-end",
    flavor='stream',
)

print(f"Data from the PDF has been converted into {tables.n} tables.")

In [2]:
# Export the extracted tables in CSV format, using 'foo.csv' as the output path.
# NOTE(review): compress=True presumably makes camelot bundle the exported
# files into an archive — confirm the resulting file name in the camelot docs.
tables.export('foo.csv', f='csv', compress=True)

In [None]:
tables[10].df   # Spot-check one extracted table (index 10) by displaying its DataFrame.

In [None]:
# Display the parsing report camelot attached to the table at index 132.
tables[132].parsing_report

In [None]:
# Print the shape (rows, columns) of every extracted table.
# The loop runs up to tables.n instead of a hard-coded 720, so it neither
# raises IndexError nor skips tables when the PDF yields a different count.
for i in range(tables.n):
    print(f"table[{i}] is  {tables[i].df.shape}")


### Getting a JSON file from the PDF


In [27]:
# The code above exported the PDF tables as CSV.
# This cell writes every non-empty table into one combined JSON file,
# "all_tables.json".

all_tables = []

# Each table is serialized through a short-lived file in "jsons/"; create the
# folder up front so the cell does not fail when it does not exist yet.
os.makedirs("jsons", exist_ok=True)

# Use tables.n rather than a hard-coded 720 so every extracted table is covered.
for i in range(tables.n):
    temp_json_path = f"jsons/temp_{i}.json"
    try:
        tables[i].to_json(temp_json_path)
        with open(temp_json_path, 'r', encoding='utf-8') as file:
            table_dict = json.load(file)
    finally:
        # Remove the temporary file even when the export or the parse fails.
        if os.path.exists(temp_json_path):
            os.remove(temp_json_path)

    if table_dict:  # skip tables whose JSON content is empty
        all_tables.append(table_dict)

with open("all_tables.json", "w", encoding='utf-8') as json_file:
    json.dump(all_tables, json_file, indent=4)


# The cell above converted all the tables extracted from the PDF into a single JSON file. All of the code below cleans up the extracted tables before they are saved as text/CSV files.

In [8]:
# Prepare the source data: normalise the DataFrames of tables 1 through 581.
# NOTE: this cell mutates tables[...].df in place, so running it a second time
# drops one more leading row from tables 3 through 581.
for idx in range(1, 582):
    frame = tables[idx].df

    if idx >= 3:
        # Tables 3..581: discard the first row, renumber the rows from 0 and
        # give the first four positional columns their names.
        frame = frame.drop(index=0).reset_index(drop=True)
        frame = frame.rename(
            columns={0: "id", 1: "description", 2: "unit", 3: "price"}
        )

    # The 'id' column is removed wherever it exists.
    if "id" in frame.columns:
        frame = frame.drop(columns=['id'])

    # In two-column tables the column labelled 'unit' is relabelled 'price'.
    if len(frame.columns) == 2:
        frame = frame.rename(columns={'unit': 'price'})

    # In four-column tables the existing 'price' column becomes 'purchased at'
    # and the column labelled 4 becomes the new 'price'.
    if len(frame.columns) == 4:
        frame = frame.rename(columns={'price': 'purchased at', 4: 'price'})

    tables[idx].df = frame

In [8]:
# Discard the row labelled 0 from the first table and renumber the rest from 0.
first_table = tables[0]
first_table.df = first_table.df.drop(index=0).reset_index(drop=True)

In [9]:
# Name the first four positional columns of the first table.
header_names = {0: "id", 1: "description", 2: "unit", 3: "price"}
tables[0].df = tables[0].df.rename(columns=header_names)

In [None]:
# Save the table at index 300 as a tab-separated file without the row index.
sample_frame = tables[300].df
sample_frame.to_csv("cleaned_data.txt", sep='\t', index=False)

print("The DataFrame was successfully saved to the cleaned_data.txt file.")

In [38]:
# Collect the indices (1 through 582) of the tables that have no 'price' column.
missing_price_indices = [
    idx for idx in range(1, 583) if 'price' not in tables[idx].df.columns
]

In [37]:
 
# Give the tables recorded as lacking a 'price' column the standard names for
# their first four positional columns.
positional_names = {0: "id", 1: "description", 2: "unit", 3: "price"}
for idx in missing_price_indices:
    tables[idx].df = tables[idx].df.rename(columns=positional_names)

In [45]:
# Append the " TRY" suffix to every price in tables 1 through 582, skipping
# the tables listed in missing_price_indices.
# NOTE: re-running this cell appends the suffix again.
for idx in range(1, 583):
    if idx in missing_price_indices:
        continue
    frame = tables[idx].df
    frame['price'] = frame['price'].astype(str) + " TRY"

In [54]:
def _blank_if_only_currency(value):
    """Return "" when the value is just "TRY" (ignoring surrounding whitespace), else the value unchanged."""
    return "" if value.strip() == "TRY" else value


# Tables 1 through 582: clear price cells that hold nothing but the suffix.
for idx in range(1, 583):
    frame = tables[idx].df
    if 'price' not in frame.columns:
        continue
    frame['price'] = frame['price'].apply(_blank_if_only_currency)

In [58]:
# Write the table at index 300 to a tab-separated file, omitting the row index.
sample_frame = tables[300].df
sample_frame.to_csv("cleaned_data.txt", sep='\t', index=False)

print("The DataFrame was successfully saved to the cleaned_data.txt file..")

DataFrame başarılı bir şekilde cleaned_data.txt dosyasına kaydedildi.


In [60]:

# Convert a CSV file into an indented plain-text outline: a row with a single
# non-empty field is written as a heading, and rows with several fields are
# written, indented, beneath the most recent heading.
# NOTE(review): no visible cell writes "cleaned_data.csv" (the earlier cells
# save a tab-separated "cleaned_data.txt") — confirm the input path.
input_file = "cleaned_data.csv"
output_file = "structured_output.txt"

# newline='' is how the csv module expects its input file to be opened.
with open(input_file, mode='r', encoding='utf-8', newline='') as csv_file, \
        open(output_file, mode='w', encoding='utf-8') as txt_file:
    current_heading = None
    for row in csv.reader(csv_file):
        if len(row) == 1 and row[0]:  # This row is a heading
            current_heading = row[0]
            txt_file.write(f"{current_heading}\n")
        elif len(row) > 1 and current_heading:
            # Data rows that appear before the first heading are skipped.
            # At most the first three fields are written; slicing instead of
            # indexing row[2] keeps two-field rows from raising IndexError.
            txt_file.write(f"    {' - '.join(row[:3])}\n")

print("Conversion complete.")

Conversion complete.


In [None]:

# Write each table (indices 1 through 582) to its own text file inside
# the 'text_files' folder, creating the folder if needed.
output_folder = 'text_files'
os.makedirs(output_folder, exist_ok=True)

for table_idx in range(1, 583):
    file_name = f'{output_folder}/dataframe_{table_idx}.txt'
    rendered = tables[table_idx].df.to_string()

    # Use UTF-8 encoding when writing the text file.
    with open(file_name, 'w', encoding='utf-8') as out:
        out.write(rendered)

print("All DataFrames have been converted to text files and saved.")

In [None]:


# Merge the per-table text files into a single file, then delete the originals.
output_folder = 'text_files'
collected_file_name = 'collected.txt'
table_indices = range(1, 583)

with open(collected_file_name, 'w', encoding='utf-8') as collected_file:
    for idx in table_indices:
        file_name = f'{output_folder}/dataframe_{idx}.txt'

        try:
            with open(file_name, 'r', encoding='utf-8') as source:
                content = source.read()
        except FileNotFoundError:
            print(f'{file_name} not found, this file has been skipped.')
            continue

        # Each file's content is preceded by a banner naming its source file
        # and followed by a blank-line separator.
        collected_file.write(f'--- Content of dataframe_{idx}.txt ---\n')
        collected_file.write(content)
        collected_file.write('\n\n')

print(f"All text files are consolidated into a single file: {collected_file_name}")

# Remove the individual files now that their content has been collected.
for idx in table_indices:
    file_name = f'{output_folder}/dataframe_{idx}.txt'
    try:
        os.remove(file_name)
    except FileNotFoundError:
        print(f'{file_name} not found, this file may have already been deleted.')
    else:
        # "silindi" is Turkish for "deleted".
        print(f'{file_name} silindi.')

print("All text files have been deleted.")
