-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
112 lines (93 loc) · 3.77 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import os
import subprocess
import sys
import pandas as pd
from PIL import Image
import pytesseract
import openpyxl
import logging
# This helps in determining if we need to install a module
def is_module_installed(module_name):
try:
__import__(module_name)
return True
except ImportError:
return False
# This function is called if a required module is not installed
def install_module(module_name):
try:
subprocess.check_call([sys.executable, "-m", "pip", "install", module_name])
except subprocess.CalledProcessError as e:
logging.error(f"Error installing {module_name}: {e}")
# Ensures all necessary modules are available for the script to run
def install_required_modules():
required_modules = ["pytesseract", "openpyxl", "pandas"]
for module in required_modules:
if not is_module_installed(module):
install_module(module)
# Function to extract text from an image
def extract_text_from_image(img_path):
try:
text = pytesseract.image_to_string(Image.open(img_path))
logging.info(f"Text extraction successful from {img_path}")
return text.strip()
except Exception as e:
logging.error(f"Error occurred while extracting text from {img_path}: {e}")
return None
'''
This function processes the text to extract
relevant information and saves it to an Excel file
'''
def parse_text_to_excel(text, output, imagefiles):
if text is None:
logging.warning("No text extracted. Excel file will not be created.")
return
try:
lines = text.split('\n')
lines = [line for line in lines if line.strip()]
# (at least 6 lines)
if len(lines) < 6:
logging.warning(f"Extracted text from {imagefiles} does not contain enough data to process.")
return
# Data structure to hold contact names and times seen
data = {"Contact Name": [], "Time Seen": []}
for i in range(5, len(lines), 2):
if i + 1 < len(lines):
data["Contact Name"].append(lines[i])
data["Time Seen"].append(lines[i + 1])
else:
logging.warning(f"Incomplete data for contact name {lines[i]} in {imagefiles}")
if not data["Contact Name"] or not data["Time Seen"]:
logging.warning(f"No valid data extracted from {imagefiles}. Skipping file creation.")
return
df = pd.DataFrame(data)
excel_file = os.path.join(output, f"{imagefiles.split('.')[0]}.xlsx")
# Save the DataFrame to an Excel file
if not os.path.exists(excel_file):
df.to_excel(excel_file, index=False)
logging.info(f"Excel file created at {excel_file}")
else:
with pd.ExcelWriter(excel_file, engine='openpyxl', mode='a') as writer:
df.to_excel(writer, index=False, header=False)
logging.info(f"Data appended to existing Excel file at {excel_file}")
except PermissionError:
logging.error(f"Permission denied: '{excel_file}'")
except Exception as e:
logging.error(f"Error occurred while saving data to Excel: {e}")
# Main function to drive the script
def main():
logging.basicConfig(level=logging.INFO)
# install modules if not already exist
install_required_modules()
# input and output directories
inputfolder = "images"
output = "output"
if not os.path.exists(output):
os.makedirs(output)
for imagefiles in os.listdir(inputfolder):
if imagefiles.lower().endswith((".png", ".jpg", ".jpeg")):
img_path = os.path.join(inputfolder, imagefiles)
extracted_text = extract_text_from_image(img_path)
parse_text_to_excel(extracted_text, output, imagefiles)
if __name__ == "__main__":
main()