In [2]:
import os
from collections import Counter

import cv2
import pandas as pd
import pytesseract
from pytesseract import Output

from src.parse_text_files import create_skills_list, calculate_bar_length_and_ratio

# Path to the Tesseract executable that pytesseract should invoke.
# The TESSERACT_CMD environment variable, when set, overrides the default
# Windows install location so the notebook can run on machines where
# Tesseract is installed elsewhere; when unset, behaviour is unchanged.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    'TESSERACT_CMD', r'C:\Program Files\Tesseract-OCR\tesseract'
)

In [5]:
# Relative path of the resume screenshot to analyse.
# NOTE(review): the file name spelling is kept exactly as-is; presumably it
# matches the file on disk — confirm before "correcting" it.
cv_image_path = './data/resume examle.png'

In [6]:
# Load the image. cv2.imread reports a missing or unreadable file by
# returning None instead of raising, so fail here with a clear message
# rather than with an opaque error inside cvtColor.
image = cv2.imread(cv_image_path)
if image is None:
    raise FileNotFoundError(f"Could not read image at {cv_image_path!r}")
# Convert to grayscale; this is the image later scanned for skill bars
gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# Inverse binary threshold to aid OCR: pixels brighter than 165 become 0,
# everything else becomes 255
_, thresh_image = cv2.threshold(gray_image, 165, 255, cv2.THRESH_BINARY_INV)
# OCR to extract text and their bounding boxes
data = pytesseract.image_to_data(thresh_image, output_type=Output.DICT)

In [37]:
# 'data' is expected to be the dictionary produced by the pytesseract OCR step.
# NOTE(review): the two strings are presumably the headings that delimit the
# skills section of the resume — confirm against create_skills_list.
start_marker, end_marker = 'SKILLS', 'CERTIFICATES'
skills_list = create_skills_list(data, start_marker, end_marker)
print(skills_list)

['project management', 'teamwork', 'time management', 'emotional intelligence', 'public speaking', 'leadership', 'critical thinking', 'storytelling', 'marketing', 'advertising']


In [38]:
# Reduce every skill to its final word. The bar scan further down matches
# single OCR text entries against these keywords and starts looking for the
# bar just right of the matched entry.
keywords = [skill.split()[-1] for skill in skills_list]

In [39]:
# Drop duplicate keywords. Sorting makes the resulting order reproducible:
# the iteration order of a set of strings can change between kernel sessions.
keywords = sorted(set(keywords))

In [58]:
# Horizontal gap (in pixels) skipped after a skill label before scanning
# for its bar; adjust as necessary.
BAR_MARGIN_PX = 20

# Set membership keeps the per-entry keyword check cheap.
keyword_lookup = set(keywords)
# The scan for a bar runs up to the right edge of the image.
bar_region_end_x = image.shape[1]

# Collected (skill keyword, ratio) pairs; a keyword may appear more than once
# if the OCR output contains the same word several times.
temp_list = []

for text, x, y, w, h in zip(data['text'], data['left'], data['top'],
                            data['width'], data['height']):
    skill = text.strip().lower()
    if skill not in keyword_lookup:
        continue

    # Region where the bar is expected: right of the text's bounding box.
    bar_region_start_x = x + w + BAR_MARGIN_PX

    # Scan horizontally along the vertical middle of the text.
    middle_y = y + h // 2

    # Pixel intensities along that row of the grayscale image, with pure
    # white (255) pixels discarded.
    line_pixels = gray_image[middle_y][bar_region_start_x:bar_region_end_x]
    line_pixels = line_pixels[line_pixels != 255]

    if line_pixels.size == 0:
        # Nothing but white (or nothing at all) to the right of this word,
        # so there is no bar to measure; skipping avoids an IndexError on
        # the most_common lookup below.
        continue

    # The most common non-white intensity is treated as the bar's colour.
    bar_value = Counter(line_pixels).most_common(1)[0][0]

    # The helper returns three values; only the ratio is used here.
    _bar_length, _bar_value_count, ratio = calculate_bar_length_and_ratio(line_pixels, bar_value)

    temp_list.append((skill, ratio))

In [56]:
# Keep the highest ratio measured for each keyword, ordered from highest to
# lowest. The 'Ratio' column is selected explicitly: the first positional
# argument of GroupBy.max() is `numeric_only`, not a column name.
summary = (
    pd.DataFrame(temp_list, columns=['Skill', 'Ratio'])
    .groupby('Skill', as_index=False)['Ratio']
    .max()
    .sort_values(by='Ratio', ascending=False)
    .reset_index(drop=True)
)

# keyword -> best ratio
ratio_lookup = dict(zip(summary['Skill'], summary['Ratio']))

# Pair every full skill name with the ratio found for its last word;
# skills whose last word was never matched get None.
mapped_skills = [
    (skill, ratio_lookup.get(skill.split()[-1]))
    for skill in skills_list
]

# Create the new DataFrame
mapped_skills_df = pd.DataFrame(mapped_skills, columns=['Skill', 'Ratio'])


In [57]:
# Display each full skill name next to the ratio looked up for its last word.
mapped_skills_df

Unnamed: 0,Skill,Ratio
0,project management,1.0
1,teamwork,0.968127
2,time management,1.0
3,emotional intelligence,0.941909
4,public speaking,0.879518
5,leadership,0.882591
6,critical thinking,0.908367
7,storytelling,0.844898
8,marketing,0.810811
9,advertising,0.351827
