In [18]:
import os
import requests
from PIL import Image
import pytesseract
import pickle
import time
screenshot_path = './captured/tda_lab_member_page.png'

In [19]:
# Function to perform OCR on the screenshot
def perform_ocr(image_path):
    image = Image.open(image_path)
    text = pytesseract.image_to_string(image)
    return text

# Function to classify names using the loaded model
def classify_names(names, vectorizer, model):
    turkish_names = []
    for name in names:
        name_vectorized = vectorizer.transform([name])
        is_turkish = model.predict(name_vectorized)[0] == 1
        if is_turkish:
            turkish_names.append(name)
    return turkish_names

# Load your trained model and vectorizer
with open('tfidf_vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)

with open('logistic_regression_model.pkl', 'rb') as f:
    clf = pickle.load(f)

In [20]:
extracted_text = perform_ocr(screenshot_path)

# Save the extracted text to a file
with open('extracted_text.txt', 'w') as file:
    file.write(extracted_text)

In [21]:
import spacy

# Load the spaCy model
nlp = spacy.load('en_core_web_sm')

# Your text goes here
# load the text from the file
with open('extracted_text.txt', 'r') as file:
    text = file.read()

# Process the text
doc = nlp(text)

# Extract and print names
extracted_names = set()  # Using a set to avoid duplicate names

for ent in doc.ents:
    if ent.label_ == 'PERSON':
        extracted_names.add(ent.text.strip())

names = [] 
# Display the extracted names, and convert these into title case
for name in extracted_names:
    names.append(name.title())

In [22]:
names

['Jonathan Kho',
 'Summer Intern',
 'Jordan',
 'Ben Cobb',
 '@/]\\B',
 'Asst',
 'Xusheng Wang',
 'James Fox',
 'Guangjun Xu',
 'Kemal Eren',
 'Mehmet Deveci',
 'Jonathan Kho',
 'Toulouse',
 'Doruk Bozdag',
 'Prabha Kumarasamy',
 'Metin',
 'Anas Abu-Doleh',
 'Catalyiirek',
 'Lingchen Xiong',
 'Catalytrek',
 'Umit Catalyurek',
 'Anne Benoit',
 'Onur Kucuktunc',
 'Shiv Kumar',
 'Mustafa Kemal Tas',
 'Catalytirek',
 'Olcay Sertel',
 'Kasimir Gabert',
 'S. Krishnamoorthy',
 'James Fox',
 'Zheng Zhou',
 'Julien Herrmann',
 'O. Bas',
 'M. Yusuf Ozkaya',
 'Zheng Zhou',
 'Senturk',
 'A. Kalyanaraman',
 'Muge Kural',
 'Tim Hartley',
 'P. Sadayappan',
 'Yusuf Ozkaya',
 'Erik Saule',
 'Catalyurek']

In [23]:
# now from these, let's classify the Turkish names, if the names are not made up AT least two words, then we will remove them

turkish_names = classify_names(names, vectorizer, clf)

for name in turkish_names:
    if len(name.split()) < 2:
        turkish_names.remove(name)


In [24]:
turkish_names

['Kemal Eren',
 'Mehmet Deveci',
 'Doruk Bozdag',
 'Onur Kucuktunc',
 'Mustafa Kemal Tas',
 'Olcay Sertel',
 'M. Yusuf Ozkaya',
 'Muge Kural',
 'Yusuf Ozkaya']