In [5]:
# 📦 IMPORTS
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nest_asyncio
import asyncio
from googletrans import Translator

# Patch asyncio for interactive environments (Jupyter, VS Code interactive, etc.)
nest_asyncio.apply()

# Configure Tesseract: location of the executable that pytesseract invokes
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Path to the résumé (CV) PDF
resume_path = r"C:\Users\khmir\Desktop\cvs\khmiri_iheb_tun_fr.pdf"

# Job description (in English) that the résumé is compared against
job_description = """
We are looking for a Data Scientist intern who has experience with Python, machine learning algorithms, and data analysis. 
The candidate will be responsible for data cleaning, feature engineering, and building predictive models.
"""

# Function: extract the text of a PDF via OCR (French language model)
def extract_text_from_pdf_ocr(pdf_path):
    """Return the OCR text of every page of a PDF, recognised as French.

    Each page is rendered to a 300-DPI PNG with PyMuPDF and handed to
    Tesseract with ``lang="fra"``.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The recognised text of all pages in document order, each page's
        text followed by a newline. An empty string if there are no pages.
    """
    page_texts = []
    # The context manager closes the document even if rendering or OCR
    # raises, so the file handle is not leaked.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            pix = page.get_pixmap(dpi=300)
            # Round-trip through PNG bytes to obtain a PIL image for Tesseract.
            with Image.open(io.BytesIO(pix.tobytes("png"))) as img:
                page_texts.append(pytesseract.image_to_string(img, lang="fra"))
    # Single join instead of repeated string concatenation in the loop.
    return "".join(f"{text}\n" for text in page_texts)

# Main async function (translation + matching)
async def main():
    """OCR the résumé, translate it to English and score it against the job.

    Reads the module-level ``resume_path`` and ``job_description``.
    Steps:

    1. OCR the PDF with ``extract_text_from_pdf_ocr``.
    2. Translate the French text to English with googletrans.
    3. Fit a TF-IDF model on the two documents and print their cosine
       similarity as a percentage.
    4. Print up to ten terms that have non-zero weight in both documents,
       ranked by the mean of their two TF-IDF weights.

    Returns:
        None. All results are printed.
    """
    # 1. OCR
    print("‚è≥ Lecture du CV via OCR...")
    cv_text_fr = extract_text_from_pdf_ocr(resume_path)

    # 2. Translation
    print("üåê Traduction en anglais...")
    # ``async with`` closes the translator's HTTP client when done instead
    # of leaving the connection open for the lifetime of the kernel.
    async with Translator() as translator:
        translation = await translator.translate(cv_text_fr, src='fr', dest='en')
    cv_text_en = translation.text

    # 3. TF-IDF & similarity (row 0 = résumé, row 1 = job description)
    print("üìä Calcul du matching...")
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform([cv_text_en, job_description])
    cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    matching_percentage = cosine_sim[0][0] * 100

    print(f"\nüîç Matching Percentage: {matching_percentage:.2f}%\n")

    # 4. Keyword analysis: keep terms weighted in BOTH documents
    cv_vector = tfidf_matrix[0].toarray()[0]
    job_vector = tfidf_matrix[1].toarray()[0]
    feature_names = vectorizer.get_feature_names_out()

    common_terms = [
        (term, (cv_weight + job_weight) / 2)
        for term, cv_weight, job_weight in zip(feature_names, cv_vector, job_vector)
        if cv_weight > 0 and job_weight > 0
    ]

    # Sort by average weight (highest first) and display the top ten
    common_terms_sorted = sorted(common_terms, key=lambda x: x[1], reverse=True)
    df = pd.DataFrame(common_terms_sorted, columns=["Term", "Avg_TFIDF_Weight"])
    print("üìå Matching Keywords:")
    print(df.head(10).to_string(index=False))

# Run the function in an interactive environment.
# NOTE: top-level ``await`` is only accepted by IPython/Jupyter-style
# sessions; a plain ``python script.py`` run rejects it as a syntax error.
await main()


‚è≥ Lecture du CV via OCR...
üåê Traduction en anglais...
üìä Calcul du matching...

üîç Matching Percentage: 21.41%

üìå Matching Keywords:
       Term  Avg_TFIDF_Weight
       data          0.366929
   learning          0.148246
    machine          0.148246
 algorithms          0.117122
engineering          0.117122
 experience          0.117122
     python          0.117122
   analysis          0.101561
   building          0.101561
 predictive          0.101561
