In [9]:
# IMPORTS
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nest_asyncio
import asyncio
from googletrans import Translator

# Patch asyncio for interactive environments (Jupyter, VS Code interactive, etc.)
nest_asyncio.apply()

# Point pytesseract at the Tesseract executable (machine-specific Windows path)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Path to the CV (resume) PDF (machine-specific absolute path)
resume_path = r"C:\Users\khmir\Desktop\cvs\khmiri_iheb_tun_fr.pdf"

# Job description (in English) that the translated CV is compared against
job_description = """
We are looking for a Data Scientist intern who has experience with Python, machine learning algorithms, and data analysis. 
The candidate will be responsible for data cleaning, feature engineering, and building predictive models.
"""

# Function: extract the text of the PDF via OCR (French language model)
def extract_text_from_pdf_ocr(pdf_path, lang="fra", dpi=300):
    """Rasterize every page of a PDF and return the text recognized by Tesseract.

    Args:
        pdf_path: Path of the PDF file to read.
        lang: Tesseract language code(s) passed to ``image_to_string``
            (defaults to ``"fra"``, the value previously hard-coded).
        dpi: Resolution used when rendering each page to an image
            (defaults to 300, the value previously hard-coded).

    Returns:
        The OCR text of all pages, each page followed by a newline.
    """
    page_texts = []
    # Context managers make sure the document and each decoded image are
    # closed even if rendering or OCR raises part-way through.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            pix = page.get_pixmap(dpi=dpi)
            with Image.open(io.BytesIO(pix.tobytes("png"))) as img:
                page_texts.append(pytesseract.image_to_string(img, lang=lang))
    return "".join(text + "\n" for text in page_texts)

# Main async function (translation + matching)
async def main():
    """Run the CV-to-job matching pipeline and print its results.

    Steps: OCR the CV located at ``resume_path``, translate the text from
    French to English, compute the TF-IDF cosine similarity between the
    translated CV and ``job_description``, then print the similarity as a
    percentage followed by the terms that carry weight in both documents.
    """
    # Step 1 - OCR of the CV
    print("‚è≥ Lecture du CV via OCR...")
    cv_text_fr = extract_text_from_pdf_ocr(resume_path)

    # Step 2 - translation French -> English
    print("üåê Traduction en anglais...")
    translated = await Translator().translate(cv_text_fr, src='fr', dest='en')
    cv_text_en = translated.text

    # Step 3 - TF-IDF vectors and cosine similarity (row 0 = CV, row 1 = job)
    print("üìä Calcul du matching...")
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform([cv_text_en, job_description])
    cv_row, job_row = tfidf_matrix[0:1], tfidf_matrix[1:2]
    matching_percentage = cosine_similarity(cv_row, job_row)[0][0] * 100

    print(f"\nüîç Matching Percentage: {matching_percentage:.2f}%\n")

    # Step 4 - keywords with a non-zero weight in both documents
    cv_weights = tfidf_matrix[0].toarray()[0]
    job_weights = tfidf_matrix[1].toarray()[0]
    vocabulary = vectorizer.get_feature_names_out()
    shared_terms = [
        (term, (cv_weight + job_weight) / 2)
        for term, cv_weight, job_weight in zip(vocabulary, cv_weights, job_weights)
        if cv_weight > 0 and job_weight > 0
    ]

    # Highest average weight first, then display
    shared_terms.sort(key=lambda pair: pair[1], reverse=True)
    df = pd.DataFrame(shared_terms, columns=["Term", "Avg_TFIDF_Weight"])
    print("üìå Matching Keywords:")
    print(df.head(100).to_string(index=False))

# Run the pipeline; the top-level `await` relies on an interactive environment (notebook / IPython)
await main()


⏳ Lecture du CV via OCR...
🌐 Traduction en anglais...
📊 Calcul du matching...

🔍 Matching Percentage: 21.41%

📌 Matching Keywords:
       Term  Avg_TFIDF_Weight
       data          0.366929
   learning          0.148246
    machine          0.148246
 algorithms          0.117122
engineering          0.117122
 experience          0.117122
     python          0.117122
   analysis          0.101561
   building          0.101561
 predictive          0.101561


In [2]:
import pandas as pd
import re
import spacy
from spacy.matcher import PhraseMatcher
from skillNer.skill_extractor_class import SkillExtractor
from dateparser.search import search_dates
from datetime import datetime
from langdetect import detect
import geonamescache
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import warnings
import joblib
import json
import random
import pytesseract
from pdf2image import convert_from_path
import os

# ========== Initialization ==========
# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)
# spaCy pipeline shared by the counting helpers and the skill extractor below.
nlp = spacy.load('en_core_web_lg')
# NOTE(review): this name would shadow the stdlib `gc` module if that were ever imported here.
gc = geonamescache.GeonamesCache()
# Point pytesseract at the Tesseract executable (machine-specific Windows path).
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# === Skill database (JSON file read from a machine-specific absolute path)
SKILL_DB_PATH = r"C:\Users\khmir\Desktop\data-science\skill_db_relax_20.json"
with open(SKILL_DB_PATH, 'r', encoding='utf-8') as f:
    SKILL_DB = json.load(f)

# Extractor built from the spaCy pipeline, the skill database and spaCy's PhraseMatcher class.
skill_extractor = SkillExtractor(nlp, SKILL_DB, PhraseMatcher)

# ========== Fonctions NLP de base ==========
def count_verbs(sentence):
    """Return how many tokens of `sentence` have the coarse POS tag VERB."""
    return sum(1 for token in nlp(sentence) if token.pos_ == 'VERB')
def count_adjectives(sentence):
    """Return how many tokens of `sentence` have the coarse POS tag ADJ."""
    return sum(1 for token in nlp(sentence) if token.pos_ == 'ADJ')
def count_stopwords(sentence):
    """Return how many tokens of `sentence` are flagged as stop words."""
    return sum(1 for token in nlp(sentence) if token.is_stop)
def count_nouns(sentence):
    """Return how many tokens of `sentence` have the coarse POS tag NOUN."""
    return sum(1 for token in nlp(sentence) if token.pos_ == 'NOUN')
def count_digits(sentence):
    """Return how many tokens of `sentence` are flagged as digits (`is_digit`)."""
    return sum(1 for token in nlp(sentence) if token.is_digit)
def count_special_characters(sentence):
    """Return how many tokens of `sentence` are neither alphanumeric nor punctuation."""
    return sum(
        1
        for token in nlp(sentence)
        if not (token.text.isalnum() or token.is_punct)
    )
def count_punctuation(sentence):
    """Return how many tokens of `sentence` are flagged as punctuation."""
    return sum(1 for token in nlp(sentence) if token.is_punct)
def calculate_sentence_length(sentence):
    """Return the number of tokens the `nlp` pipeline produces for `sentence`."""
    parsed = nlp(sentence)
    return len(parsed)

# ========== OCR Extraction PDF ==========
def extract_text_from_pdf(pdf_path, lang='eng+fra'):
    """OCR a PDF and write the recognized text to a sibling ``.txt`` file.

    Args:
        pdf_path: Path of the PDF to convert.
        lang: Tesseract language code(s) passed to ``image_to_string``
            (defaults to ``'eng+fra'``, the value previously hard-coded).

    Returns:
        Path (str) of the UTF-8 text file that was written: the input path
        with its extension replaced by ``.txt``.
    """
    pdf_path = os.fspath(pdf_path)
    page_images = convert_from_path(pdf_path)
    all_text = "".join(
        pytesseract.image_to_string(image, lang=lang) + "\n"
        for image in page_images
    )

    # Swap only the real extension. A plain str.replace('.pdf', '.txt') also
    # rewrites '.pdf' inside directory names and leaves '.PDF' / extension-less
    # paths unchanged, in which case the source PDF itself would be overwritten.
    temp_txt_path = os.path.splitext(pdf_path)[0] + '.txt'
    with open(temp_txt_path, 'w', encoding='utf-8') as f:
        f.write(all_text)
    return temp_txt_path

# ========== Skill detection ==========
def extract_skills(skill_extractor, sentence):
    """Return the distinct lower-cased skills the extractor finds in `sentence`.

    Both the ``full_matches`` and ``ngram_scored`` result lists are read and
    their ``doc_node_value`` entries are merged into one de-duplicated list.
    Any exception is reported on stdout and yields an empty list.
    """
    try:
        results = skill_extractor.annotate(sentence)['results']
        found = {
            match['doc_node_value'].lower()
            for section in ('full_matches', 'ngram_scored')
            for match in results[section]
        }
    except Exception as e:
        print(f"Erreur comp√©tences: {e}")
        return []
    return list(found)

def count_skills(skill_extractor, sentence):
    """Return the number of distinct skills `extract_skills` finds in `sentence`."""
    detected = extract_skills(skill_extractor, sentence)
    return len(detected)

def detectSkills(skill_extractor, file_path):
    """Return the distinct skills detected in the UTF-8 text file `file_path`.

    Each ``doc_node_value`` from the ``full_matches`` and ``ngram_scored``
    results is lower-cased and has repeated words removed (first occurrence
    kept); empty values are dropped. Order of the returned list is unspecified.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        resume_text = handle.read()

    def _normalize(raw_value):
        # dict.fromkeys keeps the first occurrence of each word, in order.
        return ' '.join(dict.fromkeys(raw_value.lower().split()))

    results = skill_extractor.annotate(resume_text)['results']
    detected = {
        _normalize(match['doc_node_value'])
        for section in ('full_matches', 'ngram_scored')
        for match in results[section]
    }
    detected.discard('')
    return list(detected)

# ========== Experience and cleaning ==========
def calculate_total_years_experience(file_path):
    """Estimate the total years of experience from date ranges in a text file.

    Every line is normalized (lower-cased, duration unit words removed,
    ``.``, ``/`` and ``-`` stripped or turned into spaces, "ongoing" keywords
    replaced by today's date) and scanned with ``search_dates``. When a line
    yields at least two dates, the span between the two earliest is added to
    the total and the original line is echoed on stdout.

    Args:
        file_path: Path of the UTF-8 text file to scan.

    Returns:
        The summed spans in years, rounded to two decimals.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Loop-invariant pieces, built once.
    today_text = datetime.now().strftime("%b %d, %Y")
    unit_words = re.compile(r"\b(months?|years?|mos|yr|yrs|mois|an|ans)\b", flags=re.IGNORECASE)
    # Whole-word match only: a plain substring replace would also rewrite
    # "knowledge" (contains "now") or "presentation" (contains "present"),
    # injecting today's date and creating spurious date ranges.
    ongoing_words = re.compile(r"\b(?:present|today|now|aujourd'hui)\b")

    total_years = 0
    for original_line in content.splitlines():
        line = unit_words.sub("", original_line.lower())
        line = line.replace(".", "").replace("/", " ").replace("-", " ")
        # A callable replacement keeps the date text literal (no escape processing).
        line = ongoing_words.sub(lambda _match: today_text, line)

        found = search_dates(line, languages=["fr", "en"])
        if not found:
            continue
        parsed_dates = [hit[1] for hit in found]
        if len(parsed_dates) < 2:
            continue

        parsed_dates.sort()
        start, end = parsed_dates[:2]
        total_years += (end.year - start.year) + (end.month - start.month) / 12.0
        print(f"[INFO] Dates d√©tect√©es : {original_line}")
    return round(total_years, 2)

def detect_location(text, locations):
    """Return the entries of `locations` that occur in `text` as whole words.

    Matching is case-insensitive; the result keeps the order of `locations`.
    """
    matches = []
    for name in locations:
        whole_word = r'\b' + re.escape(name) + r'\b'
        if re.search(whole_word, text, re.IGNORECASE):
            matches.append(name)
    return matches

def detect_address(file_path):
    """Return the country names (from geonamescache) mentioned in the text file."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        resume_text = handle.read()
    country_names = [entry['name'] for entry in gc.get_countries().values()]
    return detect_location(resume_text, country_names)

def clean_ref(ref):
    """Normalize a reference string to ``ref<digits>``.

    Square brackets, whitespace and the text ``DCE`` (any case) are removed
    first; the result is then searched for ``Ref`` followed by digits.
    Returns ``"ref not found"`` when no such pattern exists.
    """
    stripped = re.sub(r'\[|\]|\s+|DCE', '', ref, flags=re.IGNORECASE)
    found = re.search(r'Ref(\d+)', stripped, re.IGNORECASE)
    if found is None:
        return "ref not found"
    return f"ref{found.group(1)}"

# ========== Machine Learning ==========
def train_dataset():
    """Train the experience-sentence classifier and persist it to disk.

    Reads ``dataset_final.xlsx``, drops rows labelled ``YES`` whose sentence
    length is outside 3..28 or whose text contains ``?``, fits the
    preprocessing ColumnTransformer and a RandomForestClassifier, then dumps
    both with joblib. The fitted preprocessor is also published as the
    module-level global ``preprocessor``.
    """
    global preprocessor

    dataset = pd.read_excel('dataset_final.xlsx')

    # Filter 1: positive rows with an implausible sentence length.
    is_positive = dataset['IsExperience'] == 'YES'
    bad_length = (dataset['Sentence length'] < 3) | (dataset['Sentence length'] > 28)
    dataset = dataset.drop(dataset[is_positive & bad_length].index)

    # Filter 2 (evaluated on the already-reduced frame): positive rows containing '?'.
    has_question_mark = dataset['experiences'].str.contains("\\?")
    dataset = dataset.drop(dataset[(dataset['IsExperience'] == 'YES') & has_question_mark].index)

    numeric_features = [
        'Verbs number',
        'Adjectives number',
        'Stopwords number',
        'Sentence length',
        'Nouns number',
        'Special chars number',
        'Punctuation number',
        'Digits number',
        'Skills number',
    ]
    numeric_steps = [('scaler', StandardScaler()), ('pca', PCA(n_components=2))]
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]

    preprocessor = ColumnTransformer([
        ('num', Pipeline(numeric_steps), numeric_features),
        ('cat', Pipeline(categorical_steps), ['experiences']),
    ])

    features = dataset.drop('IsExperience', axis=1)
    # NOTE(review): the fitted LabelEncoder is not saved; confirm the integer
    # label that callers compare predictions against.
    labels = LabelEncoder().fit_transform(dataset['IsExperience'])
    features_transformed = preprocessor.fit_transform(features)

    classifier = RandomForestClassifier()
    classifier.fit(features_transformed, labels)

    joblib.dump(classifier, 'random_forest_model.pkl')
    joblib.dump(preprocessor, 'preprocessor.pkl')

def predict(filepath):
    """Classify each line of a text file and return those predicted as experience.

    Loads the persisted classifier and preprocessor, computes the per-line
    features (spaCy counts and skill count), and prints and returns the lines
    whose predicted class equals 1.

    Args:
        filepath: Path of the UTF-8 text file, one candidate sentence per line.

    Returns:
        List of the original lines (trailing newline included) predicted as
        class 1. Empty when the file has no lines or nothing is predicted.
    """
    # NOTE: joblib.load unpickles arbitrary objects; only load model files
    # produced by a trusted training run.
    classifier = joblib.load('random_forest_model.pkl')
    preprocessor = joblib.load('preprocessor.pkl')

    with open(filepath, 'r', encoding='utf-8') as file:
        sentences = file.readlines()

    # An empty file previously crashed in pd.concat([]); report "nothing found" instead.
    if not sentences:
        print("Aucune phrase d'exp√©rience identifi√©e.")
        return []

    # Build all feature rows first and create the frame once, rather than
    # concatenating one single-row DataFrame per sentence.
    records = [
        {
            'experiences': sentence,
            'Verbs number': count_verbs(sentence),
            'Adjectives number': count_adjectives(sentence),
            'Stopwords number': count_stopwords(sentence),
            'Sentence length': calculate_sentence_length(sentence),
            'Nouns number': count_nouns(sentence),
            'Special chars number': count_special_characters(sentence),
            'Punctuation number': count_punctuation(sentence),
            'Digits number': count_digits(sentence),
            'Skills number': count_skills(skill_extractor, sentence),
        }
        for sentence in sentences
    ]
    input_df = pd.DataFrame(records)

    X_input = preprocessor.transform(input_df)
    predictions = classifier.predict(X_input)
    # NOTE(review): class 1 is presumably the encoded 'YES' label - confirm against training.
    experience_sentences = [s for s, pred in zip(sentences, predictions) if pred == 1]

    if not experience_sentences:
        print("Aucune phrase d'exp√©rience identifi√©e.")
    else:
        for sentence in experience_sentences:
            print(f"- {sentence}")

    return experience_sentences

# ========== Example usage ==========
if __name__ == "__main__":
    # Machine-specific absolute path of the CV to analyze.
    RESUME_PATH = r"C:\Users\khmir\Downloads\khmiri_iheb_tun_eng.pdf"
    # OCR the PDF; the text is written to a .txt file whose path is returned.
    txt_path = extract_text_from_pdf(RESUME_PATH)

    skills = detectSkills(skill_extractor, txt_path)
    experiences = predict(txt_path)
    # NOTE: duration and countries are computed, but the prints that would
    # show them are commented out below.
    duration = calculate_total_years_experience(txt_path)
    countries = detect_address(txt_path)

    print("\n========== R√©sultats de l'analyse du CV ==========")
    print("Comp√©tences d√©tect√©es:", ", ".join(skills))
    #print(f"\nDur√©e totale d'exp√©rience estim√©e: {duration} ans")
    #print("\nPays d√©tect√©s:", ", ".join(countries) if countries else "Aucun")


loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...
- * Design, testing, and integration of dashboards into a web applica-

- * Development of an intelligent system to extract key information

- * Design of a chatbot connected to the Data Warehouse for interac-

- Technologies: Python, NLP, Airflow, Microsoft Power BI, MongoDB,

- Technologies: Python, EDA, Airflow, MongoDB, PostgreSQL, Docker,

- * Data collection and preprocessing through web scraping.

- * Audit, validation, and deployment of the dashboard to a website.

- Technologies: Python, EDA, SQL Workbench, Talend, Microsoft

[INFO] Dates détectées : February 2025 - August 2025
[INFO] Dates détectées : June 2024 - August 2024
[INFO] Dates détectées : January 2024 — June 2024
[INFO] Dates détectées : September 2020 - Present
[INFO] Dates détectées : September 2019 - July 2020

Compétences détectées: intelligent system, audit, angula

and fails to parse leap day. The default behavior will change in Python 3.15
to either always raise an exception or to use a different default year (TBD).
To avoid trouble, add a specific year to the input & format.
See https://github.com/python/cpython/issues/70647.
  parsed_date = search_dates(line, languages=["fr", "en"])
