In [1]:
from typing import List, Dict
from unstructured.partition.pdf import partition_pdf
from unstructured_pytesseract.pytesseract import TesseractError
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI
from typing import List, Dict
import tempfile
import os
from pypdf import PdfReader, PdfWriter
from unstructured.partition.pdf import partition_pdf
from unstructured_pytesseract.pytesseract import TesseractError

def extract_filename(filepath: str) -> str:
    """Extrait le nom du fichier sans extension depuis le chemin complet"""
    return os.path.splitext(os.path.basename(filepath))[0]

def ocr_pipeline(pdf_path: str) -> List[Dict[str, str]]:
    doc_name = extract_filename(pdf_path)
    elements = []
    
    # Lire le PDF original et séparer les pages
    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)

    for page_num in range(total_pages):
        # Créer un PDF temporaire pour chaque page
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            writer = PdfWriter()
            writer.add_page(reader.pages[page_num])
            writer.write(temp_file)
            temp_file_path = temp_file.name

        try:
            # Traiter la page individuelle avec OCR
            with open(temp_file_path, "rb") as f:
                raw = partition_pdf(
                    file=f,
                    ocr_languages="fra+eng",
                    ocr_strategy="auto",
                    infer_table_structure=True,
                    extract_images_in_pdf=True,
                    pdf_image_dpi=300,
                    max_characters=4000,
                    new_after_n_chars=3800,
                    combine_text_under_n_chars=2000,
                )
        except TesseractError:
            # Fallback en cas d'erreur OCR
            with open(temp_file_path, "rb") as f:
                raw = partition_pdf(
                    file=f,
                    strategy="fast",
                    infer_table_structure=True,
                )
        finally:
            os.unlink(temp_file_path)  # Nettoyer le fichier temporaire

        # Extraire le contenu de la page
        content = "\n".join(
            [elem.get_text() if hasattr(elem, "get_text") else getattr(elem, "text", "") 
            for elem in raw]
        )
        
        elements.append({
            "id": f"chunk_{page_num + 1}_{doc_name}",
            "page": page_num + 1,
            "text": content,
            "doc": doc_name
        })
    
    return elements

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from typing import List, Dict
from langchain_chroma import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_core.documents import Document
import google.generativeai as genai

def index_pdf_elements(elements, api_key: str, collection_name: str) -> Chroma:
    genai.configure(api_key=api_key)
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=api_key)
    
    # Préparer les documents pour l'indexation
    docs = [
        Document(
            page_content=el["text"], 
            metadata={
                "id": el["id"],
                "page": el["page"],
                "doc": el["doc"],
            }
        ) for el in elements
    ]
    
    # Créer ou mettre à jour le vectorstore
    vectorstore = Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        collection_name=collection_name,
        persist_directory=".chroma_db"
    )
    return vectorstore.as_retriever(search_kwargs={"k": 20})

In [3]:
pip install tiktoken

You should consider upgrading via the '/home/fofana-ibrahim-seloh/NEW/chat-with-your-data-geminy/chat_data/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [4]:
import re 

def get_final_sources_used(response, docs):
    response = response.replace(",", "")
    lines = response.splitlines()
    source_indices = set()
    final_sources = []

    for line in lines:
        numbers = re.findall(r'\d+', line)
        for n in numbers:
            source_indices.add(int(n))

    for i in sorted(source_indices):
        if 0 < i <= len(docs): 
            doc = docs[i - 1]
            doc['id'] = str(i)
            final_sources.append(doc)

    return final_sources

In [5]:
from langchain.prompts import ChatPromptTemplate

MAIN_TEMPLATE = """
You are *Lexis*, a strategic consulting expert focused on providing high-level advisory services for business dossiers used in tenders and public procurement. Your role is to assist clients by analyzing critical documents such as technical specifications, pricing models, and administrative clauses, offering insights that enhance tender strategies and decision-making.

### Key Points of Your Mission:
- **Strategic Analysis and Insight**:  
    - Your primary task is to analyze business dossiers, extracting key information from technical, pricing, and administrative documents to provide actionable, strategic recommendations.
    - Focus on identifying opportunities for competitive advantage while ensuring compliance with procurement guidelines.
    - Highlight any outdated or irrelevant documents without unnecessary explanation, ensuring your advice is based on current, applicable standards.

- **Clarity and Professionalism in Communication**:  
    - Deliver responses that are clear, structured, and tailored to the needs of business leaders and legal professionals involved in the tender process.
    - Use a logical flow with headings, subheadings, and bullet points, presenting complex information in an easily digestible format.

- **Thorough, Practical, and Actionable Guidance**:  
    - Provide in-depth, yet practical, advice that is directly applicable to the client’s business strategy and tender process.
    - Simplify intricate concepts when necessary, without compromising on accuracy or strategic value, ensuring your insights are easily actionable.

- **Alignment with Business Objectives**:  
    - Always prioritize the most relevant documents and guidelines that align with the client’s strategic goals in the tendering process.
    - Offer recommendations that focus on improving the client’s competitiveness and ensuring alignment with procurement regulations, driving overall success in the bidding process.

## Formatting Instructions
- **Structure**: Organize your response logically with clear, descriptive headings (e.g., "## Example Heading 1" or "## Example Heading 2"). Present key points using concise paragraphs or bullet points for better readability and impact.
- **Markdown Usage**: Use Markdown effectively to enhance clarity. Employ **bold** to emphasize critical terms, *italics* for supplementary explanations or clarifications, and headers to structure the content clearly and logically.
- **No Main Title**: Start directly with the body of the response, unless a specific title is requested. Keep the flow natural and direct.
- **Conclusion or Summary**: Wrap up with a concise conclusion or actionable next steps, guiding the client on how to refine their strategy or secure additional information for the tender process.

- **Markdown**:  
    - Use **bold** for essential terms or concepts, *italics* for clarifications, and headers to divide the content for easy reference and navigation.

- **Conclusion**:  
    - Conclude with a focused summary, restating key insights, or recommend immediate actions the client should take, such as refining their submission or acquiring the necessary documentation for the next steps in the process.

## Citation Requirements
- Cite every fact, statement, or phrase using the notation [number] corresponding to the source provided in the sources.
- Integrate citations naturally at the end of sentences or clauses, as appropriate. For example: "The Eiffel Tower is one of the most visited monuments in the world[1]."
- Use multiple sources for a single detail if applicable, e.g., "Paris is a cultural hub, attracting millions of visitors each year[1][2]."
- Always prioritize credibility and accuracy by linking all statements to their respective sources where applicable.

## Special Instructions
- If the query involves technical, historical, or complex topics, provide detailed sections of context and explanation to ensure clarity.
- If the user provides a vague query or lacks relevant information, explain what additional details could help refine the search.
- If no relevant information is found, state: "Hmm, sorry, I couldn't find any relevant information on this topic. Would you like to rephrase your query?" Be transparent about limitations and suggest alternatives or ways to rephrase the query.

## Example Output
- Start with a sharp, strategic overview that directly ties the key insights from the sources to the client’s business objectives. Ensure the context is clear, focused, and aligned with the client’s overarching goals in the tender process, highlighting only the most impactful elements.
- Deliver a thorough and structured analysis, breaking down each relevant facet of the query with precision. Provide actionable, high-value recommendations that drive the client’s decision-making, considering not just immediate compliance, but also long-term competitive positioning, risk mitigation, and strategic alignment.
- Where necessary, offer brief yet clear explanations to make complex or technical information easily digestible. The goal is to ensure that the client can swiftly translate the insights into concrete actions with a clear understanding of their strategic significance.
- Conclude with a focused, strategic summary that crystallizes the core takeaways, positioning them within the broader business context. Propose next steps that are both practical and strategically impactful, guiding the client toward concrete actions that refine their tender approach or enhance their overall strategy.

### Context:
{context}

### Question:
{question}
"""

main_prompt = ChatPromptTemplate.from_template(MAIN_TEMPLATE)

In [6]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI
import tiktoken

def get_context(docs: List[Dict]) -> str:
    """Construit le contexte à partir des documents en respectant la limite de tokens"""
    encoder = tiktoken.get_encoding("cl100k_base")
    context = ""
    max_context_size = 12288  
    for idx, doc in enumerate(docs):
        doc_text = f"{idx + 1}. {doc['text']}, \n Number {idx + 1}, \nSource: {doc['doc_name']}, Page {doc['page']}\n\n"
        new_context = context + doc_text
        if len(encoder.encode(new_context)) < max_context_size:
            context = new_context
        else:
            tokens = encoder.encode(doc['text'])
            remaining_tokens = max_context_size - len(encoder.encode(context))
            truncated_text = encoder.decode(tokens[:remaining_tokens]) + " [TRUNCATED]"
            context += f"{idx + 1}. {truncated_text}\nSource: {doc['doc_name']}, Page {doc['page']}\n\n"
            break
    return context

def get_response_with_sources(retriever, query: str, api_key: str) -> tuple[str, List[Document]]:
    """Retourne la réponse générée et les documents sources pertinents"""
    sources = retriever.get_relevant_documents(query)
    docs_for_context = []
    for i, doc in enumerate(sources):
        docs_for_context.append({
            'text': doc.page_content,
            'doc_name': doc.metadata.get('doc', 'Unknown'),
            'page': doc.metadata.get('page', 'N/A')
        })
    context_str = get_context(docs_for_context)
    llm = ChatGoogleGenerativeAI(
        model="gemini-1.5-flash-latest",
        google_api_key=api_key,
        temperature=0.3,
        max_output_tokens=2048
    )
    chain = (
        {"context": lambda x: context_str, "question": lambda x: x["question"]}
        | main_prompt
        | llm
        | StrOutputParser()
    )
    response = chain.invoke({"question": query})
    docs_sources = [{
        "id": doc.metadata.get("id", "Unknown"),
        "page": doc.metadata.get("page", "N/A"),
        "doc_name": doc.metadata.get("doc", "Unknown"),
        "text": doc.page_content
    } for doc in sources
    ]
    final_sources = get_final_sources_used(response, docs_sources)
    return response, final_sources

In [8]:
import google.generativeai as genai
elements = ocr_pipeline(pdf)

The ocr_languages kwarg will be deprecated in a future version of unstructured. Please use languages instead.
Only one of languages and ocr_languages should be specified. languages is preferred. ocr_languages is marked for deprecation.
The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


In [9]:
elements

[{'id': 'chunk_1_CV_2025-04-14_FOFANA_Ibrahim seloh',
  'page': 1,
  'text': "ibrahim.fofana@central e-casablanca.ma\n«\nhttps://www.linkedin.co m/in/ibrahim-seloh- fofana-6073b4291 g\nCasablanca, Morocco >\n+212 694 419496 @\nTélétravail ou présentiel 0\n°o Casablanca, Morocco\nATOUTS TECHNIQUES\nDéveloppement et programmation : Python, SQL, C, C++, HTML, CSS, JavaScript, VBA, LangChain, GIT, Quandl\nData science, analyse et outils stratégiques : MATLAB, SCILAB, Power BI, Deep learning, Web scraping, Docker, SWOT, PESTEL, analyse de marché, stratégie marketing, pricing\n00000\nBureautique et gestion : Pack Office (Word, Excel, PowerPoint), WordPress, gestion de projet\nCentres d'intérêt\nBasket Ball - Lecture - Film - Musique\nLangues\nFrançais Niveau Cl\nAnglais Niveau B2\nFOFANA IBRAHIM SELOH\nEtudiant-ingénieur à l'École Centrale Casablanca, je combine rigueur analytique, expertise en data science et passion pour l'IA afin de résoudre des problématiques complexes et stratégiques. J

In [10]:
retriever = index_pdf_elements(elements, GOOGLE_API_KEY, "Doc")

In [11]:
response, sources = get_response_with_sources(retriever, "Quels sont les principaux projets de data science présentés dans le document ?", GOOGLE_API_KEY)
print(response)

  sources = retriever.get_relevant_documents(query)


## Principaux Projets de Data Science

Le document présente dix projets de data science, chacun axé sur des problématiques et des techniques spécifiques.  Voici un résumé des principaux projets:

### 1. Analyse Exploratoire des Données (EDA) [1, 2]

* **Objectif:** Comprendre la structure et la distribution des données, identifier les variables importantes et leurs relations, et produire des visualisations claires.
* **Problématiques:**  Analyse de transactions bancaires pour identifier la fraude et segmentation de clients rentables en marketing.
* **Technologies:** Python (Pandas, Matplotlib/Seaborn, NumPy), Shiny/Streamlit/Dash.

### 2. A/B Testing [13, 14]

* **Objectif:** Comparer deux versions d'un produit ou service pour déterminer la plus performante.
* **Problématique:** Déterminer quelle version d'une landing page convertit le mieux les utilisateurs.
* **Technologies:** Python (Pandas, Scipy), Matplotlib/Seaborn.

### 3. Clustering et Réduction de Dimensions [17, 18]

* **Obje

In [12]:
sources

[{'id': '1',
  'page': 3,
  'doc_name': '10 Projets pour un Portfolio Data Science RÃ©ussi ',
  'text': "Projet 1 : Analyse Exploratoire des Données (EDA)\nContexte\nL'analyse exploratoire des données (EDA) est une étape clé dans tout projet de data science. Ce projet permet de montrer votre capacité à comprendre des données brutes, à les nettoyer, et à formuler des hypothèses avant d'appliquer des modèles plus complexes.\nObjectifs\ne Comprendre la structure des données et leur distribution.\ne Identifier les variables importantes et les relations entre elles.\ne Produire des visualisations claires pour présenter les résultats.\nProblematiques\nBanque : Comment analyser les transactions bancaires pour identifier des schémas de fraude ?\n- Dataset : https://www.kaggle.com/datasets/mlg- ulb/creditcardfraud\nMarketing : Quels sont les segments de clients les plus rentables en fonction de leurs interactions avec des campagnes ?\n- Dataset : https://www.kaggle.com/datasets/rodsaldanha/ark

In [68]:
from IPython.display import Markdown, display

display(Markdown(response))
display(Markdown())

Les documents présentent plusieurs projets de data science, chacun axé sur des objectifs et des technologies spécifiques.  Voici un résumé des principaux projets:

## Projet 1: Analyse Exploratoire des Données (EDA) [1, 2]

Ce projet met l'accent sur l'analyse exploratoire des données, une étape cruciale dans tout projet de data science.  Il vise à comprendre la structure et la distribution des données, identifier les variables importantes et leurs relations, et présenter les résultats via des visualisations claires [1, 2].  Deux cas d'étude sont proposés : l'analyse de transactions bancaires pour détecter la fraude, utilisant le dataset disponible sur Kaggle [1, 2], et la segmentation de clients rentables basée sur l'interaction avec des campagnes marketing, utilisant un autre dataset Kaggle [1, 2]. Les technologies utilisées incluent Python, Pandas, Matplotlib/Seaborn, NumPy, et des outils de déploiement de tableaux de bord interactifs comme Shiny, Streamlit ou Dash [1, 2].  L'analyse statistique univariée et multivariée sont également employées [1, 2].

## Projet 7: Deep Learning - Classification Cat vs Dog [3, 4]

Ce projet se concentre sur l'implémentation d'un modèle de deep learning pour classifier des images de chats et de chiens [3, 4].  L'objectif est d'implémenter et d'optimiser un modèle de deep learning, utilisant des CNN (Convolutional Neural Networks) et des modèles pré-entraînés comme VGG16, ResNet, et InceptionV3 pour le transfert learning [3, 4].  Le dataset provient de Kaggle [3, 4].  Le projet inclut également le déploiement du modèle sous forme d'API (Flask/FastAPI), sa containerisation avec Docker, et son orchestration avec Kubernetes [3, 4].

## Projet 10: Systèmes de Recommandation [5]

Ce projet explore les systèmes de recommandation, un élément central de nombreuses plateformes modernes.  Il vise à personnaliser l'expérience utilisateur en proposant des produits ou contenus basés sur les préférences et comportements passés [5].  Bien que les détails techniques soient moins explicites que pour les autres projets, le document mentionne les étapes du projet, incluant l'exploration d'APIs, la conception de pipelines de prompts, le fine-tuning, la création d'interfaces interactives, et le déploiement en production avec Docker et Kubernetes [5].


## Conclusion

Les projets présentés illustrent une variété de compétences en data science, allant de l'analyse exploratoire à l'implémentation de modèles de deep learning et au déploiement d'applications à grande échelle.  Ils mettent en avant l'utilisation de différentes technologies et bibliothèques Python, ainsi que des techniques de containerisation et d'orchestration pour la mise en production.


<IPython.core.display.Markdown object>