In [3]:
import os

# Move the working directory one level up from where the kernel started.
# The starting directory is captured only once per kernel session, so re-running
# this cell always lands in the same place instead of climbing one more level on
# every execution (a bare relative os.chdir("../") is not idempotent).
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, os.pardir))

In [10]:
import os
from PyPDF2 import PdfReader
from docx import Document
import chardet
import requests
from bs4 import BeautifulSoup

# Web pages whose text is scraped in addition to the local files.
# Read as a module-level global by extract_all_text_from_directory().
links = [
    "https://www.worldofagri.com/2021/01/chemical-fertilizers-detailed-guide.html?m=1",
    "https://desertstudiescenter.uoanbar.edu.iq/News_Details.php?ID=192",
    "https://www.twinkl.com.eg/teaching-wiki/alnbatat-alsamt",
    "https://safiadecor.com/%D8%AA%D8%B9%D8%B1%D9%81-%D8%B9%D9%84%D9%89-%D8%A7%D9%84%D9%86%D8%A8%D8%A7%D8%AA%D8%A7%D8%AA-%D8%A7%D9%84%D8%B3%D8%A7%D9%85%D8%A9-%D8%A7%D9%84%D8%AF%D8%A7%D8%AE%D9%84%D9%8A%D8%A9-%D9%88-%D8%B7%D8%B1/",
    "https://hundzsoilegypt.com/%D9%83%D9%85-%D9%8A%D8%AD%D8%AA%D8%A7%D8%AC-%D8%A7%D9%84%D9%81%D8%AF%D8%A7%D9%86-%D9%85%D9%86-%D8%A7%D9%84%D8%B3%D9%85%D8%A7%D8%AF-%D8%A7%D9%84%D8%B9%D8%B6%D9%88%D9%8A/",
    "https://tajagri.sa/blogs/%D8%A3%D8%B3%D9%85%D8%AF%D8%A9-%D8%B2%D8%B1%D8%A7%D8%B9%D9%8A%D8%A9-%D8%B3%D8%A7%D8%A6%D9%84%D9%87?srsltid=AfmBOoolllLyN--6i6xP33kTziSecL52qWnUTS020wJO_3F__D3H33n6",
    "https://konyaseker.com.tr/ar/icerik/detay/6293/ar",
    "https://tajagri.sa/blogs/%D8%B7%D8%B1%D9%8A%D9%82%D8%A9-%D8%A7%D8%B3%D8%AA%D8%AE%D8%AF%D8%A7%D9%85-%D8%A7%D9%84%D8%B3%D9%85%D8%A7%D8%AF-%D8%A7%D9%84%D8%B3%D8%A7%D8%A6%D9%84-%D9%84%D9%84%D9%86%D8%A8%D8%A7%D8%AA%D8%A7%D8%AA-%D8%A7%D9%84%D8%AF%D8%A7%D8%AE%D9%84%D9%8A%D8%A9?srsltid=AfmBOoo2bTOr8DW1zTSvYhrXrAgc-7NOB4T1G38ig4-BrOewzL50KjVU"
]

def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page of a PDF file.

    Pages for which no text is extracted contribute an empty string. Any
    exception is reported on stdout; the text gathered before the failure
    (possibly an empty string) is still returned.
    """
    page_texts = []
    try:
        with open(file_path, "rb") as pdf_file:
            for page in PdfReader(pdf_file).pages:
                page_texts.append(page.extract_text() or "")
    except Exception as e:
        print(f"[Error] Failed to read PDF {file_path}: {e}")
    return "".join(page_texts)

def extract_text_from_txt(file_path):
    """Read a text file of unknown encoding and return its contents as ``str``.

    The encoding is guessed with ``chardet`` from the raw bytes.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded text. An empty file yields ``""`` without reporting an
        error. Any failure (missing file, unknown codec, ...) is printed and
        also yields ``""``.
    """
    try:
        with open(file_path, 'rb') as f:
            raw = f.read()
        if not raw:
            # Nothing to detect or decode; chardet would report encoding=None here.
            return ""
        # chardet returns None when it cannot make a guess; fall back to UTF-8
        # instead of letting raw.decode(None) raise a TypeError.
        encoding = chardet.detect(raw)['encoding'] or "utf-8"
        # The detected encoding is only a guess, so a few undecodable bytes are
        # replaced rather than discarding the whole document.
        return raw.decode(encoding, errors="replace")
    except Exception as e:
        print(f"[Error] Failed to read TXT file {file_path}: {e}")
        return ""

def extract_text_from_docx(file_path):
    """Return the text of a .docx file, one paragraph per line.

    Only ``doc.paragraphs`` is read. On any exception the error is printed
    and an empty string is returned.
    """
    try:
        document = Document(file_path)
        paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
        return '\n'.join(paragraph_texts)
    except Exception as e:
        print(f"[Error] Failed to read DOCX file {file_path}: {e}")
        return ""

def extract_text_from_url(url):
    """Download a web page and return its text with scripts and styles removed.

    Args:
        url: Address of the page to fetch (10 second timeout).

    Returns:
        The stripped text fragments of the page joined by newlines, or ``""``
        if the request or the parsing fails (the error is printed).
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        # `response.text` decodes with ISO-8859-1 when the Content-Type header
        # carries no charset, which garbles non-Latin (e.g. Arabic) pages.
        # Pass the raw bytes to BeautifulSoup so it can detect the encoding
        # itself (e.g. from <meta charset>), and only force the HTTP-level
        # charset when the server actually declared one.
        content_type = response.headers.get("Content-Type", "")
        declared_encoding = response.encoding if "charset" in content_type.lower() else None
        soup = BeautifulSoup(response.content, 'html.parser', from_encoding=declared_encoding)
        for tag in soup(['script', 'style']):
            tag.decompose()
        return soup.get_text(separator="\n", strip=True)
    except Exception as e:
        print(f"[Error] Failed to extract from URL {url}: {e}")
        return ""

def extract_all_text_from_directory(directory_path, urls=None):
    """Collect the text of every supported file in a directory plus a list of URLs.

    Files ending in .pdf, .txt or .docx (case-insensitive) are processed in
    sorted filename order so the output is reproducible between runs; other
    files are ignored. Each source is preceded by a ``--- KIND: name ---``
    marker line.

    Args:
        directory_path: Directory whose direct entries are scanned.
        urls: Pages to scrape after the files. Defaults to the module-level
            ``links`` list when omitted; pass ``[]`` to skip scraping.

    Returns:
        One string containing all extracted text. Sources that fail to load
        contribute only their marker (errors are printed, not raised).
    """
    # For this one PDF, extraction starts at page index 60 instead of 0.
    partial_pdf_name = "الافات والامراض النباتية الجزء الثاني ٢٠٠٣.pdf"
    partial_pdf_start_page = 60

    if urls is None:
        urls = links

    parts = []

    # os.listdir() returns entries in arbitrary order; sort for determinism.
    for filename in sorted(os.listdir(directory_path)):
        file_path = os.path.join(directory_path, filename)
        lowered = filename.lower()
        if lowered.endswith(".pdf"):
            print(f"[PDF] Reading: {filename}")
            parts.append(f"\n--- PDF: {filename} ---\n")
            try:
                with open(file_path, "rb") as f:
                    reader = PdfReader(f)
                    clean_name = filename.strip().lower()
                    start_page = partial_pdf_start_page if clean_name == partial_pdf_name else 0
                    for i in range(start_page, len(reader.pages)):
                        # Appending page by page keeps the pages read so far
                        # if a later page raises.
                        parts.append(reader.pages[i].extract_text() or "")
            except Exception as e:
                print(f"[Error] Failed to read PDF {file_path}: {e}")
            parts.append("\n")
        elif lowered.endswith(".txt"):
            print(f"[TXT] Reading: {filename}")
            parts.append(f"\n--- TXT: {filename} ---\n")
            parts.append(extract_text_from_txt(file_path) + "\n")
        elif lowered.endswith(".docx"):
            print(f"[DOCX] Reading: {filename}")
            parts.append(f"\n--- DOCX: {filename} ---\n")
            parts.append(extract_text_from_docx(file_path) + "\n")

    for url in urls:
        print(f"[URL] Scraping: {url}")
        parts.append(f"\n--- URL: {url} ---\n")
        parts.append(extract_text_from_url(url) + "\n")

    return "".join(parts)



In [11]:
# Extract everything under Data/ (plus the configured URLs) and keep a UTF-8
# copy of the combined text on disk for inspection.
extracted_data = extract_all_text_from_directory(directory_path='Data/')
with open("extracted_output.txt", mode="w", encoding="utf-8") as output_file:
    output_file.write(extracted_data)
print("✅ All text extracted and saved to 'extracted_output.txt'")

[TXT] Reading: pests_text.txt
[PDF] Reading: الافات والامراض النباتية الجزء الثاني ٢٠٠٣.pdf
[DOCX] Reading: النبات الطبي.docx
[DOCX] Reading: امراض النبات.docx
[URL] Scraping: https://www.worldofagri.com/2021/01/chemical-fertilizers-detailed-guide.html?m=1
[URL] Scraping: https://desertstudiescenter.uoanbar.edu.iq/News_Details.php?ID=192
[URL] Scraping: https://www.twinkl.com.eg/teaching-wiki/alnbatat-alsamt
[URL] Scraping: https://safiadecor.com/%D8%AA%D8%B9%D8%B1%D9%81-%D8%B9%D9%84%D9%89-%D8%A7%D9%84%D9%86%D8%A8%D8%A7%D8%AA%D8%A7%D8%AA-%D8%A7%D9%84%D8%B3%D8%A7%D9%85%D8%A9-%D8%A7%D9%84%D8%AF%D8%A7%D8%AE%D9%84%D9%8A%D8%A9-%D9%88-%D8%B7%D8%B1/
[URL] Scraping: https://hundzsoilegypt.com/%D9%83%D9%85-%D9%8A%D8%AD%D8%AA%D8%A7%D8%AC-%D8%A7%D9%84%D9%81%D8%AF%D8%A7%D9%86-%D9%85%D9%86-%D8%A7%D9%84%D8%B3%D9%85%D8%A7%D8%AF-%D8%A7%D9%84%D8%B9%D8%B6%D9%88%D9%8A/
[URL] Scraping: https://tajagri.sa/blogs/%D8%A3%D8%B3%D9%85%D8%AF%D8%A9-%D8%B2%D8%B1%D8%A7%D8%B9%D9%8A%D8%A9-%D8%B3%D8%A7%D8%A6%D9%84%D9%

In [12]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

def text_split(extracted_data, chunk_size=1000, chunk_overlap=100):
    """Split the extracted text into overlapping chunks.

    Args:
        extracted_data: The full text to split.
        chunk_size: Maximum chunk size passed to the splitter (default 1000).
        chunk_overlap: Overlap between consecutive chunks (default 100).

    Returns:
        The list of text chunks produced by ``RecursiveCharacterTextSplitter``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(extracted_data)

In [13]:
# Chunk the whole extracted corpus and report how many pieces were produced.
text_chunks = text_split(extracted_data)
print(f"length of text chunks : {len(text_chunks)}")

length of text chunks : 1511


In [21]:
import openai
from pathlib import Path
import os
from dotenv import load_dotenv
import time

# NOTE(review): absolute, machine-specific path; the notebook only finds the
# .env file on a machine with this exact directory layout.
env_path = Path("D:/Artificial intelligence/nlp/plants_chatpot/.env")
load_dotenv(env_path)
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
if not OPENAI_API_KEY:
    # Fail here with a clear message instead of letting every later embedding
    # request fail (and be retried) because the key is None.
    raise RuntimeError(
        f"OPENAI_API_KEY is not set; define it in the environment or in {env_path}"
    )
openai.api_key = OPENAI_API_KEY

def get_embedding(text: str, model="text-embedding-3-small", max_retries: int = 3):
    """Request an embedding vector for ``text``, retrying with exponential backoff.

    Args:
        text: The text to embed; newlines are replaced by spaces before sending.
        model: Name of the embedding model to use.
        max_retries: Maximum number of attempts. After a failed attempt the
            function sleeps ``2 ** attempt`` seconds before the next one.

    Returns:
        The embedding taken from the first item of the response, or ``None``
        when the text is empty/blank/not a string, when ``max_retries`` is not
        positive, or when every attempt fails (errors are printed).
    """
    # Blank or non-string input cannot be embedded; skip the API round trips
    # (and the retry sleeps) entirely.
    if not isinstance(text, str) or not text.strip():
        return None

    # The cleanup does not depend on the attempt, so do it once up front.
    cleaned_text = text.replace("\n", " ")

    for attempt in range(max_retries):
        try:
            response = openai.embeddings.create(
                input=[cleaned_text],
                model=model
            )
            return response.data[0].embedding
        except Exception as e:
            if attempt == max_retries - 1:
                print(f"Final error getting embedding: {e}")
                return None
            print(f"Error getting embedding (attempt {attempt + 1}): {e}")
            time.sleep(2 ** attempt)

    # Only reached when max_retries <= 0 and no attempt was made.
    return None
                
                

In [None]:
from typing import List
import numpy as np
import faiss
import json

# Directory holding the persisted artifacts: the FAISS index file and the JSON
# file with the chunk texts.
INDEX_DIR = r'D:\Artificial intelligence\nlp\plants_chatpot\index'
index_path = Path(INDEX_DIR).joinpath('index.faiss')
chunks_path = Path(INDEX_DIR).joinpath('chunks.json')

def build_index(chunks: List[str]):
    """Embed text chunks, store them in a FAISS L2 index and persist both to disk.

    The index is written to ``index_path`` and the chunk texts to
    ``chunks_path`` (module-level paths). Only chunks that actually received
    an embedding are saved, in the same order as their vectors, so position
    ``i`` in the JSON list corresponds to vector ``i`` in the index.

    Args:
        chunks: The text chunks to embed.

    Raises:
        ValueError: If no chunk could be embedded.
    """
    embeddings_list = []
    embedded_chunks = []
    skipped = 0

    for chunk in chunks:
        embedding = get_embedding(chunk)
        if embedding:
            embeddings_list.append(embedding)
            # Track the chunk next to its vector; dumping the full input list
            # would misalign every entry after the first failed embedding.
            embedded_chunks.append(chunk)
            time.sleep(0.1)
        else:
            skipped += 1

    if not embeddings_list:
        raise ValueError("No valid embeddings were generated")

    if skipped:
        print(f"Warning: {skipped} chunk(s) had no embedding and were left out of the index")

    embeddings_array = np.array(embeddings_list).astype('float32')
    dimension = len(embeddings_list[0])
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings_array)

    # Make sure the target directories exist before writing into them.
    for target in (index_path, chunks_path):
        target.parent.mkdir(parents=True, exist_ok=True)

    faiss.write_index(index, str(index_path))
    with open(chunks_path, 'w', encoding='utf-8') as f:
        json.dump(embedded_chunks, f)

    print("Index built successfully")

In [23]:
# Embed every chunk (one get_embedding call per chunk) and write the FAISS
# index and the chunk list to disk.
build_index(chunks=text_chunks)

Index built successfully
