In [3]:
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import string

# Load spaCy's English NLP model
nlp = spacy.load("en_core_web_sm")

# Load your dataset
df = pd.read_csv('dataset.csv')  # Replace with your dataset file path

# Preprocessing function
def preprocess_text(text):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if token.text not in STOP_WORDS and token.text not in string.punctuation]
    return " ".join(tokens)

# Apply preprocessing to the dataset
df['processed_text'] = df['Title'].apply(preprocess_text)  # Assuming 'abstract' is a column in your dataset

In [5]:
def extract_medical_terms(text):
    doc = nlp(text)
    medical_terms = [(ent.text, ent.label_) for ent in doc.ents if ent.label_ in ['diabetes', 'hypertension', 'fever']]
    return medical_terms

# Example of extracting terms from processed text
df['medical_terms'] = df['processed_text'].apply(extract_medical_terms)

In [6]:
def generate_boolean_query(user_input):
    terms = user_input.split(',')
    boolean_query = " AND ".join([term.strip() for term in terms])
    return boolean_query

# Example usage
user_input = "diabetes, hypertension, fever"
boolean_query = generate_boolean_query(user_input)
print("Generated Boolean Query:", boolean_query)

Generated Boolean Query: diabetes AND hypertension AND fever


In [7]:
def refine_query(original_query):
    # Placeholder for query refinement logic (e.g., expansion or rewriting)
    refined_query = original_query.replace("AND", "OR")  # Example transformation
    return refined_query

refined_query = refine_query(boolean_query)
print("Refined Query:", refined_query)

Refined Query: diabetes OR hypertension OR fever


In [4]:
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser
import os

# Step 1: Set up schema for indexing
schema = Schema(title=TEXT(stored=True), content=TEXT(stored=True))

# Step 2: Create an index directory (if not already exists)
index_dir = "indexdir"
if not os.path.exists(index_dir):
    os.mkdir(index_dir)

# Step 3: Create the index
ix = create_in(index_dir, schema)
writer = ix.writer()

# Step 4: Add some documents to the index
documents = [
    {"title": "Document 1", "content": "This is the content of the first document."},
    {"title": "Document 2", "content": "This document discusses Python and search engines."},
    {"title": "Document 3", "content": "Another document that mentions Python programming."}
]

for doc in documents:
    writer.add_document(title=doc["title"], content=doc["content"])
writer.commit()

# Step 5: Function to perform a search
def search(query_str):
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse(query_str)
        results = searcher.search(query)
        for result in results:
            print(f"Title: {result['title']}\nContent: {result['content']}\n")

# Example of running a search
search("Python")


Title: Document 2
Content: This document discusses Python and search engines.

Title: Document 3
Content: Another document that mentions Python programming.



In [5]:
import pandas as pd

DATA_DIR = r"K:\hackathon\New folder"
COVID_DATA_FILE = "dataset.csv"
file_path = os.path.join(DATA_DIR, COVID_DATA_FILE)

try:
    df = pd.read_csv(file_path)
    print(df.columns)
except (FileNotFoundError, pd.errors.EmptyDataError, pd.errors.ParserError) as e:
    print(f"Error loading '{file_path}': {e}")


Index(['Rank', 'NCT Number', 'Title', 'Acronym', 'Status', 'Study Results',
       'Conditions', 'Interventions', 'Outcome Measures',
       'Sponsor/Collaborators', 'Gender', 'Age', 'Phases', 'Enrollment',
       'Funded Bys', 'Study Type', 'Study Designs', 'Other IDs', 'Start Date',
       'Primary Completion Date', 'Completion Date', 'First Posted',
       'Results First Posted', 'Last Update Posted', 'Locations',
       'Study Documents', 'URL'],
      dtype='object')


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from flask import Flask, request, render_template
import os
import requests
from functools import lru_cache
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# --- Configuration ---
DATA_DIR = r"K:\hackathon\New folder"  # Use a raw string (r"...") for Windows paths
COVID_DATA_FILE = "dataset.csv" # Replace with your actual data file
ICD_API_URL = "https://icd.who.int/icdapi"
DEFAULT_SEARCH_TYPE = "research"

# ICD API credentials (environment variables - KEEP THESE SECURE!)
CLIENT_ID = os.environ.get("YOUR_CLIENT_ID") 
CLIENT_SECRET = os.environ.get("YOUR_CLIENT_SECRET")

# --- Data Loading and Preprocessing ---
def load_data(filepath):
    try:
        df = pd.read_csv(filepath)
        if df.empty:
            raise ValueError(f"Data file '{filepath}' is empty.")
        return df
    except (FileNotFoundError, pd.errors.EmptyDataError, pd.errors.ParserError) as e:
        print(f"Error loading '{filepath}': {e}")
        return None

covid_df = load_data(os.path.join(DATA_DIR, COVID_DATA_FILE))

# --- NLTK Initialization ---
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    if pd.isna(text) or not isinstance(text, str):
        return ""
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens if token.isalnum() and token not in stop_words]
    return " ".join(tokens)

if covid_df is not None:
    text_col = next((col for col in covid_df.columns if col in ('Title', 'Conditions')), None)
    if text_col:
        covid_df['processed_text'] = covid_df[text_col].apply(preprocess_text)
    else:
        print("No suitable text column ('Title' or 'Conditions') found in dataset.")
        exit()

# --- 2. Query Processing and Boolean Generation ---
@lru_cache(maxsize=1)  # Cache the access token
def get_icd_access_token():
    url = f"{ICD_API_URL}/Token"
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    data = {
        "client_id": CLIENT_ID,
        "client_secret": CLIENT_SECRET,
        "grant_type": "client_credentials"
    }
    try:
        response = requests.post(url, headers=headers, data=data)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
        token_data = response.json()
        return token_data.get("access_token")
    except requests.exceptions.RequestException as e:
        print(f"Error during access token retrieval: {e}")
        return None

def get_icd_codes(query):
    token = get_icd_access_token()
    if token is None:
        print("Error: Could not obtain ICD access token.")
        return []

    url = f"{ICD_API_URL}/GetICD11CodeInfo"
    headers = {
        "Accept": "application/json",
        "API-Version": "v2",
        "Authorization": f"Bearer {token}"
    }
    params = {"q": query, "useFlexisearch": "true"}

    try:
        response = requests.get(url, headers=headers, params=params)
        response.raise_for_status()
        data = response.json()
        icd_codes = [item["code"] for item in data.get("linearizations", []) if "code" in item]
        return icd_codes
    except requests.exceptions.RequestException as e:
        print(f"Error in ICD API request: {e}")
        return []

def generate_boolean_query(user_query, search_type=DEFAULT_SEARCH_TYPE):
    tokens = nltk.word_tokenize(user_query)
    keywords = [lemmatizer.lemmatize(token.lower()) for token in tokens if token.isalnum() and token not in stop_words]
    icd_codes = get_icd_codes(user_query)
    keywords.extend(icd_codes)
    return " AND ".join(keywords)

# --- 3. Search and Retrieval ---
def search_data(query_string, data_source):
    if data_source == "research" and covid_df is not None and 'processed_text' in covid_df.columns:
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(covid_df['processed_text'])
        query_vec = vectorizer.transform([query_string])
        similarity_scores = cosine_similarity(query_vec, tfidf_matrix)

        top_indices = similarity_scores[0].argsort()[::-1]
        results = covid_df.iloc[top_indices].head(50)  # Get top 50 results
    else:
        results = pd.DataFrame()  # Return empty dataframe if dataset is not loaded.

    return results

# --- 4. Flask App ---
app = Flask(__name__)

@app.route('/', methods=['GET', 'POST'])
def index():
    if request.method == 'POST':
        query = request.form.get('query', '').strip()
        if not query:
            error_message = "Please enter a search query"
            return render_template('index.html', error_message=error_message) # Added error handling for index.html too.

        search_type = request.form.get('search_type', DEFAULT_SEARCH_TYPE)
        boolean_query = generate_boolean_query(query, search_type)
        results = search_data(boolean_query, search_type)

        if results.empty:
            no_results_message = "No results found for your query."
            return render_template('results.html', message=no_results_message, query=query, num_results=0)

        return render_template('results.html', tables=[results.to_html(classes='data')], query=query, num_results=len(results))

    return render_template('index.html') # Corrected template name

if __name__ == '__main__':
    app.run(debug=True)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\koven\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\koven\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\koven\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\koven/nltk_data'
    - 'c:\\Users\\koven\\AppData\\Local\\Programs\\Python\\Python39\\nltk_data'
    - 'c:\\Users\\koven\\AppData\\Local\\Programs\\Python\\Python39\\share\\nltk_data'
    - 'c:\\Users\\koven\\AppData\\Local\\Programs\\Python\\Python39\\lib\\nltk_data'
    - 'C:\\Users\\koven\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from flask import Flask, request, render_template
import os
import requests
from functools import lru_cache
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Set NLTK data path
nltk.data.path.append(os.path.join(os.getenv("HOME"), "nltk_data"))

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# --- Configuration ---
DATA_DIR = r"K:\hackathon\New folder"  # Use a raw string (r"...") for Windows paths
COVID_DATA_FILE = "dataset.csv" # Replace with your actual data file
ICD_API_URL = "https://icd.who.int/icdapi"
DEFAULT_SEARCH_TYPE = "research"

# ICD API credentials (environment variables - KEEP THESE SECURE!)
CLIENT_ID = os.environ.get("3982eddd-016d-4d15-9728-7df1fe003c09_ee47cff8-cfa5-4110-96c2-6839fdaa28fa")  # Do NOT put credentials directly in the code
CLIENT_SECRET = os.environ.get("0hQwn9J1AR4NLUfkdSC5YvaTkYCqMTWfsavTafXdPLQ=")

# --- Data Loading and Preprocessing ---
def load_data(filepath):
    try:
        df = pd.read_csv(filepath)
        if df.empty:
            raise ValueError(f"Data file '{filepath}' is empty.")
        return df
    except (FileNotFoundError, pd.errors.EmptyDataError, pd.errors.ParserError) as e:
        print(f"Error loading '{filepath}': {e}")
        return None

covid_df = load_data(os.path.join(DATA_DIR, COVID_DATA_FILE))

# --- NLTK Initialization ---
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    if pd.isna(text) or not isinstance(text, str):
        return ""
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens if token.isalnum() and token not in stop_words]
    return " ".join(tokens)

if covid_df is not None:
    text_col = next((col for col in covid_df.columns if col in ('Title', 'Conditions')), None)
    if text_col:
        covid_df['processed_text'] = covid_df[text_col].apply(preprocess_text)
    else:
        print("No suitable text column ('Title' or 'Conditions') found in dataset.")
        exit()

# --- 2. Query Processing and Boolean Generation ---
@lru_cache(maxsize=1)  # Cache the access token
def get_icd_access_token():
    url = f"{ICD_API_URL}/Token"
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    data = {
        "client_id": CLIENT_ID,
        "client_secret": CLIENT_SECRET,
        "grant_type": "client_credentials"
    }
    try:
        response = requests.post(url, headers=headers, data=data)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
        token_data = response.json()
        return token_data.get("access_token")
    except requests.exceptions.RequestException as e:
        print(f"Error during access token retrieval: {e}")
        return None

def get_icd_codes(query):
    token = get_icd_access_token()
    if token is None:
        print("Error: Could not obtain ICD access token.")
        return []

    url = f"{ICD_API_URL}/GetICD11CodeInfo"
    headers = {
        "Accept": "application/json",
        "API-Version": "v2",
        "Authorization": f"Bearer {token}"
    }
    params = {"q": query, "useFlexisearch": "true"}

    try:
        response = requests.get(url, headers=headers, params=params)
        response.raise_for_status()
        data = response.json()
        icd_codes = [item["code"] for item in data.get("linearizations", []) if "code" in item]
        return icd_codes
    except requests.exceptions.RequestException as e:
        print(f"Error in ICD API request: {e}")
        return []

def generate_boolean_query(user_query, search_type=DEFAULT_SEARCH_TYPE):
    tokens = nltk.word_tokenize(user_query)
    keywords = [lemmatizer.lemmatize(token.lower()) for token in tokens if token.isalnum() and token not in stop_words]
    icd_codes = get_icd_codes(user_query)
    keywords.extend(icd_codes)
    return " AND ".join(keywords)

# --- 3. Search and Retrieval ---
def search_data(query_string, data_source):
    if data_source == "research" and covid_df is not None and 'processed_text' in covid_df.columns:
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(covid_df['processed_text'])
        query_vec = vectorizer.transform([query_string])
        similarity_scores = cosine_similarity(query_vec, tfidf_matrix)

        top_indices = similarity_scores[0].argsort()[::-1]
        results = covid_df.iloc[top_indices].head(50)  # Get top 50 results
    else:
        results = pd.DataFrame()  # Return empty dataframe if dataset is not loaded.

    return results

# --- 4. Flask App ---
app = Flask(__name__)

@app.route('/', methods=['GET', 'POST'])
def index():
    if request.method == 'POST':
        query = request.form.get('query', '').strip()
        if not query:
            error_message = "Please enter a search query"
            return render_template('index.html', error_message=error_message) # Added error handling for index.html too.

        search_type = request.form.get('search_type', DEFAULT_SEARCH_TYPE)
        boolean_query = generate_boolean_query(query, search_type)
        results = search_data(boolean_query, search_type)

        if results.empty:
            no_results_message = "No results found for your query."
            return render_template('results.html', message=no_results_message, query=query, num_results=0)

        return render_template('results.html', tables=[results.to_html(classes='data')], query=query, num_results=len(results))

    return render_template('index.html') # Corrected template name

if __name__ == '__main__':
    app.run(debug=True)


TypeError: expected str, bytes or os.PathLike object, not NoneType

In [None]:
import pandas as pd

DATA_DIR = r"K:\hackathon\New folder"
COVID_DATA_FILE = "dataset.csv"
file_path = os.path.join(DATA_DIR, COVID_DATA_FILE)

try:
    df = pd.read_csv(file_path)
    print(df.columns)
except (FileNotFoundError, pd.errors.EmptyDataError, pd.errors.ParserError) as e:
    print(f"Error loading '{file_path}': {e}")


Index(['Rank', 'NCT Number', 'Title', 'Acronym', 'Status', 'Study Results',
       'Conditions', 'Interventions', 'Outcome Measures',
       'Sponsor/Collaborators', 'Gender', 'Age', 'Phases', 'Enrollment',
       'Funded Bys', 'Study Type', 'Study Designs', 'Other IDs', 'Start Date',
       'Primary Completion Date', 'Completion Date', 'First Posted',
       'Results First Posted', 'Last Update Posted', 'Locations',
       'Study Documents', 'URL'],
      dtype='object')


In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from flask import Flask, request, render_template
import os
import requests
from functools import lru_cache
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Set NLTK data path
nltk.data.path.append(os.path.join(os.path.expanduser("~"), "nltk_data"))

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# --- Configuration ---
DATA_DIR = r"K:\hackathon\New folder"  # Use a raw string (r"...") for Windows paths
COVID_DATA_FILE = "dataset.csv" # Replace with your actual data file
ICD_API_URL = "https://icd.who.int/icdapi"
DEFAULT_SEARCH_TYPE = "research"

# ICD API credentials (environment variables - KEEP THESE SECURE!)
CLIENT_ID = os.environ.get("YOUR_CLIENT_ID")  # Do NOT put credentials directly in the code
CLIENT_SECRET = os.environ.get("YOUR_CLIENT_SECRET")

# --- Data Loading and Preprocessing ---
def load_data(filepath):
    try:
        df = pd.read_csv(filepath)
        if df.empty:
            raise ValueError(f"Data file '{filepath}' is empty.")
        return df
    except (FileNotFoundError, pd.errors.EmptyDataError, pd.errors.ParserError) as e:
        print(f"Error loading '{filepath}': {e}")
        return None

covid_df = load_data(os.path.join(DATA_DIR, COVID_DATA_FILE))

# --- NLTK Initialization ---
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    if pd.isna(text) or not isinstance(text, str):
        return ""
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens if token.isalnum() and token not in stop_words]
    return " ".join(tokens)

if covid_df is not None:
    text_col = next((col for col in covid_df.columns if col in ('Title', 'Conditions')), None)
    if text_col:
        covid_df['processed_text'] = covid_df[text_col].apply(preprocess_text)
    else:
        print("No suitable text column ('Title' or 'Conditions') found in dataset.")
        exit()

# --- 2. Query Processing and Boolean Generation ---
@lru_cache(maxsize=1)  # Cache the access token
def get_icd_access_token():
    url = f"{ICD_API_URL}/Token"
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    data = {
        "client_id": CLIENT_ID,
        "client_secret": CLIENT_SECRET,
        "grant_type": "client_credentials"
    }
    try:
        response = requests.post(url, headers=headers, data=data)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
        token_data = response.json()
        return token_data.get("access_token")
    except requests.exceptions.RequestException as e:
        print(f"Error during access token retrieval: {e}")
        return None

def get_icd_codes(query):
    token = get_icd_access_token()
    if token is None:
        print("Error: Could not obtain ICD access token.")
        return []

    url = f"{ICD_API_URL}/GetICD11CodeInfo"
    headers = {
        "Accept": "application/json",
        "API-Version": "v2",
        "Authorization": f"Bearer {token}"
    }
    params = {"q": query, "useFlexisearch": "true"}

    try:
        response = requests.get(url, headers=headers, params=params)
        response.raise_for_status()
        data = response.json()
        icd_codes = [item["code"] for item in data.get("linearizations", []) if "code" in item]
        return icd_codes
    except requests.exceptions.RequestException as e:
        print(f"Error in ICD API request: {e}")
        return []

def generate_boolean_query(user_query, search_type=DEFAULT_SEARCH_TYPE):
    tokens = nltk.word_tokenize(user_query)
    keywords = [lemmatizer.lemmatize(token.lower()) for token in tokens if token.isalnum() and token not in stop_words]
    icd_codes = get_icd_codes(user_query)
    keywords.extend(icd_codes)
    return " AND ".join(keywords)

# --- 3. Search and Retrieval ---
def search_data(query_string, data_source):
    if data_source == "research" and covid_df is not None and 'processed_text' in covid_df.columns:
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(covid_df['processed_text'])
        query_vec = vectorizer.transform([query_string])
        similarity_scores = cosine_similarity(query_vec, tfidf_matrix)

        top_indices = similarity_scores[0].argsort()[::-1]
        results = covid_df.iloc[top_indices].head(50)  # Get top 50 results
    else:
        results = pd.DataFrame()  # Return empty dataframe if dataset is not loaded.

    return results

# --- 4. Flask App ---
app = Flask(__name__)

@app.route('/', methods=['GET', 'POST'])
def index():
    if request.method == 'POST':
        query = request.form.get('query', '').strip()
        if not query:
            error_message = "Please enter a search query"
            return render_template('index.html', error_message=error_message) # Added error handling for index.html too.

        search_type = request.form.get('search_type', DEFAULT_SEARCH_TYPE)
        boolean_query = generate_boolean_query(query, search_type)
        results = search_data(boolean_query, search_type)

        if results.empty:
            no_results_message = "No results found for your query."
            return render_template('results.html', message=no_results_message, query=query, num_results=0)

        return render_template('results.html', tables=[results.to_html(classes='data')], query=query, num_results=len(results))

    return render_template('index.html') # Corrected template name

if __name__ == '__main__':
    app.run(debug=True)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\koven\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\koven\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\koven\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\koven/nltk_data'
    - 'c:\\Users\\koven\\AppData\\Local\\Programs\\Python\\Python39\\nltk_data'
    - 'c:\\Users\\koven\\AppData\\Local\\Programs\\Python\\Python39\\share\\nltk_data'
    - 'c:\\Users\\koven\\AppData\\Local\\Programs\\Python\\Python39\\lib\\nltk_data'
    - 'C:\\Users\\koven\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - 'C:\\Users\\koven\\nltk_data'
**********************************************************************


In [None]:
import os
import requests
from functools import lru_cache

# ICD API credentials (environment variables - KEEP THESE SECURE!)
CLIENT_ID = os.environ.get("YOUR_CLIENT_ID") 
CLIENT_SECRET = os.environ.get("YOUR_CLIENT_SECRET") # Replace with your actual client secret
ICD_API_URL = "https://icd.who.int/icdapi"

@lru_cache(maxsize=1)  # Cache the access token
def get_icd_access_token():
    url = f"{ICD_API_URL}/Token"
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    data = {
        "client_id": CLIENT_ID,
        "client_secret": CLIENT_SECRET,
        "grant_type": "client_credentials"
    }
    try:
        response = requests.post(url, headers=headers, data=data)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
        token_data = response.json()
        return token_data.get("access_token")
    except requests.exceptions.RequestException as e:
        print(f"Error during access token retrieval: {e}")
        return None

def get_icd_codes(query):
    token = get_icd_access_token()
    if token is None:
        print("Error: Could not obtain ICD access token.")
        return []

    url = f"{ICD_API_URL}/GetICD11CodeInfo"
    headers = {
        "Accept": "application/json",
        "API-Version": "v2",
        "Authorization": f"Bearer {token}"
    }
    params = {"q": query, "useFlexisearch": "true"}

    try:
        response = requests.get(url, headers=headers, params=params)
        response.raise_for_status()
        data = response.json()
        icd_codes = [item["code"] for item in data.get("linearizations", []) if "code" in item]
        return icd_codes
    except requests.exceptions.RequestException as e:
        print(f"Error in ICD API request: {e}")
        return []

if __name__ == '__main__':
    query = "diabetes"  # Replace with your search query
    icd_codes = get_icd_codes(query)
    print("ICD Codes:", icd_codes)


Error during access token retrieval: 404 Client Error: Not Found for url: https://icd.who.int/icdapi/Token
Error: Could not obtain ICD access token.
ICD Codes: []


In [None]:
import os
import requests
from functools import lru_cache

# ICD API credentials (environment variables - KEEP THESE SECURE!)
CLIENT_ID = os.environ.get("YOUR_CLIENT_ID") 
CLIENT_SECRET = os.environ.get("YOUR_CLIENT_SECRET") # Replace with your actual client secret
ICD_API_URL = "https://icd.who.int/icdapi"

@lru_cache(maxsize=1)  # Cache the access token
def get_icd_access_token():
    url = f"{ICD_API_URL}/Token"
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    data = {
        "client_id": CLIENT_ID,
        "client_secret": CLIENT_SECRET,
        "grant_type": "client_credentials"
    }
    try:
        response = requests.post(url, headers=headers, data=data)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
        token_data = response.json()
        return token_data.get("access_token")
    except requests.exceptions.RequestException as e:
        print(f"Error during access token retrieval: {e}")
        return None

def get_icd_codes(query):
    token = get_icd_access_token()
    if token is None:
        print("Error: Could not obtain ICD access token.")
        return []

    url = f"{ICD_API_URL}/GetICD11CodeInfo"
    headers = {
        "Accept": "application/json",
        "API-Version": "v2",
        "Authorization": f"Bearer {token}"
    }
    params = {"q": query, "useFlexisearch": "true"}

    try:
        response = requests.get(url, headers=headers, params=params)
        response.raise_for_status()
        data = response.json()
        icd_codes = [item["code"] for item in data.get("linearizations", []) if "code" in item]
        return icd_codes
    except requests.exceptions.RequestException as e:
        print(f"Error in ICD API request: {e}")
        return []

if __name__ == '__main__':
    query = "diabetes"  # Replace with your search query
    icd_codes = get_icd_codes(query)
    print("ICD Codes:", icd_codes)


Error during access token retrieval: 404 Client Error: Not Found for url: https://icd.who.int/icdapi/Token
Error: Could not obtain ICD access token.
ICD Codes: []


In [None]:
from pymongo import MongoClient
import os

# MongoDB connection string
mongo_uri = "mongodb+srv://bejistojoseph801:mongodb2003@cluster0.aq48n.mongodb.net/"

# Create a MongoClient
client = MongoClient(mongo_uri)

# Get the database (replace 'your_database' with the actual database name)
db_name = "medical"  
db = client[db_name]

# Print the collections in the database
print(f"Collections in database '{db_name}':")
for collection_name in db.list_collection_names():
    print(collection_name)


ModuleNotFoundError: No module named 'pymongo'

In [12]:
import streamlit as st
from pymongo import MongoClient
import spacy
import pandas as pd

# Load spacy model
nlp = spacy.load("en_core_web_sm")

# MongoDB connection string
mongo_uri = "mongodb+srv://bejistojoseph801:mongodb2003@cluster0.aq48n.mongodb.net/"
client = MongoClient(mongo_uri)

# Check connection
try:
    client.admin.command('ping')
    st.success("Connected to MongoDB successfully!")
except Exception as e:
    st.error(f"Failed to connect to MongoDB: {e}")

# Verify database and collections
db = client["medical"]
collections = db.list_collection_names()
st.write("Available Collections in 'medical' Database:", collections)

# Preview data in each collection
for collection_name in collections:
    st.write(f"Previewing data from collection: {collection_name}")
    collection = db[collection_name]
    sample_data = list(collection.find().limit(5))  # Fetch 5 records for preview
    if sample_data:
        df_sample_data = pd.DataFrame(sample_data)
        st.dataframe(df_sample_data)
    else:
        st.write(f"No data found in {collection_name} collection.")


ModuleNotFoundError: No module named 'streamlit'

In [13]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\koven\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True