In [None]:
'''
Rec Engine
Notebook #4

This notebook installs the packages and modules that will be used for querying Qdrant to return the dataset matches.
The SentenceTransformer model is defined as in the prior notebooks, with the topic mappings set up the same way.

The bottom portion of this notebook is the actual "querying" function - which obtains the user's choice of domain, topic, and project type.
Based on the user's selections, the query selects the matching index rows, uses their embedding as the query vector, and obtains the candidate datasets from Qdrant.
The results are then ranked by the score that corresponds to the user's project choice.
'''

# Import packages and modules

In [None]:
!pip install qdrant_client

Collecting qdrant_client
  Downloading qdrant_client-1.14.2-py3-none-any.whl.metadata (10 kB)
Collecting portalocker<3.0.0,>=2.7.0 (from qdrant_client)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Downloading qdrant_client-1.14.2-py3-none-any.whl (327 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m327.7/327.7 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-2.10.1-py3-none-any.whl (18 kB)
Installing collected packages: portalocker, qdrant_client
Successfully installed portalocker-2.10.1 qdrant_client-1.14.2


In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
import os

import pandas as pd
from qdrant_client import QdrantClient, models
from tqdm import tqdm

In [None]:
# Load the sentence-embedding model that is later used to encode the topic strings
# into vectors (the weights are downloaded on first use).
model = SentenceTransformer('all-MiniLM-L6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Topic Mapping

In [None]:
# Nested lookup of domain -> topic -> list of subtopic keywords.
# Each (domain, topic, subtopic) triple is later joined into a text label and embedded,
# so every list entry must be its own string: a missing comma between two literals
# silently concatenates them into one bogus term, and misspellings end up in the
# embedded text.
# NOTE(review): if other notebooks in this series carry a copy of this mapping,
# apply the same corrections there so the text being embedded stays consistent.
topic_map = {
    'health': {
        'general health': [
            'medicine', 'hospital', 'insurance', 'nutrition', 'clinical',
            'diagnosis', 'drug', 'prescription'
        ],

        'mental health': [
            'depression', 'anxiety', 'therapy', 'ptsd', 'addiction', 'psychology',
            'psychiatry', 'music therapy', 'sleep'
        ],

        'public health': [
            'epidemiology', 'infectious disease', 'biostatistics', 'community health',
            'health economics', 'population health', 'social work'
        ],

        'medical fields': [
            'cardiology', 'endocrinology', 'gastroenterology', 'geriatrics',
            'hematology', 'hepatology', 'nephrology', 'neurology', 'oncology',
            'optometry', 'ophthalmology', 'pediatrics', 'pulmonology', 'radiology',
            'rheumatology'
        ],

        'reproduction': [
            'gynecology', 'maternal fetal medicine', 'midwifery', 'obstetrics',
            'infertility', 'contraception', 'birth control'
        ]
    },

    'finance': {
        'personal finance': [
            'budgeting', 'credit score', 'debt management', 'student loan',
            'retirement saving', 'personal taxes', 'financial literacy'
        ],

        'corporate finance': [
            'capital structure', 'financial statements', 'mergers and acquisitions',
            'cost accounting', 'investment analysis'
        ],

        'public finance': [
            'government spending', 'public debt', 'tax policy', 'municipal budgets',
            'stimulus programs'
        ],

        'investment': [
            'stocks', 'cryptocurrency', 'bonds', 'ETFs', 'portfolio optimization',
            'trading algorithms', 'market risk', 'interest rates'
        ],

        'banking': [
            'financial institution', 'central banking', 'commercial banking',
            'regulations', 'lending practices', 'bank failures', 'mortgages'
        ],

        'insurance and risk': [
            'health insurance', 'life insurance', 'property insurance',
            'actuarial models', 'risk assessment', 'premium pricing'
        ],

        'financial crime': [
            'money laundering', 'fraud detection', 'financial audits'
        ],

        'accounting': [
            'credit', 'accountant', 'taxes', 'budget', 'bookkeeping', 'equity'
        ],

        'real estate': [
            'mortgage', 'home loan'
        ],

        'economics': [
            'economy', 'scarcity', 'supply and demand', 'inflation', 'unemployment'
        ],

        'financial products': [
            'insurance', 'credit card', 'pension'
        ]
    },

    'infrastructure': {
        'transportation': [
            'vehicle', 'automobile', 'car', 'airplane', 'bus', 'transit', 'highway',
            'railway', 'bridge', 'tunnel'
        ],

        'utilities': [
            'sewage', 'electricity', 'water supply', 'trash collection',
            'garbage', 'recycling'
        ],

        'communication': [
            'internet', 'telephone', 'email', 'mail', 'broadcast', 'news',
            'television', 'radio'
        ],

        'energy': [
            'fossil fuel', 'coal', 'gasoline', 'oil', 'nuclear'
        ]
    },

    'environment': {
        'general environment': [
            'soil', 'ecology', 'biodiversity', 'pollution'
        ],

        'body of water': [
            'ocean', 'sea', 'lake', 'pond', 'river'
        ],

        'geology': [
            'magma', 'lava', 'igneous', 'metamorphic', 'sedimentary', 'rocks',
            'minerals'
        ],

        'weather': [
            'climate', 'meteorology', 'hurricane', 'tornado', 'volcano', 'thunder',
            'lightning'
        ]
    },

    'education': {
        'early childhood': [
            'preschool', 'kindergarten readiness', 'childhood development'
        ],

        'K-12': [
            'standardized testing', 'curriculum', 'STEM education', 'reading levels',
            'school funding', 'teacher performance'
        ],

        'higher education': [
            'college enrollment', 'financial aid', 'student retention',
            'graduate', 'college ranking'
        ],

        'adult learning': [
            'continuing education', 'vocational training', 'lifelong learning'
        ],

        'educational equity': [
            'special education', 'disability services', '504 plan', 'IEP'
        ]
    },

    'government': {
        'public policy': [
            'social policy', 'economic policy', 'health policy', 'environmental policy',
            'education policy', 'housing policy', 'transportation policy'
        ],

        'government operations': [
            'budgets', 'procurement', 'public services', 'e-governance', 'bureaucracy'
        ],

        'elections': [
            'democracy', 'republic', 'voter turnout', 'campaign finance',
            'gerrymandering', 'voter registration', 'election fraud',
            'ballot initiatives', 'electoral systems'
        ],

        'legislation': [
            'bills', 'statutes', 'rulemaking', 'lobbying', 'compliance'
        ],

        'public safety': [
            'justice', 'law enforcement', 'criminal justice', 'incarceration',
            'prison', 'fire department', 'disaster response', 'emergency services'
        ]
    }
}

In [None]:
#flatten the nested topic map into one record per (domain, topic, subtopic)
topic_list = [
    {'domain': domain_name, 'topic': topic_name, 'subtopic': subtopic_name}
    for domain_name, domain_topics in topic_map.items()
    for topic_name, subtopic_names in domain_topics.items()
    for subtopic_name in subtopic_names
]

#dataframe of topic choices, with a "domain > topic > subtopic" label per row
separator = ' > '
topic_choices = pd.DataFrame(topic_list)
topic_choices['concat'] = (
    topic_choices['domain']
    + separator + topic_choices['topic']
    + separator + topic_choices['subtopic']
)

#embed every label in a single batched call
vectors = model.encode(topic_choices['concat'].tolist(), show_progress_bar=True)

#store each row's embedding as a plain Python list
topic_choices['embedding'] = vectors.tolist()

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
#set up Qdrant client
# Credentials are read from the environment so real values never have to be saved in
# the notebook; the placeholder literals are only a fallback when the variables are unset.
QDRANT_URL = os.environ.get('QDRANT_URL', 'REDACTED')
QDRANT_API_KEY = os.environ.get('QDRANT_API_KEY', 'REDACTED')
COLLECTION_NAME = 'dataset_recommender'

client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

# Querying

In [None]:
# Project types offered to the user, in menu order
project_types = [
    'EDA',
    'prediction',
    'sentiment',
    'trend',
    'recommender',
    'dashboard',
    'pipeline',
    'nlp',
]

In [None]:
# Project type -> name of the payload field used as that project's ranking score
project_map = dict(
    EDA='EDA_score',
    prediction='pred_norm',
    sentiment='sentiment_norm',
    trend='trend_norm',
    recommender='rec_norm',
    dashboard='Dashboard_score',
    pipeline='pipeline_norm',
    nlp='nlp_norm',
)

In [None]:
# ---User Selections ---
def prompt_choice(heading, options, prompt):
  """Print a numbered menu and return the option the user picks.

  Args:
    heading: Title printed above the menu.
    options: Sequence of choices; they are numbered from 1 in the order given.
    prompt: Text passed to input() when asking for the number.

  Returns:
    The element of options that matches the number entered.

  Raises:
    ValueError: If options is empty, because no selection is possible.
  """
  if len(options) == 0:
    raise ValueError(f'No options available for: {heading}')

  print(f'\n{heading}')
  for number, option in enumerate(options, start=1):
    print(f'{number}. {option}')

  #re-prompt rather than crash on non-numeric input, and reject 0/negatives,
  #which would otherwise wrap around and silently pick from the end of the list
  while True:
    try:
      number = int(input(prompt))
    except ValueError:
      number = 0
    if 1 <= number <= len(options):
      return options[number - 1]
    print(f'Please enter a number from 1 to {len(options)}.')


#domains
domains = sorted(topic_choices['domain'].unique())
domain_choice = prompt_choice('Available Domains:', domains, 'Select a domain (number): ')

#get available topics, filtered by the domain choice
topics = sorted(topic_choices.loc[topic_choices['domain'] == domain_choice, 'topic'].unique())
topic_choice = prompt_choice(f'Available Topics from {domain_choice}:', topics,
                             'Select a topic (number): ')

#projects: the menu and the lookup both come from project_map, so the number shown
#to the user always matches the entry that gets selected
project_choice = prompt_choice('Project Types:', list(project_map),
                               'Select a project type (number): ')
score_col = project_map[project_choice]

#get vector
#NOTE(review): only the first subtopic row of the chosen topic supplies the query
#vector; the other subtopics of that topic are not used - confirm this is intended
selected_row = topic_choices[(topic_choices['domain'] == domain_choice) &
                             (topic_choices['topic'] == topic_choice)].iloc[0]
query_vector = selected_row['embedding']

#query qdrant (query_points replaces the deprecated client.search)
results = client.query_points(
    collection_name=COLLECTION_NAME,
    query=query_vector,
    limit=10,
    with_payload=True
).points

#re-rank the nearest neighbours by the score field for the chosen project type;
#a missing or null score counts as 0
sorted_results = sorted(
    results,
    key=lambda point: (point.payload or {}).get(score_col) or 0,
    reverse=True
)

print('\nTop Dataset Matches:')
for rank, res in enumerate(sorted_results, start=1):
  payload = res.payload or {}
  #a description stored as null would otherwise fail on slicing
  description = payload.get('description') or ''
  print(f"\n{rank}. {payload.get('title', 'No title')}")
  print(f"Description: {description[:150]}...")
  print(f"Link: {payload.get('landing_page', 'No link')}")


Available Domains:
1. education
2. environment
3. finance
4. government
5. health
6. infrastructure
Select a domain (number): 5

Available Topics from health:
1. general health
2. medical fields
3. mental health
4. public health
5. reproduction
Select a topic (number): 3

Project Types:
1. EDA
2. prediction
3. sentiment
4. trend
5. recommender
6. dashboard
7. pipeline
8. nlp
Select a project type (number): 8

Top Dataset Matches:

1. VA-OHE-NVHER-FY13-Sociodemographic-Mental-Illness
Description: Summary level data from the National Veteran Health Equity Report - FY2013, filtered by mental illness....
Link: https://www.data.va.gov/d/rtmm-hjaz

2. Strategic Measure_Percentage of people who report 5 or more poor mental health days within the last 30 days
Description: This data is a summary of the prevalence of residents in Travis County who reported experiencing 5 or more days of poor mental health in the past 30 d...
Link: https://data.austintexas.gov/d/tncx-hyqy

3. SHIP Suicide Rate 2

  results = client.search(
