# **Prediction of Genetic Associations in ALS Through NLP and Complex Network Analysis**

This research project aims to predict candidate genes in Amyotrophic Lateral Sclerosis (ALS) using Natural Language Processing (NLP) and complex network analysis techniques.

- **Author:** João Pedro Viguini T. T. Correa  
- **Supervisor:** Prof. Dr. Ricardo Cerri

This research is supported by FAPESP (2025/06512-0)


# Package Installation

- **[IMPORTANT]** Please ensure that all necessary files are downloaded from the GitHub repository and uploaded to your Google Drive.

- The steps below detail the installation process for the required packages and models used in this study.


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -r ./drive/MyDrive/IC_2025/requirements.txt

Collecting numpy==1.26.4 (from -r ./drive/MyDrive/IC_2025/requirements.txt (line 1))
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting spacy==3.7.5 (from -r ./drive/MyDrive/IC_2025/requirements.txt (line 2))
  Downloading spacy-3.7.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting beautifulsoup4==4.13.4 (from -r ./drive/MyDrive/IC_2025/requirements.txt (line 4))
  Downloading beautifulsoup4-4.13.4-py3-none-any.whl.metadata (3.8 kB)
Collecting biopython==1.85 (from -r ./drive/MyDrive/IC_2025/requirements.txt (line 5))
  Downloading biopython-1.85-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting fasttext==0.9.3 (f

Install the spaCy model used for NER and tokenization. The model archive (`en_ner_bionlp13cg_md-0.5.4.tar.gz`) must already be in your Google Drive folder (`MyDrive/IC_2025/`).

In [None]:
!pip install --upgrade drive/MyDrive/IC_2025/en_ner_bionlp13cg_md-0.5.4.tar.gz --no-deps

Processing ./drive/MyDrive/IC_2025/en_ner_bionlp13cg_md-0.5.4.tar.gz
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: en_ner_bionlp13cg_md
  Building wheel for en_ner_bionlp13cg_md (setup.py) ... [?25l[?25hdone
  Created wheel for en_ner_bionlp13cg_md: filename=en_ner_bionlp13cg_md-0.5.4-py3-none-any.whl size=119814705 sha256=0bc22cf48855631d47cdb126527dbdefc6e2f4b131bea7268a4041437a8e022d
  Stored in directory: /root/.cache/pip/wheels/c9/fb/8b/4bbf308c03bde2232c00f76c958d4e7bcf7b4f2874c9b2159c
Successfully built en_ner_bionlp13cg_md
Installing collected packages: en_ner_bionlp13cg_md
Successfully installed en_ner_bionlp13cg_md-0.5.4


In [None]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.6/26.6 MB[0m [31m41.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.2/38.2 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scipy, gensim
  Attempting uninstall: scipy
    Found existing installation: scipy 1.16.2
    Uninstalling scipy-1.16.2:
      Successfully 

# Import


**Restart the session before running this cell.**

In [None]:
import re
import time
import urllib
import numpy as np
import pandas as pd
from Bio import Entrez
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
from tqdm import tqdm

try:
    import mygene
    MYGENE_AVAILABLE = True
except Exception:
    print("mygene not available")
    MYGENE_AVAILABLE = False

# Configs

In [None]:
import os  # needed in this cell only to read the API key from the environment

Entrez.email = "jpvviguini@gmail.com" # replace

# SECURITY: do not hardcode the NCBI API key in the notebook (it is shared through
# GitHub/Drive). Export it as the NCBI_API_KEY environment variable before running
# this cell. When the variable is unset API_KEY is None.
# NOTE(review): a key that was committed earlier should be revoked in the NCBI account.
API_KEY = os.environ.get("NCBI_API_KEY")

YEAR_START = 2000
YEAR_END = 2010
MAX_ARTICLES = 100 # how many articles you want to retrieve
SLEEP_TIME = 0.37          # pause (seconds) between efetch batches
MAX_RETRIES = 3            # attempts per search page before giving up
CHUNK_SIZE = 200           # PMIDs requested per efetch call
VALIDATE_WITH_MYGENE = True
MYGENE_BATCH_SIZE = 1000   # symbols sent per mygene querymany call


# PubMed query: ALS / motor neuron disease terms AND genetics-related terms.
BASE_QUERY = (
    '("amyotrophic lateral sclerosis"[tiab] OR "motor neuron disease"[tiab] OR MND[tiab] OR ALS[tiab]) AND '
    '("gene"[tiab] OR "genes"[tiab] OR genetic[tiab] OR mutation*[tiab] OR polymorphism*[tiab] OR "Genome-Wide Association Study"[Mesh] OR GWAS[tiab])'

)
# )
# BASE_QUERY = (
#   '( "gene[tiab]" OR "genes[tiab]" OR genetic[tiab] '
#   'OR mutation*[tiab] OR polymorphism*[tiab] '
#   'OR variant*[tiab] OR SNP[tiab] OR SNPs[tiab] '
#   'OR loci[tiab] OR locus[tiab] '
#   'OR GWAS[tiab] OR "genome-wide association"[tiab] '
#   'OR expression[tiab] ) '
#   'AND '
#   '( association*[tiab] OR relationship*[tiab] '
#   'OR correlation*[tiab] OR interaction*[tiab] '
#   'OR linkage[tiab] OR "risk factor*"[tiab] '
#   'OR susceptib*[tiab] OR regulat*[tiab] )'
# )


# Loading NLP models

In [None]:
print("Loading NLP models...")
# Stays None when the model cannot be loaded; extract_genes_unbiased checks for
# None and then relies on its regex patterns only.
nlp_ner = None

# spaCy NER
try:
    # The tagger and parser components are disabled when loading the pipeline.
    nlp_ner = spacy.load("en_ner_bionlp13cg_md", disable=["tagger", "parser"])
except Exception as e:
    # Best-effort: report the failure and continue without NER.
    print("Warning: could not load en_ner_bionlp13cg_md (NER). Error:", e)



Loading NLP models...


  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


# Preprocessing functions

In [None]:
def clean_text(text):
    """
    Turn raw article text into a lowercase, single-spaced string.

    Markup tags and every character other than ASCII letters, digits,
    whitespace and hyphens are replaced by spaces before whitespace is
    collapsed. Anything that is not a string yields an empty string.
    """
    if isinstance(text, str):
        without_tags = re.sub(r'<[^>]+>', ' ', text)
        allowed_chars_only = re.sub(r'[^a-zA-Z0-9\s\-]', ' ', without_tags)
        single_spaced = re.sub(r'\s+', ' ', allowed_chars_only)
        return single_spaced.strip().lower()

    return ""



# Uppercase tokens that must never be reported as gene candidates, even when the
# NER model or the regex patterns below pick them up.
GENE_STOPWORDS = {
    # common English words
    "THE", "AND", "WITH", "FOR", "WAS", "WERE", "ARE", "OUR", "FROM",
    "THIS", "THAT", "THAN",
    "OF", "IN", "TO", "ON", "BY", "AS", "AN", "OR", "IS", "BE", "WE",
    "NOT", "THESE", "HAVE", "HAS", "WITHIN", "FOUND", "US", "INCREASE", "IMPACT",
    # generic research vocabulary
    "DISEASE", "PATIENT", "PATIENTS", "GENETIC", "RISK",
    "STUDY", "GENE", "GENES", "ANALYSIS", "RESULT", "RESULTS", "DATA", "MODEL",
    "MODELS", "TYPE", "CASE", "CASES",
    # disease and biology terms that are not gene symbols
    "ALS", "LATERAL", "SCLEROSIS", "MOTOR",
    "NEURON", "DNA", "RNA", "PROTEIN", "CELL", "CELLS", "TISSUE", "BRAIN",
    "NEURONS", "MOUSE", "MICE",
}


# Regex fallback patterns; extract_genes_unbiased applies them to upper-cased text.
REGEX_PATTERNS = [
    r"\bC\d+ORF\d+\b",           # open reading frame names, e.g. C9ORF72
    r"\bRS\d{3,9}\b",              # SNP ids, e.g. rs123456 (upper-cased)
    r"\b[A-Z]{2,4}-\d{1,3}\b",    # hyphenated names, e.g. ABC-1, TDP-43
    r"\b[A-Z]{3,6}[0-9]{0,3}\b"  # letters plus optional digits, e.g. SOD1, TP53
]


# Gene extraction using NER

In [None]:
def extract_genes_unbiased(text):
    """
    Collect gene-like symbols from a piece of text.

    The NER model (when loaded) runs on the text as given, so callers should
    pass the original text with its capitalisation and punctuation. The regex
    patterns in REGEX_PATTERNS then run on an upper-cased copy.

    Returns a sorted list of unique upper-case tokens; an empty list for
    empty/None input.
    """
    if not text:
        return []

    raw_text = str(text)
    upper_text = raw_text.upper()
    found = set()

    # Pass 1: NER entities whose label mentions GENE or PROTEIN
    # (a GENE_PRODUCT label is covered by the GENE check).
    if nlp_ner is not None:
        try:
            for entity in nlp_ner(raw_text).ents:
                entity_label = getattr(entity, 'label_', '').upper()
                if 'GENE' not in entity_label and 'PROTEIN' not in entity_label:
                    continue

                # keep letters and digits only, then apply length and stopword filters
                symbol = re.sub(r'[^A-Za-z0-9]', '', entity.text).upper()
                if 3 <= len(symbol) <= 10 and symbol not in GENE_STOPWORDS:
                    found.add(symbol)
        except Exception:
            # NER is best-effort: on failure keep whatever was collected and
            # carry on with the regex pass
            pass

    # Pass 2: regex patterns on the upper-cased text
    for pattern in REGEX_PATTERNS:
        for token in re.findall(pattern, upper_text):
            if not token or token in GENE_STOPWORDS:
                continue
            # require at least 3 alphanumeric characters
            if len(re.sub(r'[^A-Z0-9]', '', token)) >= 3:
                found.add(token)

    # drop tokens that contain no letter at all
    return sorted(symbol for symbol in found if re.search(r'[A-Z]', symbol))


# validation with mygene
def validate_genes_with_mygene(candidate_genes):
    """
    Look candidate symbols up in mygene, MYGENE_BATCH_SIZE symbols per request.

    Returns the set of upper-cased `symbol` values from the hits that are not
    flagged `notfound` and whose `taxid` is 9606 or missing. Returns an empty
    set when mygene could not be imported. A batch that raises is reported
    and skipped.
    """
    if not MYGENE_AVAILABLE:
        print("mygene not available; skipping validation.")
        return set()

    client = mygene.MyGeneInfo()
    confirmed = set()
    pending = list(candidate_genes)

    for offset in range(0, len(pending), MYGENE_BATCH_SIZE):
        chunk = pending[offset:offset + MYGENE_BATCH_SIZE]
        try:
            hits = client.querymany(chunk, scopes=['symbol', 'alias', 'name'], fields='symbol,taxid', species='human', entrezonly=False)
            for hit in hits:
                # skip None entries, non-dict entries and explicit "notfound" records
                if not isinstance(hit, dict) or hit.get('notfound', False):
                    continue

                symbol = hit.get('symbol')
                tax = hit.get('taxid')

                # accept human (taxid 9606) hits, and hits that carry no taxid at all
                if symbol and (tax is None or int(tax) == 9606):
                    confirmed.add(symbol.upper())
        except Exception as e:
            # on error, report it and move on to the next batch
            print(f"mygene query batch failed: {e}")

    return confirmed


# Pubmed article collection

In [None]:

def safe_read_abstract(article):
    """
    Return the abstract of a parsed PubMed record as one string.

    Parameters
    ----------
    article : mapping
        Record with the layout article['MedlineCitation']['Article'], whose
        optional 'Abstract' entry holds 'AbstractText'.

    Returns
    -------
    str
        The non-empty abstract sections joined by single spaces, or '' when
        the record has no abstract or does not have the expected layout.
    """
    try:
        art = article['MedlineCitation']['Article']
        abstract_field = art.get('Abstract')
        if not abstract_field:
            return ''
        abstract_text = abstract_field.get('AbstractText')
        if not abstract_text:
            return ''

        # A lone string or dict is one section. Iterating it directly would walk
        # over characters (or dict keys) and produce a garbled abstract.
        if isinstance(abstract_text, (str, dict)):
            abstract_text = [abstract_text]

        abstract_parts = []
        for section in abstract_text:
            if isinstance(section, dict):
                # dict-shaped section: prefer its text, fall back to its label
                txt = section.get('#text') or section.get('label') or section.get('Label') or ''
                abstract_parts.append(str(txt))
            else:
                abstract_parts.append(str(section))
        return ' '.join(part for part in abstract_parts if part)
    except Exception:
        # deliberately best-effort: a malformed record yields no abstract
        return ''

def get_als_genetic_articles(query, start_year, end_year, max_articles=20000):
    """
    Search PubMed for `query` within a publication-date window and download
    the matching records.

    Parameters
    ----------
    query : str
        PubMed search term.
    start_year, end_year : int or str
        Passed as `mindate` / `maxdate` with `datetype="pdat"`.
    max_articles : int
        Upper bound on the number of search hits that are fetched.

    Returns
    -------
    list of dict
        One dict per article with the keys "pmid", "title", "abstract" and
        "text" (title and abstract joined by a space). Errors are printed,
        not raised; whatever was collected up to that point is returned.
    """
    page_size = 10000  # number of PMIDs requested per esearch page
    all_articles = []
    search_args = dict(
        db="pubmed",
        term=query,
        mindate=str(start_year),
        maxdate=str(end_year),
        datetype="pdat",
        api_key=API_KEY,
    )
    try:
        # First request only asks for the hit count (retmax=0).
        handle = Entrez.esearch(retmax=0, **search_args)
        try:
            result = Entrez.read(handle)
        finally:
            handle.close()
        total = int(result.get("Count", 0))
        print(f"Found {total} articles between {start_year}-{end_year}")

        if total == 0:
            return []

        wanted = min(total, max_articles)
        for retstart in range(0, wanted, page_size):
            # Ask only for what is still needed, so max_articles is honoured
            # even when it is smaller than one page.
            page_limit = min(page_size, wanted - retstart)
            for retry in range(MAX_RETRIES):
                # Collected per attempt and merged only on success; a retry
                # therefore cannot append the same records twice.
                page_articles = []
                try:
                    handle = Entrez.esearch(retmax=page_limit, retstart=retstart, **search_args)
                    try:
                        search_result = Entrez.read(handle)
                    finally:
                        handle.close()
                    id_list = search_result.get("IdList", [])

                    for i in tqdm(range(0, len(id_list), CHUNK_SIZE)):
                        batch = id_list[i:i+CHUNK_SIZE]
                        fetch_handle = Entrez.efetch(
                            db="pubmed",
                            id=batch,
                            retmode="xml",
                            api_key=API_KEY
                        )
                        try:
                            data = Entrez.read(fetch_handle)
                        except Exception as e:
                            # Skip an unparsable batch, but say so instead of
                            # dropping it silently.
                            print(f"Could not parse batch at offset {retstart + i}: {e}")
                            data = {}
                        finally:
                            fetch_handle.close()

                        for article in data.get('PubmedArticle', []):
                            try:
                                title = article['MedlineCitation']['Article'].get('ArticleTitle', '')
                                abstract = safe_read_abstract(article)
                                pmid = str(article['MedlineCitation']['PMID'])
                                text = f"{title} {abstract}".strip()
                                page_articles.append({
                                    "pmid": pmid,
                                    "title": title,
                                    "abstract": abstract,
                                    "text": text
                                })
                            except KeyError:
                                # record without the expected fields
                                continue
                        time.sleep(SLEEP_TIME)

                    all_articles.extend(page_articles)
                    break
                except Exception as e:
                    print(f"Attempt {retry+1} failed: {e}")
                    time.sleep(2 ** retry)  # exponential back-off before retrying
    except Exception as e:
        print(f"Fatal error in PubMed query: {e}")
    return all_articles

# Useful functions

- For training models and calculating the ranking

In [None]:
import os
import fasttext
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from collections import defaultdict
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

def get_embeddings(gene_list, model, model_type):
    """
    Look up embedding vectors for the genes that exist in the model vocabulary.

    Parameters
    ----------
    gene_list : list of str
        Gene symbols; each is lower-cased for the vocabulary lookup.
    model : object
        For 'fasttext': must provide get_word_id(word) (-1 when the word is
        unknown) and get_word_vector(word). For 'word2vec': must provide a
        `wv` attribute supporting `word in wv` and `wv[word]`.
    model_type : str
        'fasttext' or 'word2vec'.

    Returns
    -------
    (numpy.ndarray, list of str)
        The vectors of the genes found in the vocabulary and, aligned with
        them, those genes exactly as spelled in `gene_list`.

    Raises
    ------
    ValueError
        If `model_type` is not supported.
    """
    if model_type == 'fasttext':
        def in_vocabulary(token):
            return model.get_word_id(token) != -1

        def vector_of(token):
            return model.get_word_vector(token)

    elif model_type == 'word2vec':
        def in_vocabulary(token):
            return token in model.wv

        def vector_of(token):
            return model.wv[token]

    else:
        raise ValueError(f"Unsupported model type: {model_type}")

    # Single pass keeps genes and vectors aligned without the quadratic
    # list-membership test a second filtering pass would need.
    kept_genes = []
    vectors = []
    for gene in gene_list:
        token = gene.lower()
        if in_vocabulary(token):
            kept_genes.append(gene)
            vectors.append(vector_of(token))

    return np.array(vectors), kept_genes


def calculate_ranking_cosine(genes, model, model_type: str, known_als_genes=None):
    """
    Rank candidate genes by their highest cosine similarity to any known ALS gene.

    Parameters
    ----------
    genes : iterable of str
        Candidate gene symbols.
    model, model_type
        Forwarded to get_embeddings.
    known_als_genes : iterable of str, optional
        Reference genes; a built-in list is used when None.

    Returns
    -------
    pandas.DataFrame
        Columns 'gene', 'sim_raw' (best cosine similarity) and 'sim_raw_norm'
        (min-max scaled 'sim_raw'), sorted by 'sim_raw' descending. An empty
        DataFrame when no reference or no candidate gene has an embedding.
    """
    # fall back to the built-in reference genes
    if known_als_genes is None:
        known_als_genes = {
            "ANXA11", "C9ORF72", "CHCHD10", "EPHA4", "FUS", "HNRNPA1", "KIF5A", "NEK1",
            "OPTN", "PFN1", "SOD1", "TARDBP", "TDP-43", "TDP43", "TBK1", "UBQLN2",
            "UNC13A", "VAPB", "VCP"
        }
    reference_symbols = {symbol.upper() for symbol in known_als_genes}

    # reference side
    reference_vectors, reference_found = get_embeddings(
        list(reference_symbols), model, model_type
    )
    if len(reference_found) == 0:
        print("No known genes found in model.")
        return pd.DataFrame()
    reference_vectors = normalize(reference_vectors, axis=1)

    # candidate side
    candidate_symbols = [symbol.upper() for symbol in genes]
    candidate_vectors, candidate_found = get_embeddings(
        candidate_symbols, model, model_type
    )
    if len(candidate_found) == 0:
        print("No candidate embeddings generated.")
        return pd.DataFrame()
    candidate_vectors = normalize(candidate_vectors, axis=1)

    # Best similarity of each candidate against all reference genes.
    # NOTE: a candidate that is itself a reference gene is still compared
    # with its own vector; self-matches are not masked out.
    best_similarity = cosine_similarity(candidate_vectors, reference_vectors).max(axis=1)

    ranking = pd.DataFrame({
        'gene': [symbol.upper() for symbol in candidate_found],
        'sim_raw': best_similarity
    })

    # min-max scaling; the small epsilon avoids division by zero
    lowest, highest = ranking['sim_raw'].min(), ranking['sim_raw'].max()
    ranking['sim_raw_norm'] = (ranking['sim_raw'] - lowest) / (highest - lowest + 1e-9)

    return ranking.sort_values('sim_raw', ascending=False)



def calculate_ranking_dot_product(genes, model, model_type: str):
    """
    Rank candidate genes by the dot product of their unit-length embedding
    with the unit-length embedding of the word "als".

    Returns a DataFrame with the columns 'gene' (upper-case) and
    'dot_with_als', sorted descending with a fresh index. An empty DataFrame
    is returned when no candidate, or the word "als", is in the vocabulary.
    Raises ValueError for an unsupported `model_type`.
    """
    lowered_genes = [symbol.lower() for symbol in genes]

    gene_vectors, found_genes = get_embeddings(lowered_genes, model, model_type)
    if len(found_genes) == 0:
        print("No candidate gene found in model.")
        return pd.DataFrame()
    gene_vectors = normalize(gene_vectors, axis=1)

    # embedding of the disease term itself
    anchor_word = "als"
    if model_type == 'fasttext':
        if model.get_word_id(anchor_word) == -1:
            print("'ALS' not found in fastText vocab")
            return pd.DataFrame()
        anchor_vector = model.get_word_vector(anchor_word)
    elif model_type == 'word2vec':
        if anchor_word not in model.wv:
            print("'ALS' not found in word2vec vocab")
            return pd.DataFrame()
        anchor_vector = model.wv[anchor_word]
    else:
        raise ValueError(f"Unsupported model type: {model_type}")

    # scale to unit length so the dot product equals the cosine similarity
    anchor_vector = anchor_vector / np.linalg.norm(anchor_vector)
    scores = gene_vectors @ anchor_vector

    ranking = pd.DataFrame({
        'gene': [symbol.upper() for symbol in found_genes],
        'dot_with_als': scores
    })
    return ranking.sort_values('dot_with_als', ascending=False).reset_index(drop=True)





# Helpful functions

In [None]:

# txt for fasttext
def save_corpus_for_fasttext(df, filepath="fasttext_corpus.txt"):
    """
    Write the 'clean_text' column to a UTF-8 text file, one document per line.

    Values that are not strings are written as empty lines instead of raising.
    An empty 'clean_text' comes back as NaN after a CSV round-trip, and
    NaN + "\\n" would otherwise fail.
    """
    with open(filepath, "w", encoding="utf-8") as f:
        for text in df['clean_text']:
            line = text if isinstance(text, str) else ""
            f.write(line + "\n")

    print(f"Corpus saved to {filepath}")



def get_word2vec_model(df, corpus_path="word2vec_corpus.txt", model_path=f"word2vec_model{YEAR_END}.bin"):
    """
    Load the Word2Vec model stored at `model_path`, or train one on
    df['clean_text'] and save it there.

    Parameters
    ----------
    df : DataFrame-like
        Must provide a 'clean_text' column; only used when training.
    corpus_path : str
        Not used by this function (training reads from `df`).
    model_path : str
        Where the model is loaded from / saved to.

    Returns
    -------
    gensim.models.Word2Vec
    """
    if os.path.exists(model_path):
        print(f"Loading existing Word2Vec model from {model_path}...")
        model = Word2Vec.load(model_path)

    else:
        print("Training new Word2Vec model...")

        # Build sentences for training. Non-string values are skipped: an empty
        # 'clean_text' comes back as NaN after a CSV round-trip, and
        # simple_preprocess cannot tokenize it.
        sentences = [
            simple_preprocess(text)
            for text in df['clean_text']
            if isinstance(text, str)
        ]

        model = Word2Vec(
            sentences=sentences,
            vector_size=300,  # embedding dimension
            window=5,         # context window
            min_count=2,      # ignore words that appear < 2 times
            workers=4,
            sg=1              # skip-gram
        )

        model.save(model_path)
        print(f"Word2Vec model trained and saved to {model_path}")

    return model




# train or load fastText model
def get_fasttext_model(corpus_path="fasttext_corpus.txt", model_path=f"fasttext_model_{YEAR_END}.bin", force_retrain=False):
    """
    Return a fastText model: the one stored at `model_path` when it exists and
    `force_retrain` is False, otherwise a skip-gram model trained on the text
    file at `corpus_path` and then saved to `model_path`.
    """
    reuse_saved_model = (not force_retrain) and os.path.exists(model_path)
    if reuse_saved_model:
        print(f"Loading pre-trained model from {model_path}")
        return fasttext.load_model(model_path)

    print("Training new FastText model...")
    training_options = {
        'model': 'skipgram',
        'dim': 300,       # embedding dimension
        'epoch': 10,
        'minn': 3,        # shortest character n-gram
        'maxn': 6,        # longest character n-gram
        'ws': 10,         # context window size
        'lr': 0.05,
        'minCount': 2,    # ignore words that appear < 2 times
        'thread': 4,
    }
    trained_model = fasttext.train_unsupervised(corpus_path, **training_options)

    trained_model.save_model(model_path)
    print(f"Model saved to {model_path}")

    return trained_model


# Metrics

In [None]:

# precision at k - fraction of relevant items in top k results
def calculate_precision_at_k(ranked_list, validation_set, k):
    """
    Precision@k: number of distinct validation items among the first k entries
    of `ranked_list`, divided by k. Returns 0 when k is not positive.
    """
    head = ranked_list[:k]
    matched = set(head) & validation_set

    if k > 0:
        return len(matched) / k
    return 0

# recall at k - fraction of relevant items found in top k
def calculate_recall_at_k(ranked_list, validation_set, k):
    """
    Recall@k: number of distinct validation items among the first k entries of
    `ranked_list`, divided by the size of `validation_set`. Returns 0 for an
    empty validation set.
    """
    matched = set(ranked_list[:k]) & validation_set
    total_relevant = len(validation_set)

    if total_relevant > 0:
        return len(matched) / total_relevant
    return 0

# mean reciprocal rank of first relevant item
def calculate_mrr(ranked_list, validation_set):
    """
    Reciprocal rank of the first entry of `ranked_list` that belongs to
    `validation_set` (1 for the first position, 1/2 for the second, ...).
    Returns 0.0 when no entry matches.
    """
    return next(
        (1 / rank
         for rank, item in enumerate(ranked_list, start=1)
         if item in validation_set),
        0.0,
    )

# Main

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import average_precision_score, ndcg_score
from collections import defaultdict
import os
import torch
from transformers import AutoTokenizer, AutoModel
import ast
from gensim.models import Word2Vec

# Embedding backend used by the pipeline below; it is also passed as
# `model_type` to the ranking functions.
MODEL_CHOICE = 'fasttext' # choose the model: 'fasttext' or 'word2vec'

MODEL_MAP = {} # we may include more models in this dict

# Known ALS genes: used as the reference set for the cosine ranking and removed
# from the results when listing novel candidates. Both "TDP-43" and "TDP43"
# spellings are listed alongside "TARDBP".
VALIDATION_GENES = {
    "ANXA11", "C9ORF72", "CHCHD10", "EPHA4", "FUS", "HNRNPA1", "KIF5A", "NEK1",
    "OPTN", "PFN1", "SOD1", "TARDBP", "TDP-43", "TDP43", "TBK1", "UBQLN2",
    "UNC13A", "VAPB", "VCP"
}
# Upper-case every symbol so membership tests against upper-cased gene names work.
VALIDATION_GENES = {g.upper() for g in VALIDATION_GENES}


if __name__ == "__main__":
    # Pipeline: load (or download) the articles -> extract/validate genes ->
    # load (or train) the embedding model -> rank genes -> print and save results.

    print("Starting pipeline...\n")

    # ------------------------------------------------------------------
    # data loading and processing
    # ------------------------------------------------------------------
    csv_path = "./drive/MyDrive/IC_2025/als_articles_2010_3173_biased_updated.csv"
    df = pd.DataFrame()

    if os.path.exists(csv_path):
        try:
            df = pd.read_csv(csv_path)
            if not df.empty and 'genes' in df.columns and isinstance(df['genes'].iloc[0], str):
                # The CSV stores each gene list as its string repr. literal_eval
                # parses Python literals only, so a tampered CSV cannot execute
                # code the way eval() would.
                df['genes'] = df['genes'].apply(ast.literal_eval)

                # keep only the articles that mention at least one gene
                df = df[df['genes'].apply(len) > 0]

            print(f"Existing data loaded from {csv_path} ({len(df)} articles).")
        except (pd.errors.EmptyDataError, pd.errors.ParserError):
            print("Error: the dataframe is empty or this is another unknown error")

    else:
        # Download in two-year windows: (2000, 2001), (2001, 2002), ...
        # Note: the file names below are still fixed to the 2010 run.
        collected = []
        for year in range(YEAR_START, YEAR_END):
            collected.extend(
                get_als_genetic_articles(BASE_QUERY, year, year + 1, max_articles=10000)
            )

        df = pd.DataFrame(collected)

        # Consecutive windows share a year, so the same PMID is returned twice;
        # keep its first occurrence only.
        if not df.empty:
            df = df.drop_duplicates(subset='pmid').reset_index(drop=True)

        df.to_csv(f"./drive/MyDrive/IC_2025/als_articles_2010_{len(df)}_biased_updated.csv", index=False)
        print("Total collected:", len(df))

    # ------------------------------------------------------------------
    # extracting and validating genes
    # ------------------------------------------------------------------
    if not df.empty:
        if 'clean_text' not in df.columns:
            df['text'] = df['text'].fillna('')
            df['clean_text'] = df['text'].apply(clean_text)
        if 'genes' not in df.columns:
            df['genes'] = df['text'].apply(extract_genes_unbiased)

            # VALIDATE_WITH_MYGENE is the configuration switch for this step
            # (VALIDATION_GENES is a non-empty set and therefore always truthy).
            if VALIDATE_WITH_MYGENE and MYGENE_AVAILABLE:
                valid_genes = validate_genes_with_mygene(set(g for gl in df['genes'] for g in gl))
                df['genes'] = df['genes'].apply(lambda genes: [g for g in genes if g in valid_genes])

        df.to_csv(csv_path, index=False)

        print(f"Processed data saved to '{csv_path}'.")

        # --------------------------------------------------------------
        # loading/training the model
        # --------------------------------------------------------------
        embedding_model = None

        if MODEL_CHOICE == 'fasttext':
            print("Initializing fastText...")

            corpus_filepath = "fasttext_corpus_2010.txt"
            model_filepath = "./drive/MyDrive/IC_2025/fasttext_model_2010_3173_biased_updated.bin"

            if not os.path.exists(corpus_filepath):
                save_corpus_for_fasttext(df, filepath=corpus_filepath)

            embedding_model = get_fasttext_model(corpus_path=corpus_filepath, model_path=model_filepath)

        elif MODEL_CHOICE == 'word2vec':
            print("Initializing word2vec...")

            corpus_filepath = "fasttext_corpus.txt" # change for a general txt
            model_filepath = "./drive/MyDrive/IC_2025/word2vec_model_2010_3173_biased_updated.bin"

            if not os.path.exists(corpus_filepath):
                save_corpus_for_fasttext(df, filepath=corpus_filepath)

            embedding_model = get_word2vec_model(df, corpus_path=corpus_filepath, model_path=model_filepath)

        # --------------------------------------------------------------
        # ranking
        # --------------------------------------------------------------
        if embedding_model is not None:

            all_genes = list({g.upper() for gene_list in df['genes'] for g in gene_list})

            ranked_genes_full = calculate_ranking_cosine(
                genes=all_genes,
                model=embedding_model,
                model_type=MODEL_CHOICE,
                known_als_genes=VALIDATION_GENES
            )

            ranked_dot_product = calculate_ranking_dot_product(
                genes=all_genes,
                model=embedding_model,
                model_type=MODEL_CHOICE
            )

            # TODO: performance metrics (P@k, R@k, MAP, nDCG, MRR) on
            # ranked_genes_full are not computed yet.

            # filtering for novel candidates
            # (an empty ranking has no 'gene' column, so it is passed through as is)
            if ranked_genes_full.empty:
                ranked_novel_genes = ranked_genes_full
            else:
                ranked_novel_genes = ranked_genes_full[~ranked_genes_full['gene'].isin(VALIDATION_GENES)]

            # to make temporal analysis --> use ranked_genes_full
            # to discover new candidate genes --> use ranked_novel_genes

            if not ranked_novel_genes.empty:
                print(f"\n--- TOP 20 NOVEL CANDIDATES (Model: {MODEL_CHOICE.upper()}) ---")
                for i, row in enumerate(ranked_novel_genes.head(20).itertuples(), 1):
                    print(f"{i}. {row.gene.upper():<10} | Sim: {row.sim_raw_norm:.4f}")

                output_filename = f'als_novel_gene_candidates_{MODEL_CHOICE}.csv'
                ranked_novel_genes.to_csv(output_filename, index=False)
                print(f"\nResults with novel genes saved to '{output_filename}'")

            if not ranked_dot_product.empty:
                print(f"\n--- TOP 20 GENES RANKED BY DOT PRODUCT WITH 'ALS' (Model: {MODEL_CHOICE.upper()}) ---")
                for i, row in enumerate(ranked_dot_product.head(20).itertuples(), 1):
                    print(f"{i}. {row.gene:<10} | Dot with ALS: {row.dot_with_als:.4f}")

                ranked_dot_validation = ranked_dot_product[
                    ranked_dot_product['gene'].apply(lambda g: str(g).upper() in VALIDATION_GENES)
                ]

                # Test the filtered frame, so the "not found" message is
                # reachable when no known gene made it into the ranking.
                if not ranked_dot_validation.empty:
                    print(f"\n--- TOP known genes GENES RANKED BY DOT PRODUCT WITH 'ALS' (Model: {MODEL_CHOICE.upper()}) ---")
                    # iterate the full ranking so `i` is the gene's overall position
                    for i, row in enumerate(ranked_dot_product.itertuples(), 1):
                        if row.gene.upper() in VALIDATION_GENES:
                            print(f"{i}. {row.gene:<10} | Dot with ALS: {row.dot_with_als:.4f}")
                else:
                    print("\nNo validation genes were found.")

Starting pipeline...

Existing data loaded from ./drive/MyDrive/IC_2025/als_articles_2010_3173_biased_updated.csv (3173 articles).
Processed data saved to './drive/MyDrive/IC_2025/als_articles_2010_3173_biased_updated.csv'.
Initializing fastText...
Loading pre-trained model from ./drive/MyDrive/IC_2025/fasttext_model_2010_3173_biased_updated.bin

--- TOP 20 NOVEL CANDIDATES (Model: FASTTEXT) ---
1. VAPA       | Sim: 0.7519)
2. NIPA1      | Sim: 0.7262)
3. BSCL2      | Sim: 0.6360)
4. ATF6       | Sim: 0.5952)
5. HSPB8      | Sim: 0.5705)
6. DBP        | Sim: 0.5595)
7. CPA1       | Sim: 0.5589)
8. HSPB1      | Sim: 0.5445)
9. EEA1       | Sim: 0.5414)
10. RCC1       | Sim: 0.5411)
11. KIFAP3     | Sim: 0.5406)
12. UCHL1      | Sim: 0.5295)
13. RHOA       | Sim: 0.5252)
14. RTN1       | Sim: 0.5251)
15. FUSE       | Sim: 0.5217)
16. PLEKHG5    | Sim: 0.5213)
17. DDX20      | Sim: 0.5119)
18. SQSTM1     | Sim: 0.4945)
19. CHMP2B     | Sim: 0.4877)
20. CHRNA3     | Sim: 0.4820)

Results w