In [16]:
# Resolve version conflicts
# Remove any preinstalled scispacy, thinc and spacy so the pinned versions
# below start from a clean slate (scispacy itself is not reinstalled here).
!pip uninstall -y scispacy thinc spacy
# Pin spaCy to 3.4.4.
!pip install spacy==3.4.4
# Install the en_core_sci_sm 0.5.1 model package from the scispacy release archive.
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz

import pandas as pd
import spacy
from collections import Counter

# Biomedical spaCy pipeline, loaded with its tagger and parser components disabled
nlp = spacy.load(
    "en_core_sci_sm",
    disable=["tagger", "parser"],
)

# Training data read from the Kaggle input directory
df = pd.read_csv(
    "/kaggle/input/dataset/train.csv",
)

def extract_medical_entities(texts, batch_size=1000):
    """Extract lower-cased entity strings from texts with the biomedical NER model.

    Every entity span reported by the module-level ``nlp`` pipeline is kept;
    no filtering by entity type or content is applied here.

    Args:
        texts: Iterable of documents (for example a pandas Series). Items that
            are not strings (``None``, NaN from missing CSV cells, numbers)
            and blank strings are skipped instead of being handed to spaCy.
        batch_size: Number of texts buffered per ``nlp.pipe`` batch.

    Returns:
        A list with one lower-cased string per entity occurrence, in document
        order. Duplicates are kept so the caller can count frequencies.
    """
    # spaCy rejects non-string input and a blank document cannot contain an
    # entity, so drop both before running the pipeline.
    documents = [
        text for text in texts
        if isinstance(text, str) and text.strip()
    ]
    if not documents:
        return []

    return [
        ent.text.lower()
        for doc in nlp.pipe(documents, batch_size=batch_size)
        for ent in doc.ents
    ]

# Split the texts by class label (1 = side effect, 0 = no effect)
side_effect_texts = df.loc[df['labels'] == 1, 'text']
no_effect_texts = df.loc[df['labels'] == 0, 'text']

# Run the biomedical NER over each class separately
side_effect_terms = extract_medical_entities(side_effect_texts)
no_effect_terms = extract_medical_entities(no_effect_texts)

# Tally how often each extracted term occurs within its class
side_effect_counts = Counter(side_effect_terms)
no_effect_counts = Counter(no_effect_terms)

# One frequency table per class, most frequent term first, tagged with its label
side_effect_df = pd.DataFrame(
    side_effect_counts.most_common(),
    columns=['medical_term', 'count'],
).assign(label='1 (Side Effect)')

no_effect_df = pd.DataFrame(
    no_effect_counts.most_common(),
    columns=['medical_term', 'count'],
).assign(label='0 (No Effect)')

# Combine and save. ignore_index=True gives the combined frame a fresh
# 0..n-1 index; otherwise both halves keep their own 0-based labels and
# label-based lookups on medical_df would match two rows each.
medical_df = pd.concat([side_effect_df, no_effect_df], ignore_index=True)
medical_df.to_csv("auto_medical_terms.csv", index=False)

# Print the top 10 terms of each class for verification
print("Top medical terms for Side Effect:")
print(side_effect_df.head(10))
print("\nTop medical terms for No Effect:")
print(no_effect_df.head(10))
print("\nMedical terms saved: auto_medical_terms.csv")


Found existing installation: scispacy 0.5.5
Uninstalling scispacy-0.5.5:
  Successfully uninstalled scispacy-0.5.5
Found existing installation: thinc 8.1.12
Uninstalling thinc-8.1.12:
  Successfully uninstalled thinc-8.1.12
Found existing installation: spacy 3.4.4
Uninstalling spacy-3.4.4:
  Successfully uninstalled spacy-3.4.4
Collecting spacy==3.4.4
  Using cached spacy-3.4.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (24 kB)
Collecting thinc<8.2.0,>=8.1.0 (from spacy==3.4.4)
  Using cached thinc-8.1.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Using cached spacy-3.4.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.4 MB)
Using cached thinc-8.1.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (917 kB)
Installing collected packages: thinc, spacy
Successfully installed spacy-3.4.4 thinc-8.1.12
Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz
  U

  


Top medical terms for Side Effect:
   medical_term  count            label
0      shingles   1165  1 (Side Effect)
1       vaccine   1126  1 (Side Effect)
2             i    970  1 (Side Effect)
3          shot    636  1 (Side Effect)
4          days    380  1 (Side Effect)
5          else    339  1 (Side Effect)
6          dose    297  1 (Side Effect)
7      symptoms    287  1 (Side Effect)
8  side effects    283  1 (Side Effect)
9      reaction    271  1 (Side Effect)

Top medical terms for No Effect:
  medical_term  count          label
0     shingles   1480  0 (No Effect)
1            i   1072  0 (No Effect)
2      vaccine    708  0 (No Effect)
3        years    222  0 (No Effect)
4         rash    213  0 (No Effect)
5         days    208  0 (No Effect)
6       months    208  0 (No Effect)
7          i'm    195  0 (No Effect)
8         shot    182  0 (No Effect)
9     vaccines    180  0 (No Effect)

Medical terms saved: auto_medical_terms.csv
