In [1]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: three short clinical notes, one string per document
medical_transcripts = [
    "Patient presented with symptoms of cough and shortness of breath.",
    "Physical examination revealed elevated temperature and wheezing.",
    "Diagnosis confirmed as bronchitis, prescribed antibiotics and inhaler.",
]

# Bag-of-words vectorizer with default settings; fit_transform learns the
# vocabulary and returns one row of raw term counts per document
vectorizer = CountVectorizer()
tf_matrix = vectorizer.fit_transform(medical_transcripts)

# Vocabulary terms learned by the vectorizer (labels for the matrix columns)
feature_names = vectorizer.get_feature_names_out()

# Show the dense count matrix first, then the terms, each under its heading
print("Term Frequency Matrix:", tf_matrix.toarray(), sep="\n")
print("\nFeature Names:", feature_names, sep="\n")


Term Frequency Matrix:
[[1 0 0 1 0 0 1 0 0 0 0 2 1 0 0 1 0 1 1 0 0 1]
 [1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 1 0 0 1 1 0]
 [1 1 1 0 1 1 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0]]

Feature Names:
['and' 'antibiotics' 'as' 'breath' 'bronchitis' 'confirmed' 'cough'
 'diagnosis' 'elevated' 'examination' 'inhaler' 'of' 'patient' 'physical'
 'prescribed' 'presented' 'revealed' 'shortness' 'symptoms' 'temperature'
 'wheezing' 'with']


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Three example clinical notes that make up the corpus
medical_transcripts = [
    "Patient presented with symptoms of cough and shortness of breath.",
    "Physical examination revealed elevated temperature and wheezing.",
    "Diagnosis confirmed as bronchitis, prescribed antibiotics and inhaler.",
]

# Default TF-IDF vectorizer: fit on the corpus and build the weighted
# document-term matrix in a single call
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(medical_transcripts)

# Terms corresponding to the matrix columns
feature_names = vectorizer.get_feature_names_out()

# Print each result under its heading: dense TF-IDF weights, then the terms
for heading, values in (
    ("TF-IDF Matrix:", tfidf_matrix.toarray()),
    ("\nFeature Names:", feature_names),
):
    print(heading)
    print(values)


TF-IDF Matrix:
[[0.17531933 0.         0.         0.29684142 0.         0.
  0.29684142 0.         0.         0.         0.         0.59368285
  0.29684142 0.         0.         0.29684142 0.         0.29684142
  0.29684142 0.         0.         0.29684142]
 [0.2344005  0.         0.         0.         0.         0.
  0.         0.         0.39687454 0.39687454 0.         0.
  0.         0.39687454 0.         0.         0.39687454 0.
  0.         0.39687454 0.39687454 0.        ]
 [0.21786941 0.36888498 0.36888498 0.         0.36888498 0.36888498
  0.         0.36888498 0.         0.         0.36888498 0.
  0.         0.         0.36888498 0.         0.         0.
  0.         0.         0.         0.        ]]

Feature Names:
['and' 'antibiotics' 'as' 'breath' 'bronchitis' 'confirmed' 'cough'
 'diagnosis' 'elevated' 'examination' 'inhaler' 'of' 'patient' 'physical'
 'prescribed' 'presented' 'revealed' 'shortness' 'symptoms' 'temperature'
 'wheezing' 'with']


In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Corpus of three short clinical notes
medical_transcripts = [
    "Patient presented with symptoms of cough and shortness of breath.",
    "Physical examination revealed elevated temperature and wheezing.",
    "Diagnosis confirmed as bronchitis, prescribed antibiotics and inhaler.",
]

# binary=True records presence/absence of a term instead of its count
vectorizer = CountVectorizer(binary=True)
binary_matrix = vectorizer.fit_transform(medical_transcripts)

# Terms corresponding to the matrix columns
feature_names = vectorizer.get_feature_names_out()

# Densify once for display, then print the matrix and the terms
dense_binary = binary_matrix.toarray()
print("Binary Matrix:")
print(dense_binary)
print("\nFeature Names:", feature_names, sep="\n")


Binary Matrix:
[[1 0 0 1 0 0 1 0 0 0 0 1 1 0 0 1 0 1 1 0 0 1]
 [1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 1 0 0 1 1 0]
 [1 1 1 0 1 1 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0]]

Feature Names:
['and' 'antibiotics' 'as' 'breath' 'bronchitis' 'confirmed' 'cough'
 'diagnosis' 'elevated' 'examination' 'inhaler' 'of' 'patient' 'physical'
 'prescribed' 'presented' 'revealed' 'shortness' 'symptoms' 'temperature'
 'wheezing' 'with']


In [3]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Corpus of three short clinical notes
medical_transcripts = [
    "Patient presented with symptoms of cough and shortness of breath.",
    "Physical examination revealed elevated temperature and wheezing.",
    "Diagnosis confirmed as bronchitis, prescribed antibiotics and inhaler.",
]

# Raw term counts per document
vectorizer = CountVectorizer()
tf_matrix = vectorizer.fit_transform(medical_transcripts)

# Terms corresponding to the matrix columns
feature_names = vectorizer.get_feature_names_out()

# Dampen the counts with log(1 + tf); a count of zero stays exactly zero
raw_counts = tf_matrix.toarray()
log_tf_matrix = np.log1p(raw_counts)

# Report the damped matrix, then the terms
print("Logarithmic TF Matrix:", log_tf_matrix, sep="\n")
print("\nFeature Names:", feature_names, sep="\n")


Logarithmic TF Matrix:
[[0.69314718 0.         0.         0.69314718 0.         0.
  0.69314718 0.         0.         0.         0.         1.09861229
  0.69314718 0.         0.         0.69314718 0.         0.69314718
  0.69314718 0.         0.         0.69314718]
 [0.69314718 0.         0.         0.         0.         0.
  0.         0.         0.69314718 0.69314718 0.         0.
  0.         0.69314718 0.         0.         0.69314718 0.
  0.         0.69314718 0.69314718 0.        ]
 [0.69314718 0.69314718 0.69314718 0.         0.69314718 0.69314718
  0.         0.69314718 0.         0.         0.69314718 0.
  0.         0.         0.69314718 0.         0.         0.
  0.         0.         0.         0.        ]]

Feature Names:
['and' 'antibiotics' 'as' 'breath' 'bronchitis' 'confirmed' 'cough'
 'diagnosis' 'elevated' 'examination' 'inhaler' 'of' 'patient' 'physical'
 'prescribed' 'presented' 'revealed' 'shortness' 'symptoms' 'temperature'
 'wheezing' 'with']


In [8]:
from rank_bm25 import BM25Okapi

import re

# Rank three example transcripts against a free-text query with Okapi BM25.

# Example medical transcripts data
medical_transcripts = [
    "Patient presented with symptoms of cough and shortness of breath.",
    "Physical examination revealed elevated temperature and wheezing.",
    "Diagnosis confirmed as bronchitis, prescribed antibiotics and inhaler."
]

# Example query
query = "Patient presented with cough symptoms and difficulty breathing."

# BM25Okapi is handed pre-tokenized documents and scores exact token matches.
# Plain str.split() leaves case and punctuation attached to the tokens
# ("breath.", "bronchitis,", "Patient"), so such a token only matches a query
# word written with the very same casing and punctuation. Lowercase the text
# and keep runs of word characters instead, using the same rule for the
# corpus and for the query.
tokenized_transcripts = [
    re.findall(r"\w+", transcript.lower()) for transcript in medical_transcripts
]
tokenized_query = re.findall(r"\w+", query.lower())

# Build the index over the corpus and score every transcript against the query
bm25 = BM25Okapi(tokenized_transcripts)
bm25_scores = bm25.get_scores(tokenized_query)

# Print BM25 scores for each transcript (numbered from 1, in corpus order)
print("BM25 Scores:")
for i, score in enumerate(bm25_scores, 1):
    print(f"Transcript {i}: {score}")


BM25 Scores:
Transcript 1: 2.4347863053280485
Transcript 2: 0.10753120040691722
Transcript 3: 0.10161807940694419


In [4]:
!pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2



[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import CountVectorizer

import re

# Score three sample transcripts against a short keyword query with Okapi BM25.

# Sample medical transcripts
medical_transcripts = [
    "Patient presented with symptoms of cough and shortness of breath.",
    "Physical examination revealed elevated temperature and wheezing.",
    "Diagnosis confirmed as bronchitis, prescribed antibiotics and inhaler."
]

# Define query
query = "symptoms of cough"

# BM25Okapi receives pre-tokenized text and matches tokens exactly, so the
# tokenizer must normalize them. str.split() would keep capitalization and
# trailing punctuation ("Patient", "breath.", "bronchitis,"); lowercase and
# extract runs of word characters instead, identically for corpus and query.
tokenized_medical_transcripts = [
    re.findall(r"\w+", doc.lower()) for doc in medical_transcripts
]
tokenized_query = re.findall(r"\w+", query.lower())

# Initialize BM25Okapi model over the tokenized corpus
bm25 = BM25Okapi(tokenized_medical_transcripts)

# Get BM25 scores for the query (one score per transcript, in corpus order)
bm25_scores = bm25.get_scores(tokenized_query)

# Print BM25 scores
print("BM25 Scores:")
print(bm25_scores)


ModuleNotFoundError: No module named 'rank_bm25'

In [6]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Compute a per-term score from corpus-level term and document frequencies.

# Sample medical transcripts
medical_transcripts = [
    "Patient presented with symptoms of cough and shortness of breath.",
    "Physical examination revealed elevated temperature and wheezing.",
    "Diagnosis confirmed as bronchitis, prescribed antibiotics and inhaler."
]

# Initialize CountVectorizer to convert text into term frequency matrix
vectorizer = CountVectorizer()

# Fit and transform the medical transcripts
tf_matrix = vectorizer.fit_transform(medical_transcripts)
feature_names = vectorizer.get_feature_names_out()

# Corpus statistics, one entry per vocabulary term
total_docs = len(medical_transcripts)
# total occurrences of each term summed over all documents
term_frequency = np.asarray(tf_matrix.sum(axis=0)).ravel()
# number of documents in which each term occurs at least once
doc_frequency = np.asarray((tf_matrix > 0).sum(axis=0)).ravel()

# Score = log2((occurrences / total_docs) / (containing docs / total_docs)),
# computed for all terms at once; terms with no containing document keep 0.
# NOTE(review): total_docs cancels out of this ratio, so the value is
# log2(occurrences per containing document) rather than a conventional
# pointwise mutual information between two events -- confirm the intended
# definition before relying on these numbers.
pmi_array = np.zeros(term_frequency.shape[0])
has_docs = doc_frequency > 0
pmi_array[has_docs] = np.log2(
    (term_frequency[has_docs] / total_docs) / (doc_frequency[has_docs] / total_docs)
)
# plain Python floats, in the same order as feature_names
pmi_scores = pmi_array.tolist()

# Print PMI scores
print("PMI Scores:")
print(pmi_scores)

# Print the feature names once so each score can be matched to its term
print("\nFeature Names:")
print(feature_names)


and
antibiotics
as
breath
bronchitis
confirmed
cough
diagnosis
elevated
examination
inhaler
of
patient
physical
prescribed
presented
revealed
shortness
symptoms
temperature
wheezing
with
PMI Scores:
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [12]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from scipy.stats import chi2_contingency

# Compute one chi-square statistic per vocabulary term from a 2x2 table
# built out of corpus-level counts.

# Sample medical transcripts
medical_transcripts = [
    "Patient presented with symptoms of cough and shortness of breath.",
    "Physical examination revealed elevated temperature and wheezing.",
    "Diagnosis confirmed as bronchitis, prescribed antibiotics and inhaler."
]

# Initialize CountVectorizer to convert text into term frequency matrix
vectorizer = CountVectorizer()

# Fit and transform the medical transcripts
tf_matrix = vectorizer.fit_transform(medical_transcripts)
feature_names = vectorizer.get_feature_names_out()

# Per-term totals, computed once instead of slicing the sparse matrix
# column by column inside the loop
# total occurrences of each term summed over all documents
term_frequency = np.asarray(tf_matrix.sum(axis=0)).ravel()
# number of documents in which each term occurs at least once
doc_frequency = np.asarray((tf_matrix > 0).sum(axis=0)).ravel()
# matrix shape: (number of documents, number of distinct terms)
n_docs, n_terms = tf_matrix.shape

# Calculate Chi-Square scores
chi_square_scores = []
for term_occurrences, doc_occurrences in zip(term_frequency, doc_frequency):
    doc_not_occurrences = n_docs - doc_occurrences
    # NOTE(review): this subtracts a token count from the vocabulary size, so
    # the entry becomes negative as soon as a term occurs more often than
    # there are distinct terms, and the two columns count different things
    # (tokens vs. documents). Confirm the intended contingency table.
    term_not_occurrences = n_terms - term_occurrences

    observed = np.array([[term_occurrences, doc_occurrences],
                         [term_not_occurrences, doc_not_occurrences]])
    # first element of the result is the chi-square statistic
    chi_square_scores.append(float(chi2_contingency(observed)[0]))

# Print Chi-Square scores
print("Chi-Square Scores:")
print(chi_square_scores)

# Print the feature names once so each score can be matched to its term
print("\nFeature Names:")
print(feature_names)


and
antibiotics
as
breath
bronchitis
confirmed
cough
diagnosis
elevated
examination
inhaler
of
patient
physical
prescribed
presented
revealed
shortness
symptoms
temperature
wheezing
with
Chi-Square Scores:
[6.5797780435938344, 0.3479084321475626, 0.3479084321475626, 0.3479084321475626, 0.3479084321475626, 0.3479084321475626, 0.3479084321475626, 0.3479084321475626, 0.3479084321475626, 0.3479084321475626, 0.3479084321475626, 0.0703053259871442, 0.3479084321475626, 0.3479084321475626, 0.3479084321475626, 0.3479084321475626, 0.3479084321475626, 0.3479084321475626, 0.3479084321475626, 0.3479084321475626, 0.3479084321475626, 0.3479084321475626]


In [7]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import mutual_info_classif

# Corpus of three short clinical notes
medical_transcripts = [
    "Patient presented with symptoms of cough and shortness of breath.",
    "Physical examination revealed elevated temperature and wheezing.",
    "Diagnosis confirmed as bronchitis, prescribed antibiotics and inhaler.",
]

# Raw term counts per document
vectorizer = CountVectorizer()
tf_matrix = vectorizer.fit_transform(medical_transcripts)

# Mutual information between each term's count and the target labels;
# the labels [0, 1, 2] put every transcript in a class of its own, and the
# counts are treated as discrete features
information_gain_scores = mutual_info_classif(
    tf_matrix,
    [0, 1, 2],
    discrete_features=True,
)

# One score per vocabulary term
print("Information Gain Scores:", information_gain_scores, sep="\n")


Information Gain Scores:
[0.         0.63651417 0.63651417 0.63651417 0.63651417 0.63651417
 0.63651417 0.63651417 0.63651417 0.63651417 0.63651417 0.63651417
 0.63651417 0.63651417 0.63651417 0.63651417 0.63651417 0.63651417
 0.63651417 0.63651417 0.63651417 0.63651417]
