In [None]:
pip install firebase-admin

# Connect to Firebase Realtime Database

In [1]:
import firebase_admin
from firebase_admin import credentials, db

# Initialize Firebase Admin SDK with service account key
cred = credentials.Certificate('./serviceAccountKey.json')
firebase_admin.initialize_app(cred, {
    'databaseURL': 'https://pinellia-66b18-default-rtdb.firebaseio.com'
})

<firebase_admin.App at 0x1967d6095d0>

## Load herb descriptions from database

In [13]:
# Reference to the "herbs_desc" table in the database
herbs_desc_ref = db.reference('herbs_desc')

# Retrieve data from the "herbs_desc" table
herbs_desc_data = herbs_desc_ref.get()

# Initialize an empty Python array to store the retrieved data
herbs_desc_array = []

# Check if there's data in the "herbs_desc" table
if herbs_desc_data:
    # Iterate through the data and append it to the array
    for herb_id, herb_data in herbs_desc_data.items():
        herbs_desc_array.append(herb_data)

# Print the retrieved data
for herb in herbs_desc_array:
    print(herb)

{'description': 'This product is more shrunken, broken, and has a short handle. The complete leaves are oval-elliptic after flattening, pinnately deeply lobed, lobes elliptic-lanceolate, with irregular coarse serrations on the edges; the upper surface is gray-green or dark yellow-green, with sparse pilose and glandular points; the lower The surface is densely gray-white fluff. Soft texture. Gas delicate fragrance, bitter taste.1. Artemisia argyi: Remove impurities and stems, and sieve to remove dust. 2. Vinegar moxa charcoal: take clean mugwort leaves, put them in a pot, heat them with strong fire , fry until the surface is burnt black, spray vinegar, fry dry, take out and cool thoroughly. For every 100kg of mugwort leaves, use 15kg of vinegar. The finished product is scorched black irregular fragments with thin strip-like petioles visible , and has a vinegar aroma.Warming meridian for hemostasis, expelling cold for relieving pain; external use for dispelling dampness and relieving itc

# TF-IDF (Term Frequency - Inverse Document Frequency) Recommendation System

## Import Libraries

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import pandas as pd
from collections import Counter
import numpy as np  # Import numpy for handling NaN values

## Calculate TF-IDF scores for all description text

In [17]:
herb_desc = [herb['description'] for herb in herbs_desc_array]

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Calculate TF-IDF scores for the herb descriptions
tfidf_matrix = tfidf_vectorizer.fit_transform(herb_desc)
terms = tfidf_vectorizer.get_feature_names_out()

# Calculate cosine similarity
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Print the TF-IDF scores for the first herb
herb_index = 0
tfidf_scores_for_herb = dict(zip(terms, tfidf_matrix[herb_index].toarray()[0]))

# Sort the TF-IDF scores by value in descending order
sorted_tfidf_scores = sorted(tfidf_scores_for_herb.items(), key=lambda x: x[1], reverse=True)

# Print the sorted TF-IDF scores
for term, score in sorted_tfidf_scores:
    print(f"{term}: {score}")

and: 0.5112723840504344
mugwort: 0.40894374508015446
bleeding: 0.2218266194647045
the: 0.1859172305637943
cold: 0.1844279365373569
menstruation: 0.1584475853319318
moxa: 0.14313031077805408
to: 0.13943792292284574
pain: 0.135983407369629
dampness: 0.13152510319274627
leaves: 0.12926796680921823
itching: 0.12675806826554542
vinegar: 0.10652963424053506
for: 0.10623841746502533
used: 0.10623841746502533
effects: 0.1053715086369361
metrorrhagia: 0.10223593627003862
is: 0.09959851637346123
warm: 0.09703172041619415
can: 0.09295861528189715
in: 0.09295861528189715
it: 0.09295861528189715
has: 0.08631871419033307
of: 0.079678813098769
relieving: 0.07322183473749254
irregular: 0.07277379031214562
stop: 0.07181553711623236
abdominal: 0.0710197561603567
charcoal: 0.06337903413277271
relieve: 0.06276157263213647
menorrhagia: 0.06134156176202316
uterine: 0.06134156176202316
wormwood: 0.06134156176202316
be: 0.05654744390810698
on: 0.05654744390810698
remove: 0.05613024155484775
coagulation: 0.053

## Herb recommendation based on a single symptom

In [18]:
# Extract herb names and symptoms from the dataset
herb_desc = [herb['description'] for herb in herbs_desc_array]
herb_names = [herb['name'] for herb in herbs_desc_array]

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the vectorizer on the herb symptoms
tfidf_matrix = tfidf_vectorizer.fit_transform(herb_desc)

# User input on single symptom
user_input = "fever"

# Transform user input using the same vectorizer
user_tfidf = tfidf_vectorizer.transform([user_input])

# Calculate cosine similarities between user input and herb symptoms
cosine_similarities = linear_kernel(user_tfidf, tfidf_matrix)

# Get the indices of herbs sorted by similarity (higher is better)
recommended_herb_indices = list(reversed(cosine_similarities.argsort()[0]))

# Print the recommended herbs based on similarity
print("Recommended Herbs:")
for index in recommended_herb_indices:
    print(herb_names[index])

print(cosine_similarities)
print(recommended_herb_indices)

Recommended Herbs:
Cork-Tree Bark
Chinese Yam
Astragalus
Japanese Honeysuckle
Chinese Arisaema
Ginseng
Monkfruit
Tuber Fleeceflower
Wolfberry / Gojiberry
Licorice
Poria Cocos
Cordyceps
Codonopsis Root
Lily Bulbs
Mugwort
[[0.         0.         0.         0.         0.         0.
  0.         0.         0.05013557 0.01408089 0.01258081 0.
  0.         0.01526044 0.        ]]
[8, 13, 9, 10, 14, 12, 11, 7, 6, 5, 4, 3, 2, 1, 0]


## Calculate tf-idf scores for all herbs based on a list of predefined symptoms

In [19]:
# Extract herb names and descriptions
herb_names = [herb['name'] for herb in herbs_desc_array]
herb_desc = [herb['description'] for herb in herbs_desc_array]

# General list of symptoms
general_symptoms = ['abdominal pain', 'beauty', 'bleeding', 'blood', 'blood pressure', 'cancer', 
                    'cold', 'cough', 'constipation', 'cramps', 'dampness', 'diarrhea', 'epistaxis', 
                    'fatigue', 'fever', 'gastric', 'heat', 'insomnia', 'itching', 'menstruation', 
                    'mind', 'phlegm', 'qi', 'swelling', 'warm', 'yin']

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the vectorizer on herb descriptions
tfidf_matrix = tfidf_vectorizer.fit_transform(herb_desc)

# Create an empty DataFrame with columns for herbs and symptoms
tfidf_df = pd.DataFrame(columns=['Herb'] + general_symptoms)

# Set the 'Herb' column in the DataFrame
tfidf_df['Herb'] = herb_names

# Iterate through general symptoms and calculate TF-IDF scores for each herb
for symptom in general_symptoms:
    # Create a new TF-IDF vectorizer for the specific symptom
    symptom_vectorizer = TfidfVectorizer(vocabulary=tfidf_vectorizer.vocabulary_)
    
    # Transform herb descriptions for the specific symptom
    symptom_tfidf_matrix = symptom_vectorizer.fit_transform([text for text in herb_desc if symptom in text.lower()])
    
    # Initialize an array of NaN values for each herb
    herb_tfidf_scores = []

    # Iterate through herbs and calculate TF-IDF scores for the symptom
    for i, herb_name in enumerate(herb_names):
        # Check if the herb has the symptom in its description
        if symptom in herb_desc[i].lower():
            # Get the TF-IDF score for the symptom for the specific herb
            tfidf_score = symptom_tfidf_matrix[:, i].mean()
            
            # Append the TF-IDF score to the list
            herb_tfidf_scores.append(round(tfidf_score, 10))
            
        else:
            # If the herb doesn't have the symptom, append NaN and 0 word count
            herb_tfidf_scores.append(np.nan)
    
    # Add the TF-IDF scores to the DataFrame
    tfidf_df[symptom] = herb_tfidf_scores

pd.set_option('display.max_columns', None)
num_rows, num_columns = tfidf_df.shape
print(f"{num_rows} rows x {num_columns} columns ")

# Print the DataFrame with symptom-herb TF-IDF scores
tfidf_df


15 rows x 27 columns 


Unnamed: 0,Herb,abdominal pain,beauty,bleeding,blood,blood pressure,cancer,cold,cough,constipation,cramps,dampness,diarrhea,epistaxis,fatigue,fever,gastric,heat,insomnia,itching,menstruation,mind,phlegm,qi,swelling,warm,yin
0,Mugwort,0.006407,,0.0,0.009995,,,0.010307,0.00805,,0.006407,0.012959,0.012155,0.0,,,,0.008737,,0.004324,0.004907,,0.005586,0.008159,,0.011298,
1,Lily Bulbs,,0.006852,,0.004136,,0.003304,0.003485,0.004858,0.00317,,,,,0.005783,,0.005517,0.002418,0.0,,,0.002684,,0.004867,,,0.003891
2,Codonopsis Root,,,,0.001162,0.002166,,0.001235,0.001404,,,,0.0,,0.0,,0.0,0.001233,,,,0.0,,0.0,,,
3,Cordyceps,,,0.004665,0.002905,0.0,0.002959,0.003065,0.003405,,,,,,0.002224,,,0.003062,,,,,0.004418,0.002593,,,0.002314
4,Poria Cocos,,,,0.001301,,,0.001378,0.0,,,0.0,0.0,,0.0,,,0.001376,0.003526,,,0.0,0.0,0.001556,,,0.001465
5,Licorice,0.0,,,0.001162,,,0.001235,0.001404,,0.0,,0.0,,,,0.0,0.001233,,,0.0,,0.002475,0.0,,0.0,0.001293
6,Wolfberry / Gojiberry,,,,0.002087,0.002166,0.002821,0.00221,0.002478,,,,,,,,,0.002205,,,,0.0,,0.0,,,0.002309
7,Tuber Fleeceflower,,,,0.003826,,,0.004021,,0.007189,,,,,,,,0.004026,0.008274,0.009432,,,,0.004475,,0.005269,0.004249
8,Cork-Tree Bark,,,0.0,0.006576,0.006805,,0.006376,,,,0.005335,0.005925,,0.007777,0.013229,,0.004563,,0.0,,,,0.006089,0.01297,,0.006846
9,Astragalus,,0.0,,0.001301,,,0.001378,0.0,,,,,,0.0,0.0,,,,,,,,0.001556,0.0,0.002635,0.001465


## Calculate word count of each predefined symptoms for every herbs

In [20]:
# Create a dictionary to store symptom word counts for each herb
symptom_word_counts = {herb_name: {symptom: 0 for symptom in general_symptoms} for herb_name in herb_names}

# Iterate through general symptoms and calculate TF-IDF scores for each herb
for symptom in general_symptoms:
    
    # Initialize an array for the word counts of this symptom in each herb
    symptom_word_counts_for_herb = [0] * len(herb_names)
    
    # Iterate through herbs and calculate TF-IDF scores for the symptom
    for i, herb_name in enumerate(herb_names):
        # Check if the herb has the symptom in its description
        if symptom in herb_desc[i].lower():

            # Update the word count for the symptom word within the herb's description
            symptom_word_counts[herb_name][symptom] = herb_desc[i].lower().count(symptom)

# Create a DataFrame for word counts
word_counts_df = pd.DataFrame.from_dict(symptom_word_counts, orient='index')

# Print the word counts table
print("\nWord Counts:")
print(word_counts_df)



Word Counts:
                       abdominal pain  beauty  bleeding  blood  \
Mugwort                             4       0        14      6   
Lily Bulbs                          0       6         0      3   
Codonopsis Root                     0       0         0     19   
Cordyceps                           0       0         4     14   
Poria Cocos                         0       0         0      2   
Licorice                            1       0         0      2   
Wolfberry / Gojiberry               0       0         0      5   
Tuber Fleeceflower                  0       0         0     17   
Cork-Tree Bark                      0       0         2      3   
Astragalus                          0       1         0      9   
Japanese Honeysuckle                0       1         0     10   
Monkfruit                           0       1         0      7   
Ginseng                             0       0         0      6   
Chinese Yam                         0       0         0     10

## Herb recommendation based on multiple symptoms

In [21]:
# User input on multiple symptoms
user_input = "yin fever cough bleeding"

# Split the user input into individual symptoms
user_symptoms = user_input.split()

# Initialize a dictionary to store the combined TF-IDF scores
combined_scores = {herb_name: 0 for herb_name in herb_names}

# Iterate through herbs and calculate the combined TF-IDF score for each herb
for herb_name in herb_names:
    # Initialize the combined score for this herb
    herb_combined_score = 0
    
    # Iterate through user symptoms
    for symptom in user_symptoms:
        # Check if the symptom is in the general symptoms list
        if symptom in general_symptoms:
            # Get the TF-IDF score for this symptom for the specific herb
            symptom_score = tfidf_df.loc[tfidf_df['Herb'] == herb_name][symptom].values[0]
            
            # If the symptom_score is NaN, set it to 0
            if np.isnan(symptom_score):
                symptom_score = 0
            
            # Add the symptom's TF-IDF score to the combined score for this herb
            herb_combined_score += symptom_score
    
    # Store the combined score for this herb in the dictionary
    combined_scores[herb_name] = herb_combined_score

# Sort the herbs by their combined scores in descending order
sorted_combined_scores = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)

# Print the herbs and their combined scores
print("Combined Scores for User Input:", user_input)
for herb, score in sorted_combined_scores:
    print(f"{herb}: {score}")


Combined Scores for User Input: yin fever cough bleeding
Cork-Tree Bark: 0.020074741899999998
Cordyceps: 0.010383581499999999
Lily Bulbs: 0.0087493495
Mugwort: 0.0080502282
Wolfberry / Gojiberry: 0.0047874228
Tuber Fleeceflower: 0.0042488606
Chinese Arisaema: 0.0033330412
Japanese Honeysuckle: 0.0028477576
Licorice: 0.0026968772
Chinese Yam: 0.0026968772
Ginseng: 0.0021631404
Monkfruit: 0.0016070577
Poria Cocos: 0.0014653176
Astragalus: 0.0014653176
Codonopsis Root: 0.0014041298


## Check the relevant symptoms for each recommended herb

In [22]:
# Print the herbs, their combined scores, and relevant symptoms
print("Combined Scores for User Input:", user_input)
for herb, score in sorted_combined_scores:
    symptoms = [symptom for symptom in user_symptoms if tfidf_df.loc[tfidf_df['Herb'] == herb][symptom].values[0] > 0]
    
    if symptoms:
        print(f"{herb}: Score: {score}, Relevant Symptoms: {', '.join(symptoms)}")
    else:
        print(f"{herb}: Score: {score}, No relevant symptoms from the user input.")


Combined Scores for User Input: yin fever cough bleeding
Cork-Tree Bark: Score: 0.020074741899999998, Relevant Symptoms: yin, fever
Cordyceps: Score: 0.010383581499999999, Relevant Symptoms: yin, cough, bleeding
Lily Bulbs: Score: 0.0087493495, Relevant Symptoms: yin, cough
Mugwort: Score: 0.0080502282, Relevant Symptoms: cough
Wolfberry / Gojiberry: Score: 0.0047874228, Relevant Symptoms: yin, cough
Tuber Fleeceflower: Score: 0.0042488606, Relevant Symptoms: yin
Chinese Arisaema: Score: 0.0033330412, Relevant Symptoms: yin, cough
Japanese Honeysuckle: Score: 0.0028477576, Relevant Symptoms: yin
Licorice: Score: 0.0026968772, Relevant Symptoms: yin, cough
Chinese Yam: Score: 0.0026968772, Relevant Symptoms: yin, cough
Ginseng: Score: 0.0021631404, Relevant Symptoms: yin, cough
Monkfruit: Score: 0.0016070577, Relevant Symptoms: cough
Poria Cocos: Score: 0.0014653176, Relevant Symptoms: yin
Astragalus: Score: 0.0014653176, Relevant Symptoms: yin
Codonopsis Root: Score: 0.0014041298, Rele

## Upload the tf-idf data frame to database

In [23]:
# Convert the Pandas DataFrame to a dictionary with NaN values replaced by None
tfidf_dict = tfidf_df.iloc[:, 1:].where(pd.notna(tfidf_df.iloc[:, 1:]), 0).to_dict(orient='list')

# Reference to the "tfidf_data" table in the database
tfidf_ref = db.reference('tfidf_data')

# Set the tfidf_data in Firebase
tfidf_ref.set(tfidf_dict)

print("TF-IDF data (without herb names) has been saved to Firebase.")

# Extract herb names from the first column
herb_names = tfidf_df['Herb'].to_list()

# Reference to the "herb_names" table in the database
herb_names_ref = db.reference('herb_names')

# Set herb names in Firebase
herb_names_ref.set(herb_names)

print("Herb names have been saved to Firebase.")


TF-IDF data (without herb names) has been saved to Firebase.
Herb names have been saved to Firebase.


# Extras

## Calculate overall tf-idf scores for each predefined symptom

In [24]:
# Extract herb names and descriptions
herb_names = [herb['name'] for herb in herbs_desc_array]
herb_desc = [herb['description'] for herb in herbs_desc_array]

# General list of symptoms
general_symptoms = ['abdominal pain', 'beauty', 'bleeding', 'blood pressure', 'cancer', 'cold', 
                    'cooling', 'cough', 'cramps', 'dampness', 'diarrhea', 'dysmenorrhea', 'epistaxis', 
                    'fatigue', 'fever', 'gastric', 'heat', 'insomnia', 'itching', 'blood', 'phlegm', 
                    'qi', 'swelling', 'warm', 'yin']

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the vectorizer on herb descriptions
tfidf_matrix = tfidf_vectorizer.fit_transform(herb_desc)

# Initialize a dictionary to store TF-IDF scores for each symptom
symptom_tfidf_scores = {symptom: [] for symptom in general_symptoms}

# Iterate through general symptoms and calculate TF-IDF scores for each
for symptom in general_symptoms:
    # Create a new TF-IDF vectorizer for the specific symptom
    symptom_vectorizer = TfidfVectorizer(vocabulary=tfidf_vectorizer.vocabulary_)
    
    # Transform herb descriptions for the specific symptom
    symptom_tfidf_matrix = symptom_vectorizer.fit_transform([text for text in herb_desc if symptom in text.lower()])
    
    # Get the mean TF-IDF score for the symptom
    mean_tfidf_score = symptom_tfidf_matrix.mean()
    
    # Store the mean TF-IDF score for the symptom
    symptom_tfidf_scores[symptom] = mean_tfidf_score

# Print TF-IDF scores for each symptom
for symptom, tfidf_score in symptom_tfidf_scores.items():
    print(f"TF-IDF score for '{symptom}': {tfidf_score}")


TF-IDF score for 'abdominal pain': 0.004054893892076072
TF-IDF score for 'beauty': 0.00405662979337776
TF-IDF score for 'bleeding': 0.003986957437451437
TF-IDF score for 'blood pressure': 0.004586555239701282
TF-IDF score for 'cancer': 0.004318650395192957
TF-IDF score for 'cold': 0.004559445202172089
TF-IDF score for 'cooling': 0.003962234567772764
TF-IDF score for 'cough': 0.004632651706305606
TF-IDF score for 'cramps': 0.004054893892076072
TF-IDF score for 'dampness': 0.004168030062456483
TF-IDF score for 'diarrhea': 0.004693197598105292
TF-IDF score for 'dysmenorrhea': 0.004054893892076072
TF-IDF score for 'epistaxis': 0.004136791278404874
TF-IDF score for 'fatigue': 0.0042883988061070705
TF-IDF score for 'fever': 0.004354971826528949
TF-IDF score for 'gastric': 0.004480469401218013
TF-IDF score for 'heat': 0.004635446467434132
TF-IDF score for 'insomnia': 0.003932355007636429
TF-IDF score for 'itching': 0.0038845314244034153
TF-IDF score for 'blood': 0.004635903932377705
TF-IDF sc