In [None]:
pip install pyspark




In [None]:
import io

import pandas as pd

In [None]:
# Load the disease/symptom table into pandas for a quick look at its contents.
file_path = r'/content/dataset.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# DataFrame.info() prints its report and returns None, so assigning its result
# leaves `data_info` empty. Capture the report in a text buffer instead so the
# variable holds the summary, then echo it to keep the cell output.
_info_buffer = io.StringIO()
data.info(buf=_info_buffer)
data_info = _info_buffer.getvalue()
print(data_info)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   disease     99 non-null     object
 1   symptoms    99 non-null     object
 2   cures       99 non-null     object
 3   doctor      99 non-null     object
 4   risk level  99 non-null     object
dtypes: object(5)
memory usage: 4.0+ KB


In [None]:
# Keep the first five rows around as a small preview of the table.
data_head = data.head(n=5)

In [None]:
# Display the schema summary and the row preview together as one tuple.
data_info, data_head

(None,
         disease                                           symptoms  \
 0           flu  fever,cough,sore throat,runny or stuffy nose,m...   
 1    bronchitis  cough,mucus production,shortness of breath,che...   
 2     pneumonia  fever,cough,shortness of breath,chest pain,fat...   
 3  heart attack  chest pain,shortness of breath,nausea,vomiting...   
 4        stroke  sudden weakness,numbness on one side of the bo...   
 
                                                cures  \
 0           over-the-counter medications,rest,fluids   
 1  antibiotics,over-the-counter medications,rest,...   
 2  antibiotics,over-the-counter medications,rest,...   
 3                         emergency medical services   
 4                         emergency medical services   
 
                         doctor     risk level  
 0    family doctor,urgent care      low (0.1%  
 1  family doctor,pulmonologist      low (0.5%  
 2  family doctor,pulmonologist  moderate (1%)  
 3                 cardio

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, col, regexp_replace
from pyspark.ml.feature import CountVectorizer, StringIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Start a Spark session for this notebook, or attach to one that is already running.
spark = (
    SparkSession.builder
    .appName("Disease Prediction")
    .getOrCreate()
)

KeyboardInterrupt: 

In [None]:
# Re-read the same CSV as a Spark DataFrame; from here on `data` is the Spark
# frame, not the pandas one used for the preview above.
data_path = "/content/dataset.csv"
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(data_path)
)


In [None]:
# Drop every character that is not a letter, comma or space, then break the
# cleaned text on commas so each row carries an array of symptom phrases.
symptoms_letters_only = regexp_replace("symptoms", "[^a-zA-Z, ]", "")
data = data.withColumn("symptoms", split(symptoms_letters_only, ","))

In [None]:
# Learn a disease-name -> numeric index mapping from the full table and attach
# it as the `label` column the classifier trains against.
disease_indexer = StringIndexer(inputCol="disease", outputCol="label")
label_indexer = disease_indexer.fit(data)
data = label_indexer.transform(data)

In [None]:
# Bag-of-symptoms encoder: turns the symptom array into a term-count vector.
vectorizer = CountVectorizer(
    inputCol="symptoms",
    outputCol="features",
)

In [None]:
# Persist the (still unfitted) CountVectorizer configuration. A plain save()
# raises once the target directory exists, so go through the writer with
# overwrite() to keep the cell re-runnable.
vectorizer.write().overwrite().save('/content/saved_vectorizer1')

In [None]:
# 80/20 train/test partition; the fixed seed keeps the split repeatable.
split_weights = [0.8, 0.2]
train_data, test_data = data.randomSplit(split_weights, seed=42)


In [None]:
# Random forest of 50 trees over the symptom count vectors.
rf_classifier = RandomForestClassifier(
    featuresCol="features",
    labelCol="label",
    numTrees=50,
)

In [None]:
# Vectorise the symptoms first, then classify.
pipeline_stages = [vectorizer, rf_classifier]
pipeline = Pipeline(stages=pipeline_stages)

In [None]:
# Fit every pipeline stage on the training partition only.
model = pipeline.fit(dataset=train_data)

In [None]:
# Score the held-out rows; the fitted model appends its prediction columns.
predictions = model.transform(dataset=test_data)

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, Tokenizer, CountVectorizer, RegexTokenizer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.sql import SparkSession
from pyspark.sql.functions import trim, regexp_replace

# Second, self-contained pipeline that accepts the raw `symptoms` string, so
# the saved model can be fed plain text at prediction time.
# NOTE: this cell rebinds `spark`, `vectorizer`, `pipeline`, `train_data`,
# `test_data` and `model`; cells below see these new objects.

# Reuse the running Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder \
    .appName("Disease Prediction") \
    .getOrCreate()
data_path = "/content/dataset.csv"
df = spark.read.csv(data_path, header=True, inferSchema=True)

# Clean the 'disease' column by removing special characters and trimming whitespace
df = df.withColumn("disease", trim(regexp_replace(df["disease"], "[^a-zA-Z0-9\\s]", "")))

# Remove rows with null values in 'disease' column
df = df.filter(df.disease.isNotNull())

# Ensure 'disease' column is a string
df = df.withColumn("disease", df["disease"].cast("string"))

# Split the symptoms text on commas (swallowing surrounding whitespace).
# The plain Tokenizer splits on whitespace only, which turns
# "fever,cough,sore throat" into glued tokens such as "fever,cough,sore" that
# never match what a user types. RegexTokenizer lower-cases by default, as
# Tokenizer did.
tokenizer = RegexTokenizer(
    inputCol="symptoms",
    outputCol="symptoms_tokens",
    pattern=r"\s*,\s*",
)

# Convert symptoms into a vector using CountVectorizer
vectorizer = CountVectorizer(inputCol="symptoms_tokens", outputCol="features")

# Index the 'disease' column to create labels
indexer = StringIndexer(inputCol="disease", outputCol="label")

# RandomForestClassifier for classification
rf = RandomForestClassifier(featuresCol="features", labelCol="label")

# Create a pipeline with the above stages
pipeline = Pipeline(stages=[indexer, tokenizer, vectorizer, rf])

# Split data into training and testing sets
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

# Fit the pipeline to the training data
model = pipeline.fit(train_data)

# Save the whole fitted pipeline. overwrite() keeps the cell re-runnable:
# a plain save() raises when the target directory already exists.
model.write().overwrite().save("/content/model1")


In [None]:
# Fraction of rows in `predictions` whose predicted label equals the true label.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)

In [None]:
# Report the score rounded to two decimals.
accuracy_message = f"Model Accuracy: {accuracy:.2f}"
print(accuracy_message)


In [None]:
# Persist the fitted pipeline model. A plain save() raises once the directory
# exists, so use the writer with overwrite() to keep this cell re-runnable.
model.write().overwrite().save("/content/saved_model1")

In [None]:
# Shut down the notebook's Spark session now that training and saving are done.
spark.stop()

In [None]:
# `subprocess` is part of the Python standard library, so there is nothing to
# install; `pip install subprocess` always fails with "No matching distribution".

[31mERROR: Could not find a version that satisfies the requirement subprocess (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for subprocess[0m[31m
[0m

In [None]:
!pip install streamlit



In [None]:

def write_streamlit_app(app_path='pyspark_app.py', model_uri='file:///content/saved_model1'):
    """Write the Streamlit front-end script that serves the saved pipeline model.

    Args:
        app_path: File to create or overwrite with the generated app source.
            Defaults to 'pyspark_app.py' in the current working directory.
        model_uri: Location passed to ``PipelineModel.load`` inside the app.
            The default points at the directory this notebook saves the fitted
            pipeline to; the previous hard-coded 'saved_model2' is never
            written by the notebook.

    Returns:
        None. The only effect is the file written at ``app_path``.
    """
    app_source = """
import streamlit as st
from pyspark.sql import SparkSession
from pyspark.ml import PipelineModel


@st.cache_resource
def load_spark_and_model():
    # Streamlit re-executes this script on every interaction, so build the
    # Spark session and load the model once and reuse them afterwards.
    spark = SparkSession.builder.master("local[*]").appName("ModelAPI").getOrCreate()
    model = PipelineModel.load(__MODEL_URI__)
    return spark, model


spark, model = load_spark_and_model()

# Streamlit UI
st.title("Disease Prediction Model")

# Input box for symptoms
symptoms_input = st.text_area("Enter Symptoms", "")

# Predict button
if st.button("Predict Disease"):
    symptoms_text = symptoms_input.strip()
    if symptoms_text:
        # One-row DataFrame with an explicit column name (no dict schema inference)
        df = spark.createDataFrame([(symptoms_text,)], ["symptoms"])

        # Make prediction using the model
        predictions = model.transform(df)

        # Get the predicted disease (output is an index from StringIndexer)
        prediction = predictions.select("prediction").first()[0]

        # Map the index back to the disease name using the StringIndexer model
        indexer_model = model.stages[0]  # StringIndexer is the first stage in the pipeline
        disease_labels = indexer_model.labels
        predicted_disease = disease_labels[int(prediction)]

        # Display the prediction
        st.write(f"Predicted Disease: {predicted_disease}")
    else:
        st.error("Please enter symptoms to predict.")
"""
    # The template holds literal braces (f-string, dict-free code), so use a
    # plain placeholder swap rather than str.format; repr() yields a correctly
    # quoted Python string literal for the URI.
    app_source = app_source.replace("__MODEL_URI__", repr(model_uri))
    with open(app_path, 'w', encoding='utf-8') as f:
        f.write(app_source)

In [None]:
# Generate pyspark_app.py in the current working directory.
write_streamlit_app()

In [None]:
import shutil
import subprocess

# Resolve the Streamlit CLI up front so a missing install fails with a clear
# message instead of an obscure launch error.
streamlit_path = shutil.which("streamlit")
if streamlit_path is None:
    raise FileNotFoundError("Streamlit is not installed or not in PATH.")

# Start the app as a background child process; `proc` keeps the handle so it
# can be inspected or terminated later.
launch_command = [streamlit_path, 'run', 'pyspark_app.py']
proc = subprocess.Popen(launch_command)

In [None]:
!streamlit run app.py &>/content/logs.txt &

In [None]:
# Install the localtunnel npm package used below to expose the local app.
!npm install localtunnel

[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K
up to date, audited 23 packages in 2s
[1G[0K⠇[1G[0K
[1G[0K⠇[1G[0K3 packages are looking for funding
[1G[0K⠇[1G[0K  run `npm fund` for details
[1G[0K⠇[1G[0K
2 [33m[1mmoderate[22m[39m severity vulnerabilities

To address all issues (including breaking changes), run:
  npm audit fix --force

Run `npm audit` for details.
[1G[0K⠇[1G[0K

In [None]:
# Print the response of loca.lt's "mytunnelpassword" endpoint to the cell output.
# NOTE(review): presumably this is the value the tunnel landing page asks
# visitors for - confirm against the localtunnel documentation.
!wget -q -O - https://loca.lt/mytunnelpassword

34.125.70.239

In [None]:
# Open a public localtunnel URL that forwards to local port 8501.
# NOTE(review): 8501 is assumed to be the port the Streamlit server listens on - confirm.
!npx localtunnel --port 8501

[1G[0K⠙[1G[0Kyour url is: https://violet-bags-dream.loca.lt
