<a href="https://colab.research.google.com/github/lavanblavan/Sentiment-Analysis-training/blob/main/SentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install nltk
!pip install spacy
!pip install pyspark py4j
!pip install pyspark transformers torch

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!mkdir /content/drive/MyDrive/DrugReview
# download dataset from kaggle
!kaggle datasets download mohamedabdelwahabali/drugreview
!unzip /content/drugreview.zip -d /content/drive/MyDrive/DrugReview
!chmod 600 /content/drive/MyDrive/DrugReview/kaggle.json

In [2]:
# import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master('local[*]') \
    .config("spark.driver.memory", "16g") \
    .config('spark.executor.memory','32g')\
    .appName('DrugReview') \
    .getOrCreate()

In [3]:
# change the data to pyspark structure
df = spark.read.csv("/content/drive/MyDrive/DrugReview/drug_review_train.csv", header=True, inferSchema=True,multiLine=True,escape='"')
df.show()

+---+----------+--------------------+--------------------+--------------------+------+-----------------+-----------+-------------+
|_c0|patient_id|            drugName|           condition|              review|rating|             date|usefulCount|review_length|
+---+----------+--------------------+--------------------+--------------------+------+-----------------+-----------+-------------+
|  0|     89879|        Cyclosporine|keratoconjunctivi...|"i have used rest...|   2.0|   April 20, 2013|         69|          147|
|  1|    143975|        Etonogestrel|       birth control|"my experience ha...|   7.0|   August 7, 2016|          4|          136|
|  2|    106473|            Implanon|       birth control|"this is my secon...|   1.0|     May 11, 2016|          6|          140|
|  3|    184526|         Hydroxyzine|             anxiety|"i recommend taki...|  10.0|   March 19, 2012|        124|          104|
|  4|     91587|       Dalfampridine|  multiple sclerosis|"i have been on a...|   9

In [4]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StopWordsRemover, CountVectorizer, IDF,HashingTF,StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark.sql.types import StringType, ArrayType
from pyspark.sql.functions import regexp_replace
from pyspark.ml.feature import RegexTokenizer
df =df.drop('words','filtered')
df.withColumn('review', regexp_replace('review', '[^a-zA-Z0-9\\s]', ''))
Tokenizer = RegexTokenizer(inputCol="review", outputCol="words", pattern="\\W")
df = Tokenizer.transform(df)
# Define sentiment labels based on rating
def sentiment_label(rating):
    if rating > 8:
        return 2  # Positive
    elif rating < 4:
        return 0  # Negative
    else:
        return 1  # Neutral

sentiment_udf = udf(sentiment_label, IntegerType())
df = df.withColumn("label", sentiment_udf(col("rating")))

# Stop words removal
stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered")
df = stopwords_remover.transform(df)



In [5]:
import nltk
nltk.download('all')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_r

In [6]:
df = df.drop('lemmatize')
def lemmatizer(text):
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in text]
    return lemmatized_tokens
# Register UDF for BERT embedding
lemmatize = udf(lemmatizer, ArrayType(StringType()))

# Apply BERT embedding transformation
df = df.withColumn("lemmatize", lemmatize(col("filtered")))
df.show(truncate=False)

+---+----------+-------------------------+--------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+-----------------+-----------+-------------+------------------------------------------------------------------------------------------------------------

In [7]:
df = df.drop('rawFeatures','Features')
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=1500)
df = hashingTF.transform(df)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(df)
df = idfModel.transform(df)

df.select("label", "features").show(truncate=False)

+-----+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [8]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import col



#  Convert label column to numerical index
indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
df = indexer.fit(df).transform(df)

# Convert features from Spark Vector to NumPy array
def vector_to_array(row):
    return (np.array(row["features"].toArray(), dtype=np.float32), row["indexedLabel"])

data = df.select("features", "indexedLabel").rdd.map(vector_to_array).collect()
X, y = zip(*data)  # Unzip into features and labels
X = np.array(X)
y = np.array(y)
# Define a simple neural network model
model = keras.Sequential([
    keras.layers.Dense(128, activation="relu", input_shape=(X.shape[1],)),
    keras.layers.Dense(64, activation="relu"),
    keras.layers.Dense(len(set(y)), activation="softmax")  # Multi-class output
])

# Compile the model
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
# Train the model
model.fit(X, y, epochs=10, batch_size=32, validation_split=0.2)
# Make predictions
y_pred = np.argmax(model.predict(X), axis=1)

# Convert NumPy arrays back to PySpark DataFrame
results = list(zip(y.tolist(), y_pred.tolist()))
df_results = spark.createDataFrame(results, ["actual", "predicted"])

# Evaluate Accuracy
accuracy = df_results.filter(col("actual") == col("predicted")).count() / df_results.count()
print(f"Test Accuracy: {accuracy:.4f}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m2771/2771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 8ms/step - accuracy: 0.5985 - loss: 0.8667 - val_accuracy: 0.6586 - val_loss: 0.7638
Epoch 2/10
[1m2771/2771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 7ms/step - accuracy: 0.7297 - loss: 0.6323 - val_accuracy: 0.6926 - val_loss: 0.7321
Epoch 3/10
[1m2771/2771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 8ms/step - accuracy: 0.8300 - loss: 0.4233 - val_accuracy: 0.7161 - val_loss: 0.7795
Epoch 4/10
[1m2771/2771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 8ms/step - accuracy: 0.9069 - loss: 0.2526 - val_accuracy: 0.7121 - val_loss: 0.9284
Epoch 5/10
[1m2771/2771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 7ms/step - accuracy: 0.9452 - loss: 0.1581 - val_accuracy: 0.7204 - val_loss: 1.1205
Epoch 6/10
[1m2771/2771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 8ms/step - accuracy: 0.9639 - loss: 0.1068 - val_accuracy: 0.7231 - val_loss: 1.3665
Epoch 7/10

In [None]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.sql import SparkSession



# Assuming df is already preprocessed with TF-IDF features

# Convert label column to numerical index
indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
df = indexer.fit(df).transform(df)

# Split data into training and test sets
train, test = df.randomSplit([0.8, 0.2], seed=1234)

# Define Neural Network architecture
input_size = 1500  # Same as numFeatures in HashingTF
hidden_layer_1 = 512  # Example hidden layer with 20 neurons
hidden_layer_2 = 256   # Example second hidden layer with 20 neurons
hidden_layer_3 = 128 #Example third layer with 10 neurons
output_size = df.select("indexedLabel").distinct().count()  # Number of classes
layers = [input_size, hidden_layer_1, hidden_layer_2,hidden_layer_3, output_size]

# Define the Multilayer Perceptron Classifier
mlp = MultilayerPerceptronClassifier(featuresCol="features", labelCol="indexedLabel", layers=layers, blockSize=128, seed=1234)

# Train the model
model = mlp.fit(train)

# Make predictions
test_predictions = model.transform(test)

evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(test_predictions)
print(f"Test Accuracy: {accuracy}")


