In [2]:
# Step 1: Import Libraries
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [3]:
#loading the data 

In [4]:

file_path = r"C:\Users\hemch\Downloads\Healthcare Projects\Dataset.csv"
data = pd.read_csv(file_path)

In [5]:
data.head()

Unnamed: 0,Review_Text,Rating
0,I have mixed feelings about my experience.,4
1,The staff was caring and attentive. I couldn't...,5
2,I have mixed feelings about my experience.,5
3,I have mixed feelings about my experience.,5
4,The healthcare provider was excellent. I had a...,3


## Data Preprocessing

**Text Cleaning**: Tokenization, stop-word removal, and lemmatization.

**Data Transformation**: Converting text data into a suitable format for analysis.

In [6]:
# Ensure all review text entries are strings
data['Review_Text'] = data['Review_Text'].astype(str)

In [8]:
def preprocess_text_sklearn(text):
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
    text = text.lower()
    text = text.strip()
    return text

In [9]:
data['Cleaned_Review'] = data['Review_Text'].apply(preprocess_text_sklearn)

## Sentiment Labeling

**Define Sentiment Categories**: Based on the Rating column, categorize reviews into positive, negative, and neutral sentiments.

In [11]:
def label_sentiment(rating):
    if rating >= 4:
        return 'positive'
    elif rating == 3:
        return 'neutral'
    else:
        return 'negative'



In [12]:
# Apply sentiment labeling
data['Sentiment'] = data['Rating'].apply(label_sentiment)

## Model Training

**LOGISTIC REGRESSION MODEL**

In [13]:
X_train, X_test, y_train, y_test = train_test_split(data['Cleaned_Review'], data['Sentiment'], test_size=0.2, random_state=42)

In [15]:
# Transform Text Data into TF-IDF Features
tfidf = TfidfVectorizer(max_features=5000, stop_words=ENGLISH_STOP_WORDS)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [16]:
#Train the Logistic Regression Model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

LogisticRegression()

In [17]:
y_pred = model.predict(X_test_tfidf)

In [18]:
report = classification_report(y_test, y_pred, target_names=['negative', 'neutral', 'positive'])
print(report)

              precision    recall  f1-score   support

    negative       0.38      0.39      0.38        77
     neutral       0.00      0.00      0.00        35
    positive       0.43      0.59      0.50        88

    accuracy                           0.41       200
   macro avg       0.27      0.33      0.29       200
weighted avg       0.34      0.41      0.37       200



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Insights:

**Negative Sentiment:**

Precision: 38%  
Recall: 39%  
F1-Score: 38%  


**Neutral Sentiment:**

Precision: 0%  
Recall: 0%  
F1-Score: 0%  


**Positive Sentiment:**

Precision: 43%  
Recall: 59%  
F1-Score: 50%  

**Overall Accuracy: 41%**

The model shows moderate performance in identifying negative and positive sentiments but struggles significantly with neutral sentiments, as indicated by the 0% precision and recall for neutral reviews.

## Applying with multiple Models 

**Naive Bayes**

**Support Vector Machine (SVM)**

**Random Forest**

In [20]:
from sklearn.naive_bayes import MultinomialNB

In [21]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [24]:
models = {
    'Naive Bayes': MultinomialNB(),
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier()
}


In [25]:
for model_name, model in models.items():
    print(f"Evaluating {model_name}")
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    report = classification_report(y_test, y_pred, target_names=['negative', 'neutral', 'positive'])
    print(report)

Evaluating Naive Bayes
              precision    recall  f1-score   support

    negative       0.39      0.34      0.36        77
     neutral       0.00      0.00      0.00        35
    positive       0.43      0.66      0.52        88

    accuracy                           0.42       200
   macro avg       0.28      0.33      0.30       200
weighted avg       0.34      0.42      0.37       200

Evaluating Support Vector Machine
              precision    recall  f1-score   support

    negative       0.38      0.39      0.38        77
     neutral       0.00      0.00      0.00        35
    positive       0.43      0.59      0.50        88

    accuracy                           0.41       200
   macro avg       0.27      0.33      0.29       200
weighted avg       0.34      0.41      0.37       200

Evaluating Random Forest
              precision    recall  f1-score   support

    negative       0.38      0.39      0.38        77
     neutral       0.00      0.00      0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Insights and Model Performance Comparison

**Naive Bayes**

Achieved the highest accuracy of 42%.  
Performed best in predicting positive sentiments with an F1-score of 52%.  
Struggled with neutral sentiments, similar to other models.  

**SVM and Random Forest**

All had an accuracy of 41%.  
Showed similar performance metrics, with F1-scores of around 50% for positive sentiments.  
Failed to effectively predict neutral sentiments  

## Conclusion


The sentiment analysis reveals a consistent challenge across all models in accurately predicting neutral sentiments. Positive sentiments were identified with moderate success, while negative sentiments had lower precision and recall. Among the models, Naive Bayes slightly outperformed the others in overall accuracy and the prediction of positive sentiments.

## Recommendations

**Data Augmentation:**

Increase the dataset size, especially for neutral reviews, to provide more balanced training data.  
Collect more labeled data to improve model training.  

**Advanced Models**

Explore more sophisticated models such as BERT or other transformer-based models that might capture nuances better than traditional classifiers.  

**Feature Engineering**

Incorporate additional text processing techniques, such as n-grams (bigrams or trigrams) and word embeddings.  
Experiment with different feature extraction methods and hyperparameter tuning.  

**Continuous Improvement**

Implement a feedback loop to continuously collect new patient reviews and update the analysis.  
Regularly refine and retrain models with new data to improve accuracy and relevance.  

**Summary**

This sentiment analysis project provides a foundation for understanding patient feedback on healthcare services. Although the models showed moderate success in predicting sentiments, there is significant room for improvement. By following the recommendations, the analysis can be refined to provide more accurate and actionable insights, ultimately aiding healthcare providers in enhancing their services.