In [None]:
import os
import re
import string
import unicodedata
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize 
from textblob import TextBlob

import warnings; warnings.simplefilter('ignore')

In [None]:
# Location of the raw reviews CSV. The original local path remains the default,
# so existing runs are unaffected; set the RAW_DATA_PATH environment variable
# to run the notebook on another machine without editing this cell.
RAW_DATA_PATH = os.environ.get(
    "RAW_DATA_PATH",
    "/Users/madhuranirale/Desktop/HEC-T1/Quinten/raw_data_healthcare.csv",
)
data = pd.read_csv(RAW_DATA_PATH)

In [None]:
# Discard the 'text_index' column (rebinding instead of mutating in place).
data = data.drop(columns=['text_index'])

In [None]:
# Display the frame to check the result of dropping 'text_index'.
data

## Filtering required data

In [None]:
# Split "<drug> for <illness>" on the first standalone word "for"
# (case-insensitive). Assumes 'medication' follows that layout -- TODO confirm.
#   * \b...\b keeps the split from firing inside words that merely contain
#     "for" (e.g. a drug name such as "Metformin").
#   * n / expand are passed by keyword: pandas 2.x no longer accepts them
#     positionally and raises a TypeError otherwise.
split_data = data['medication'].str.split(r'(?i)\bfor\b', n=1, expand=True)

# Strip the whitespace that surrounded "for" so labels are clean.
data['drug'] = split_data[0].str.strip()
data['illness'] = split_data[1].str.strip()

# Define the regex pattern for filtering
pattern = r'(?i)Ulcerative Colitis|Crohn\'s Disease'

# Keep only the conditions of interest. Rows whose medication had no "for"
# part have a missing illness; na=False turns those into False so the mask
# stays strictly boolean instead of raising on NaN. copy() detaches the result
# so later column assignments do not act on a view of the unfiltered frame.
data = data[data['illness'].str.contains(pattern, case=False, na=False)].copy()

In [None]:
# Display the frame after filtering on the illness pattern.
data

In [None]:
# Persist the filtered reviews. The index is written as well (no index=False),
# which is why the reload step later drops an 'Unnamed: 0' column.
data.to_csv('shortlisted.csv')

## EDA

In [None]:
# Number of missing values per column.
data.isna().sum()
#Valid ratings = 281-14 = 267

In [None]:
# Number of distinct values per column.
data.nunique()

In [None]:
# Bar plot of how many reviews mention each illness (ten most frequent).
cond = dict(data['illness'].value_counts())
top_condition = list(cond)[:10]
values = list(cond.values())[:10]

# Theme first, then the matplotlib overrides for this figure.
sns.set(font_scale=1.3, style='darkgrid')
plt.rcParams.update({
    'figure.figsize': [7, 5],
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
})

sns_ = sns.barplot(x=top_condition, y=values, palette='winter')
sns_.set_title("Conditions vs count", fontsize=12)
sns_.set_xlabel("Illness", fontsize=10)
sns_.set_ylabel("Count", fontsize=10)

plt.xticks(rotation=90)
plt.show()

In [None]:
# Number of reviews per drug, most frequently reviewed first.
top_drugs = data['drug'].value_counts()

drug_ax = sns.barplot(x=top_drugs.index, y=top_drugs.values)
drug_ax.set_title('Reviewed Drugs Count', fontsize=12)
plt.setp(drug_ax.get_xticklabels(), rotation=90)
drug_ax.set_xlabel('Drug Name', fontsize=10)
drug_ax.set_ylabel('Count', fontsize=10)
plt.show()

In [None]:
# Bar plot of how many 10/10 ratings each drug received.

# Theme and figure size
sns.set(style='darkgrid', font_scale=1.2)
plt.rcParams['figure.figsize'] = [7, 5]

# Count the perfect scores per drug
rating = dict(data.loc[data['rate'].eq(10), "drug"].value_counts())
drugname, drug_rating = list(rating), list(rating.values())

plt.rcParams.update({'xtick.labelsize': 10, 'ytick.labelsize': 10})
sns_rating = sns.barplot(x=drugname, y=drug_rating)

sns_rating.set_title('Top drugs with 10/10 rating', fontsize=12)
sns_rating.set_ylabel("Number of Ratings", fontsize=10)
sns_rating.set_xlabel("Drug Names", fontsize=10)
plt.setp(sns_rating.get_xticklabels(), rotation=90);

In [None]:
# How often each rating value occurs, ordered by rating
rating_counts = data['rate'].value_counts().sort_index()

# Donut chart: a pie whose wedges are narrowed to a ring
fig, ax = plt.subplots()
wedges, texts, autotexts = ax.pie(
    rating_counts,
    labels=None,
    autopct='',
    startangle=140,
    wedgeprops={'width': 0.3},
)

# Share of each rating in percent, used for the legend entries
total_ratings = sum(rating_counts)
percentages = [(count / total_ratings * 100) for count in rating_counts]
labels = [
    f"{score} ({share:.1f}%)"
    for score, share in zip(rating_counts.index, percentages)
]

# The legend carries the rating labels next to the chart
ax.legend(
    wedges,
    labels,
    title="Ratings",
    loc="center left",
    bbox_to_anchor=(0.9, 0.7),
    prop={'size': 8},
    title_fontsize=10,
)
plt.setp(autotexts, size=10, weight="bold")

ax.set_title('Distribution of Ratings', fontsize=12)
ax.axis('equal')  # equal aspect ratio keeps the ring circular

plt.show()


In [None]:
# Density histogram with a KDE overlay showing the distribution of the ratings
plt.rcParams['figure.figsize'] = [6, 4]
sns.set(font_scale=1.2, style='whitegrid')

# sns.distplot is deprecated since seaborn 0.11 and scheduled for removal;
# histplot with kde=True / stat='density' (and cut=3 for the KDE) is the
# documented equivalent. Missing ratings are dropped explicitly so only valid
# values enter the histogram and the density estimate.
sns_plot = sns.histplot(
    data['rate'].dropna(),
    kde=True,
    stat='density',
    kde_kws={'cut': 3},
    color='skyblue',
)
sns_plot.set(xlim=(0, 12))

sns_plot.set_title('Distribution of Ratings')
sns_plot.set_xlabel("Rating")

plt.show()


## Data preprocessing

In [None]:
# Reload the shortlisted reviews and discard the index column that was
# written to the CSV alongside them.
data = pd.read_csv('shortlisted.csv').drop(columns=['Unnamed: 0'])
data

In [None]:
# Binary label from the rating: 1 for a rate of 5 or more, 0 below 5.
# Rows with a missing rate match neither mask and stay unlabelled.
data.loc[data['rate'].ge(5), 'review_sentiment'] = 1
data.loc[data['rate'].lt(5), 'review_sentiment'] = 0

# Class balance of the resulting label
data['review_sentiment'].value_counts()

In [None]:
def remove_noise(review):
    """Normalise raw review text.

    Parameters
    ----------
    review : pandas.Series
        Series of review strings (processed through the ``.str`` accessor).

    Returns
    -------
    pandas.Series
        The reviews lower-cased, with the ``&#039;`` entity removed, every
        character that is not a word character or whitespace replaced by a
        space, non-ASCII runs replaced by a space, leading/trailing whitespace
        removed and whitespace runs collapsed to a single space.

    Notes
    -----
    ``regex=True`` is passed explicitly: since pandas 2.0 ``Series.str.replace``
    treats the pattern as a literal string by default, which silently turned
    every regex step below into a no-op.
    """
    # Changing to lower case
    cleaned = review.str.lower()

    # Removing the repeating HTML entity &#039; (literal match, not a regex)
    cleaned = cleaned.str.replace("&#039;", "", regex=False)

    # Replacing all special characters (punctuation included) with a space
    cleaned = cleaned.str.replace(r'[^\w\d\s]', ' ', regex=True)

    # Replacing runs of non-ASCII characters with a space
    cleaned = cleaned.str.replace(r'[^\x00-\x7F]+', ' ', regex=True)

    # Removing the leading and trailing whitespace
    cleaned = cleaned.str.replace(r'^\s+|\s+?$', '', regex=True)

    # Replacing multiple whitespace characters with a single space
    cleaned = cleaned.str.replace(r'\s+', ' ', regex=True)

    # The former "two or more dots" step was dropped: every dot has already
    # been replaced by the special-character step, so it could never match.
    return cleaned

In [None]:
# Display the frame before the cleaned review column is added.
data

In [None]:
# Store the noise-free version of every comment in 'review_clean'.
data['review_clean'] = data['comment'].pipe(remove_noise)
data

In [None]:
# Drop English stop words from the cleaned reviews.
stop_words = set(stopwords.words('english'))
data['review_clean'] = data['review_clean'].map(
    lambda text: ' '.join([token for token in text.split() if token not in stop_words])
)

In [None]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so all dependencies are visible in one place.
import spacy

# Load the spaCy English language model (small pipeline) used by the
# lemmatization function defined below.
nlp = spacy.load("en_core_web_sm")

# Lemmatization helper built on the spaCy pipeline `nlp`
def lemmatize_with_spacy(text):
    """Return `text` with every token replaced by its spaCy lemma.

    Tokens whose lemma is the placeholder "-PRON-" keep their original text.
    The resulting tokens are joined with single spaces.
    """
    pieces = []
    for token in nlp(text):
        if token.lemma_ == "-PRON-":
            pieces.append(token.text)
        else:
            pieces.append(token.lemma_)
    return " ".join(pieces)

# Lemmatize every cleaned review and keep the result in 'review'.
data['review'] = data['review_clean'].map(lemmatize_with_spacy)
data


In [None]:
def sentiment(review):
    """Compute the TextBlob sentiment polarity of each text in `review`.

    `review` is any iterable of texts; the polarities are returned as a list
    in the same order as the input.
    """
    return [TextBlob(text).sentiment.polarity for text in review]

In [None]:
# Polarity of the raw, uncleaned comments.
data['sentiment'] = sentiment(data['comment'])

In [None]:
# Polarity of the cleaned reviews held in 'review_clean'.
data['sentiment_clean'] = sentiment(data['review_clean'])

In [None]:
# Cleaning the reviews without removing the stop words.
# NOTE(review): despite the "_ss" suffix, no Snowball stemmer is applied here;
# the text only goes through remove_noise.
data['review_clean_ss'] = remove_noise(data['comment'])
data['sentiment_clean_ss'] = sentiment(data['review_clean_ss'])

In [None]:
# Number of whitespace-separated words in each cleaned review
data['count_word'] = data["review_clean_ss"].map(lambda text: len(str(text).split()))

# Number of distinct words in each cleaned review
data['count_unique_word'] = data["review_clean_ss"].map(lambda text: len(set(str(text).split())))

# Length of the cleaned review in characters
data['count_letters'] = data["review_clean_ss"].map(lambda text: len(str(text)))

# Punctuation characters in the raw comment
data["count_punctuations"] = data["comment"].map(
    lambda text: sum(1 for char in str(text) if char in string.punctuation)
)

# Fully upper-case words in the raw comment
data["count_words_upper"] = data["comment"].map(
    lambda text: sum(1 for word in str(text).split() if word.isupper())
)

# Title-case words in the raw comment
data["count_words_title"] = data["comment"].map(
    lambda text: sum(1 for word in str(text).split() if word.istitle())
)

# Stop words in the lower-cased raw comment
data["count_stopwords"] = data["comment"].map(
    lambda text: sum(1 for word in str(text).lower().split() if word in stop_words)
)

# Average word length in the cleaned review
data["mean_word_len"] = data["review_clean_ss"].map(
    lambda text: np.mean([len(word) for word in str(text).split()])
)

In [None]:
# Display the frame with the sentiment and text-statistics columns added.
data