# DR VAKU® Swadesi Pulse Oximeter - Analysis

https://www.amazon.in/DR-Oximeter-Fingertip-Monitor-Approved/dp/B08D3KXF9Y/ref=cm_cr_arp_d_product_top?ie=UTF8&th=1

In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import spacy
import string
from wordcloud import WordCloud, STOPWORDS

%matplotlib inline

ModuleNotFoundError: No module named 'wordcloud'

In [None]:
reviews = pd.read_csv('OxiReviews.csv')

In [None]:
reviews.head()

In [None]:
reviews.info()

# Rating Graph

In [None]:
import seaborn as sns
count = reviews['rating'].value_counts()
sns.barplot(x=count.index, y = count, color = '#a2c816')

In [None]:
reviews['rating'].value_counts()

In [None]:
reviews = reviews.dropna(axis=0)

In [None]:
reviews.isnull().sum()

In [None]:
body = reviews['body']
rv = " ".join(body)
rv

In [None]:
#To remove punctuation marks
no_punc_text = rv.translate(str.maketrans('', '', string.punctuation))

In [None]:
no_punc_text

In [None]:
#To Create tokens
import nltk
from nltk.tokenize import word_tokenize

# word_tokenize relies on NLTK's Punkt tokenizer data. Fetch it before the
# first use so a fresh top-to-bottom run does not stop here with a LookupError
# (the download is a no-op when the data is already present).
nltk.download('punkt')
text_tokens = word_tokenize(no_punc_text)
print(text_tokens[0:50])

In [None]:
len(text_tokens)

In [None]:
#Remove Stopwords by NLTK
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
# Start from NLTK's English stop-word list and add a few extra tokens
# (capitalised variants and apostrophe-less forms such as 'dont' / 'Im').
my_stop_words = stopwords.words('english')
li = ['the','The', 'in','I', 'This','its','it','It', 'dont', 'a', 'He', 'Im', 'as']
my_stop_words.extend(li)

# A set gives constant-time membership tests, so filtering stays linear in the
# number of tokens instead of scanning the whole stop-word list per token.
stop_word_set = set(my_stop_words)
no_stop_tokens = [word for word in text_tokens if word not in stop_word_set]
print(no_stop_tokens[:40])

In [None]:
#Normalization
lower_words = [x.lower() for x in no_stop_tokens]
print(lower_words[0:25])

In [None]:
nlp = spacy.load('en_core_web_sm') 

In [None]:
doc = nlp(' '.join(no_stop_tokens))
print(doc[0:40])

In [None]:
lemmas = [token.lemma_ for token in doc]
print(lemmas[0:25])

In [None]:
#Feature Extraction
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
# Every element of `lemmas` is handed to the vectorizer as a separate document,
# so X has one row per lemma and one column per vocabulary term.
X = vectorizer.fit_transform(lemmas)

In [None]:
print(vectorizer.vocabulary_)

In [None]:
import seaborn as sns

# vectorizer.vocabulary_ maps each term to its COLUMN INDEX in X, not to how
# often it occurs. Sorting those values only ranks terms alphabetically, so the
# real frequencies are taken from the column sums of the document-term matrix.
term_totals = np.asarray(X.sum(axis=0)).ravel()
dic = {term: int(term_totals[col]) for term, col in vectorizer.vocabulary_.items()}

# Column names are kept as-is ('Date' = term, 'DateValue' = count) so the CSV
# written from data0 keeps the same header.
data = pd.DataFrame(list(dic.items()), columns=['Date', 'DateValue'])
data0 = data.sort_values('DateValue', ascending=False)[:10]

plt.figure(figsize=(15, 8))
sns.barplot(x='Date', y="DateValue", data=data0, color='#a2c816')


In [None]:
data0.to_csv(r'wordsFinalFreq.csv', index=False)
data0.head(10)

In [None]:
#Create Wordcloud
from wordcloud import WordCloud, STOPWORDS

def plot_cloud(wordcloud):
    """Show *wordcloud* as an image on a new 40x30 figure with the axes hidden."""
    canvas_size = (40, 30)
    plt.figure(figsize=canvas_size)
    plt.imshow(wordcloud)
    # Tick marks and frame add nothing to a word cloud, so switch them off.
    plt.axis("off")

In [None]:
# STOPWORDS is passed straight to WordCloud. It is deliberately NOT rebound to
# the name `stopwords`: that would shadow the nltk.corpus.stopwords import and
# break the stop-word filtering cell if it is run again afterwards.
wordcloud = WordCloud(
    width=3000,
    height=2000,
    random_state=1,
    background_color='black',
    colormap='Set2',
    collocations=False,
    stopwords=STOPWORDS,
).generate(rv)
plot_cloud(wordcloud)

In [None]:
wordcloud.to_file("wordCLFinal.png")

In [None]:
def _label_for_rating(rating):
    """Map a star rating to 'Negative' (<=2), 'Neutral' (==3) or 'Positive' (>=4).

    A value matching none of the three conditions is returned unchanged.
    """
    if rating <= 2:
        return 'Negative'
    if rating == 3:
        return 'Neutral'
    if rating >= 4:
        return 'Positive'
    return rating


df = [_label_for_rating(rating) for rating in reviews['rating']]

# Assign the plain list so labels are matched to rows by POSITION. Wrapping it
# in pd.DataFrame would give it a fresh 0..n-1 index; after the earlier dropna
# the index of `reviews` has gaps, so index alignment would attach labels to
# the wrong rows and leave NaN in others.
reviews['Classify'] = df

In [None]:
reviews.head(1000)

In [None]:
pct_Classify = reviews['Classify'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'
pct_Classify

# VADER Sentiment Analysis

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
def classify_compound(text, threshold=0.33, analyzer=None):
    """Label *text* as 'Negative', 'Neutral' or 'Positive' from its compound score.

    Parameters
    ----------
    text : str
        Text to score.
    threshold : float, optional
        Scores <= -threshold give 'Negative', scores >= threshold give
        'Positive', anything in between gives 'Neutral'.
    analyzer : object, optional
        Any object with a ``polarity_scores(text)`` method returning a mapping
        that contains a 'compound' entry. When omitted, a single
        SentimentIntensityAnalyzer is created on first use and reused.

    Returns
    -------
    str
        One of 'Negative', 'Neutral', 'Positive'.
    """
    if analyzer is None:
        # Build the default analyzer once and keep it on the function object;
        # creating a new one for every row of DataFrame.apply is wasted work.
        analyzer = getattr(classify_compound, '_shared_analyzer', None)
        if analyzer is None:
            analyzer = SentimentIntensityAnalyzer()
            classify_compound._shared_analyzer = analyzer

    # polarity_scores returns negative, neutral, positive and compound scores;
    # only the compound score is used here.
    score = analyzer.polarity_scores(text)['compound']

    # Translate the score into the corresponding label according to the threshold.
    if score <= -threshold:
        return 'Negative'
    if score >= threshold:
        return 'Positive'
    return 'Neutral'

In [None]:
reviews['compound_sentiment'] = reviews.body.apply(lambda text: classify_compound(text))

In [None]:
reviews.head(10)

In [None]:
pct_cs = reviews['compound_sentiment'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'
pct_cs

In [None]:
plt.hist(reviews['compound_sentiment'],color='#a2c816')

In [None]:
# value_counts() orders classes by frequency, so the wedge labels must come
# from the counts themselves; a fixed label list can end up on the wrong wedges.
sentiment_counts = reviews['compound_sentiment'].value_counts()
label = list(sentiment_counts.index)
plt.pie(sentiment_counts, labels=label)
plt.show()

# TextBlob

In [None]:
from textblob import TextBlob

In [None]:
def pol(x):
    """Return the TextBlob sentiment polarity of the text *x*."""
    return TextBlob(x).sentiment.polarity


def sub(x):
    """Return the TextBlob sentiment subjectivity of the text *x*."""
    return TextBlob(x).sentiment.subjectivity

In [None]:
reviews['Polarity'] = reviews['body'].apply(pol)
reviews['Subjectivity'] = reviews['body'].apply(sub)

In [None]:
reviews.head()

In [None]:
#Polarity lies between [-1,1]
def getAnalysis(score):
    """Turn a polarity score into 'Negative' (< 0), 'Neutral' (== 0) or 'Positive'."""
    if score < 0:
        return 'Negative'
    return 'Neutral' if score == 0 else 'Positive'

In [None]:
reviews['TextBlob'] = reviews['Polarity'].apply(getAnalysis)

In [None]:
reviews.head()

In [None]:
pct_tb = reviews['TextBlob'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'
pct_tb

In [None]:
sns.histplot(reviews['TextBlob'], color='#a2c816')

In [None]:
# unique() lists classes in order of first appearance while value_counts()
# orders them by frequency; take the labels from the counts so every wedge is
# named after the class it actually represents.
textblob_counts = reviews['TextBlob'].value_counts()
plt.pie(textblob_counts, labels=textblob_counts.index)
plt.show()

# We have used three classifiers; here is a comparison of them

In [None]:
pct_Classify #Through Ratings

In [None]:
pct_cs #VADER Classifier

In [None]:
pct_tb #TextBlob Classifier

In [None]:
import re
def text_cleaner (text):
    """Return *text* lower-cased, with every non-letter replaced by a space
    and runs of whitespace collapsed to single spaces."""
    letters_only = re.sub('[^a-zA-Z]', " ", text)
    words = letters_only.lower().split()
    return ' '.join(words)

In [None]:
reviews.dropna(subset = ["Classify"], inplace=True, axis=0)

In [None]:
reviews.isnull().sum()

In [None]:

# Normalise every review body with text_cleaner; str() is applied first so a
# non-string value is cleaned as its string form.
reviews["Clean_review"]=reviews["body"].apply(lambda x: text_cleaner(str(x)))
# Binary target: 'Positive' -> 1, while 'Negative' and 'Neutral' both map to 0.
reviews["sentiment"] = reviews["Classify"].map({'Positive': 1, 'Negative':0, 'Neutral':0})
reviews.head()

In [None]:
reviews["sentiment"].value_counts()

# Modeling

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(binary=True)

In [None]:
x = cv.fit_transform(reviews["Clean_review"]).toarray()

In [None]:
y = reviews["sentiment"].values

In [None]:
y

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, f1_score, precision_score, recall_score
from sklearn.metrics import roc_curve, auc

In [None]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=10)

In [None]:
from sklearn.naive_bayes import BernoulliNB , GaussianNB, MultinomialNB

In [None]:
bnb = BernoulliNB()
gnb= GaussianNB()
mnb= MultinomialNB()

# BernoulliNB

In [None]:
bnb.fit(X_train, y_train)
y_pred = bnb.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
# ROC/AUC need a continuous score. With hard 0/1 predictions the curve has a
# single operating point, so use the predicted probability of class 1 instead.
y_score = bnb.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score)
plt.plot(fpr, tpr, color='#a2c816')
print(auc(fpr, tpr))

# GaussianNB

In [None]:
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
# ROC/AUC need a continuous score. With hard 0/1 predictions the curve has a
# single operating point, so use the predicted probability of class 1 instead.
y_score = gnb.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score)
plt.plot(fpr, tpr, color='#a2c816')
print(auc(fpr, tpr))

# MultinomialNB

In [None]:
mnb.fit(X_train, y_train)
y_pred = mnb.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
# ROC/AUC need a continuous score. With hard 0/1 predictions the curve has a
# single operating point, so use the predicted probability of class 1 instead.
y_score = mnb.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score)
plt.plot(fpr, tpr, color='#a2c816')
print(auc(fpr, tpr))

# LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lg = LogisticRegression()

In [None]:
lg.fit(X_train, y_train)
y_pred = lg.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
# ROC/AUC need a continuous score. With hard 0/1 predictions the curve has a
# single operating point, so use the predicted probability of class 1 instead.
y_score = lg.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score)
plt.plot(fpr, tpr, color='#a2c816')
print(auc(fpr, tpr))

# Support Vector Classifier

In [None]:
from sklearn.svm import SVC
sv = SVC()

In [None]:
sv.fit(X_train, y_train)
y_pred = sv.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
# ROC/AUC need a continuous score. SVC() is created without probability=True,
# so the signed distance from decision_function is used rather than hard 0/1
# predictions, which would give a single operating point.
y_score = sv.decision_function(X_test)
fpr, tpr, _ = roc_curve(y_test, y_score)
plt.plot(fpr, tpr, color='#a2c816')
print(auc(fpr, tpr))

In [None]:
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier

In [None]:
rf= RandomForestClassifier(n_estimators=300 , random_state=50)
ad= AdaBoostClassifier(n_estimators=300 , random_state=50)
gb = GradientBoostingClassifier(n_estimators=300 , random_state=5)
etc= ExtraTreesClassifier(n_estimators=300 , random_state=5)

In [None]:
# from sklearn.model_selection import GridSearchCV
# param_grid = {'n_estimators': [100,150,200,300,500],
#               'random_state': [10,20,30,40,50,60]}
# grid_rf = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, refit = True, verbose = 3)
# grid_ad = GridSearchCV(AdaBoostClassifier(), param_grid=param_grid, refit = True, verbose = 3)
# grid_gb = GridSearchCV(GradientBoostingClassifier(), param_grid=param_grid, refit = True, verbose = 3)
# grid_etc = GridSearchCV(ExtraTreesClassifier(), param_grid=param_grid, refit = True, verbose = 3)

In [None]:
# grid_rf.fit(X_train, y_train)

In [None]:
#grid_rf.best_params_

# RandomForestClassifier

In [None]:
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
# ROC/AUC need a continuous score. With hard 0/1 predictions the curve has a
# single operating point, so use the predicted probability of class 1 instead.
y_score = rf.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score)
plt.plot(fpr, tpr, color='#a2c816')
print(auc(fpr, tpr))

# AdaBoostClassifier

In [None]:
ad.fit(X_train, y_train)
y_pred = ad.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
# ROC/AUC need a continuous score. With hard 0/1 predictions the curve has a
# single operating point, so use the predicted probability of class 1 instead.
y_score = ad.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score)
plt.plot(fpr, tpr, color='#a2c816')
print(auc(fpr, tpr))

# GradientBoostingClassifier

In [None]:
gb.fit(X_train, y_train)
y_pred = gb.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
# ROC/AUC need a continuous score. With hard 0/1 predictions the curve has a
# single operating point, so use the predicted probability of class 1 instead.
y_score = gb.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score)
plt.plot(fpr, tpr, color='#a2c816')
print(auc(fpr, tpr))

# ExtraTreesClassifier

In [None]:
etc.fit(X_train, y_train)
y_pred = etc.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
# ROC/AUC need a continuous score. With hard 0/1 predictions the curve has a
# single operating point, so use the predicted probability of class 1 instead.
y_score = etc.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score)
plt.plot(fpr, tpr, color='#a2c816')
print(auc(fpr, tpr))

# DecisionTreeClassifier 

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
dt = DecisionTreeClassifier()

In [None]:
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
# ROC/AUC need a continuous score. With hard 0/1 predictions the curve has a
# single operating point, so use the predicted probability of class 1 instead.
y_score = dt.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score)
plt.plot(fpr, tpr, color='#a2c816')
print(auc(fpr, tpr))

# VotingClassifier

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
vote_hard = VotingClassifier(estimators=[('lr', lg), ('bnb', bnb), ('ad', ad), ('mnb', mnb)], voting='hard')

In [None]:
vote_hard.fit(X_train, y_train)

In [None]:
# NOTE(review): vote_hard is also fitted in the preceding cell, so this fit
# repeats that work when the notebook is run top to bottom.
vote_hard.fit(X_train, y_train)
y_pred = vote_hard.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
# The ROC below is built from hard 0/1 predictions, so it has a single
# operating point and the printed AUC is not a score-based ROC AUC.
# NOTE(review): hard voting presumably exposes no predict_proba - confirm
# before comparing this number with AUCs computed from continuous scores.
fpr, tpr, _ = roc_curve(y_test, y_pred)
plt.plot(fpr, tpr, color='#a2c816')
print(auc(fpr, tpr))

In [None]:
vote_soft = VotingClassifier(estimators=[('lr', lg), ('bnb', bnb), ('ad', ad), ('mnb', mnb)], voting='soft')

In [None]:
vote_soft.fit(X_train, y_train)
y_pred = vote_soft.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
# ROC/AUC need a continuous score. Soft voting averages class probabilities,
# so use the averaged probability of class 1 instead of hard 0/1 predictions.
y_score = vote_soft.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score)
plt.plot(fpr, tpr, color='#a2c816')
print(auc(fpr, tpr))

In [None]:
text = 'Not Good'
clean_data= text_cleaner(text)
clean_data
cv1=cv.transform([clean_data])

In [None]:
bnb.predict(cv1)

In [None]:
ad.predict(cv1)

In [None]:
mnb.predict(cv1)

In [None]:
lg.predict(cv1)

In [None]:
vote_soft.predict(cv1)

In [None]:
# import pickle

In [None]:
# file = open("F:ExcelR/Project 88/votingClassifier.pkl","wb")
# pickle.dump(vote_soft , file)

In [None]:
# file2 = open("F:ExcelR/Project 88/vector.pkl","wb")
# pickle.dump(cv , file2)