# Stage 4: Advanced Analysis
- Sentiment Analysis
- Text Classification

### Importing Libraries

In [1]:
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

from transformers import pipeline

### Files Import

In [2]:
data = pd.read_pickle('data_clean_summarized.pkl')
data.reset_index(drop=True, inplace=True)

data

Unnamed: 0,text,rating,summary
0,My boyfriend bought this for our house and ou...,1,
1,This is a good small microwave for someone in...,5,This is a good small microwave for someone in ...
2,This microwave is not powerful and heats food...,1,This microwave is not powerful and heats food ...
3,My mother has Macular Degeneration Disease Th...,5,My mother has Macular Degeneration Disease Thi...
4,This microwave has been barely used at all an...,1,This microwave has been barely used at all and...
...,...,...,...
3862,I love this thing mostly cuz it came with Alexa,4,I love this thing mostly cuz it came with Alexa
3863,This microwave has stopped working everytime ...,5,This microwave has stopped working everytime w...
3864,Alexa stopped working on the microwave a week...,1,Alexa stopped working on the microwave a week ...
3865,I received this as a Christmas gift and absol...,5,I received this as a Christmas gift and absolu...


In [3]:
# from Amazon website (https://www.amazon.com/AmazonBasics-Microwave-Small-Works-Alexa/dp/B07894S727)
specs = pd.read_csv("specs.csv")
specifications = list(specs["Specifications"])                  # extract specifications headers from specs.csv

# from Mr Right Ideas website (https://www.mrright.in/ideas/appliances/microwave/microwave-oven-components-and-their-functions/)
comps = pd.read_csv("comps.csv")
components = list(dict.fromkeys(comps["Components"]))           # extract component headers from comps.csv
comps_renamed = list(dict.fromkeys(comps["Comps_renamed"]))     # extract renamed components from comps.csv

# from wiki_scraper.py
wiki_adjusted = pd.read_csv("wiki_adjusted.csv")
wiki_adjusted = list(wiki_adjusted['Keywords'])                 # extract sustainability keywords from wiki_adjusted.csv

others = ['Packaging', 'Delivery']
specifications += others

data['comps_mention'] = 'na'

### Components Mentioned in Reviews
- With Sentiment Analysis

In [4]:
for i in range(data.shape[0]):
    for j in comps_renamed:
        if j in data['text'][i]:
            data['comps_mention'][i] = j

data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['comps_mention'][i] = j


Unnamed: 0,text,rating,summary,comps_mention
0,My boyfriend bought this for our house and ou...,1,,panel
1,This is a good small microwave for someone in...,5,This is a good small microwave for someone in ...,na
2,This microwave is not powerful and heats food...,1,This microwave is not powerful and heats food ...,na
3,My mother has Macular Degeneration Disease Th...,5,My mother has Macular Degeneration Disease Thi...,na
4,This microwave has been barely used at all an...,1,This microwave has been barely used at all and...,na
...,...,...,...,...
3862,I love this thing mostly cuz it came with Alexa,4,I love this thing mostly cuz it came with Alexa,na
3863,This microwave has stopped working everytime ...,5,This microwave has stopped working everytime w...,na
3864,Alexa stopped working on the microwave a week...,1,Alexa stopped working on the microwave a week ...,na
3865,I received this as a Christmas gift and absol...,5,I received this as a Christmas gift and absolu...,na


In [5]:
# Create quick lambda functions to find the polarity and subjectivity of each routine
from textblob import TextBlob

count_result = []
for comps in comps_renamed:
    count = [data[data.comps_mention == comps].shape[0]]
    count_result += count

pol = lambda x: TextBlob(x).sentiment.polarity
data['polarity'] = data['summary'].apply(pol)

pol_count  = lambda x: 1  if x > 0 else -1
data['pol_count'] = data['polarity'].apply(pol_count)
display(data)

sentiment_list = []
for comps in comps_renamed:
    data_count = data[data.comps_mention == comps]
    sen_ana = data_count['pol_count'].sum()
    if sen_ana > 0:
        sen_ana = 'positive'
    else:
        sen_ana = 'negative'
    sentiment_list += [sen_ana]
    # df_chart['sentiments'] = [df_chart['labels'] == comps]

# data['pol_count'].sum()

chart = {'labels': comps_renamed,
        'values': count_result,
        'sentiments': sentiment_list}

df_chart = pd.DataFrame(chart, columns=['labels','values', 'sentiments'])

df_chart

Unnamed: 0,text,rating,summary,comps_mention,polarity,pol_count
0,My boyfriend bought this for our house and ou...,1,,panel,0.000000,-1
1,This is a good small microwave for someone in...,5,This is a good small microwave for someone in ...,na,0.225000,1
2,This microwave is not powerful and heats food...,1,This microwave is not powerful and heats food ...,na,0.050000,1
3,My mother has Macular Degeneration Disease Th...,5,My mother has Macular Degeneration Disease Thi...,na,0.312500,1
4,This microwave has been barely used at all an...,1,This microwave has been barely used at all and...,na,-0.216667,-1
...,...,...,...,...,...,...
3862,I love this thing mostly cuz it came with Alexa,4,I love this thing mostly cuz it came with Alexa,na,0.500000,1
3863,This microwave has stopped working everytime ...,5,This microwave has stopped working everytime w...,na,0.000000,-1
3864,Alexa stopped working on the microwave a week...,1,Alexa stopped working on the microwave a week ...,na,0.000000,-1
3865,I received this as a Christmas gift and absol...,5,I received this as a Christmas gift and absolu...,na,0.243750,1


Unnamed: 0,labels,values,sentiments
0,transformer,0,negative
1,magnetron,1,negative
2,waveguide,3,negative
3,fan,26,positive
4,case,18,positive
5,turntable,42,positive
6,panel,51,positive
7,socket,1,positive
8,alexa,154,positive
9,door,123,positive


In [6]:
# Data check

data_selected = data[data.comps_mention == df_chart.labels[2]] # check number accordingly
data_selected.reset_index(drop=True, inplace=True)

data_selected

Unnamed: 0,text,rating,summary,comps_mention,polarity,pol_count
0,I had this microwave for less than a year and...,1,I had this microwave for less than a year and ...,waveguide,-0.059028,-1
1,Disclaimer We have had this microwave for abo...,1,Disclaimer We have had this microwave for abou...,waveguide,0.195833,1
2,Broke only after a few months of use It looks...,1,Broke only after a few months of use It looks ...,waveguide,-0.1,-1


### Sustainability Mentioned in Reviews
- With Sentiment Analysis

In [7]:
data['sus_mention'] = 'na'
wiki_adjusted = ['waste', 'eneryg', 'environmental', 'material']
for i in range(data.shape[0]):
    for j in wiki_adjusted:
        if j in data['text'][i]:
            data['sus_mention'][i] = j

data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['sus_mention'][i] = j


Unnamed: 0,text,rating,summary,comps_mention,polarity,pol_count,sus_mention
0,My boyfriend bought this for our house and ou...,1,,panel,0.000000,-1,na
1,This is a good small microwave for someone in...,5,This is a good small microwave for someone in ...,na,0.225000,1,na
2,This microwave is not powerful and heats food...,1,This microwave is not powerful and heats food ...,na,0.050000,1,na
3,My mother has Macular Degeneration Disease Th...,5,My mother has Macular Degeneration Disease Thi...,na,0.312500,1,na
4,This microwave has been barely used at all an...,1,This microwave has been barely used at all and...,na,-0.216667,-1,na
...,...,...,...,...,...,...,...
3862,I love this thing mostly cuz it came with Alexa,4,I love this thing mostly cuz it came with Alexa,na,0.500000,1,na
3863,This microwave has stopped working everytime ...,5,This microwave has stopped working everytime w...,na,0.000000,-1,na
3864,Alexa stopped working on the microwave a week...,1,Alexa stopped working on the microwave a week ...,na,0.000000,-1,na
3865,I received this as a Christmas gift and absol...,5,I received this as a Christmas gift and absolu...,na,0.243750,1,na


In [8]:
# Create quick lambda functions to find the polarity and subjectivity of each routine
from textblob import TextBlob

count_result = []
for sus in wiki_adjusted:
    count = [data[data.sus_mention == sus].shape[0]]
    count_result += count

pol = lambda x: TextBlob(x).sentiment.polarity
data['polarity'] = data['summary'].apply(pol)

pol_count  = lambda x: 1  if x > 0 else -1
data['pol_count'] = data['polarity'].apply(pol_count)
display(data)

sentiment_list = []
for sus in wiki_adjusted:
    data_count = data[data.sus_mention == sus]
    sen_ana = data_count['pol_count'].sum()
    if sen_ana > 0:
        sen_ana = 'positive'
    else:
        sen_ana = 'negative'
    sentiment_list += [sen_ana]
    # df_chart['sentiments'] = [df_chart['labels'] == comps]

# data['pol_count'].sum()

chart = {'labels': wiki_adjusted,
        'values': count_result,
        'sentiments': sentiment_list}

df_chart = pd.DataFrame(chart, columns=['labels','values', 'sentiments'])

df_chart

Unnamed: 0,text,rating,summary,comps_mention,polarity,pol_count,sus_mention
0,My boyfriend bought this for our house and ou...,1,,panel,0.000000,-1,na
1,This is a good small microwave for someone in...,5,This is a good small microwave for someone in ...,na,0.225000,1,na
2,This microwave is not powerful and heats food...,1,This microwave is not powerful and heats food ...,na,0.050000,1,na
3,My mother has Macular Degeneration Disease Th...,5,My mother has Macular Degeneration Disease Thi...,na,0.312500,1,na
4,This microwave has been barely used at all an...,1,This microwave has been barely used at all and...,na,-0.216667,-1,na
...,...,...,...,...,...,...,...
3862,I love this thing mostly cuz it came with Alexa,4,I love this thing mostly cuz it came with Alexa,na,0.500000,1,na
3863,This microwave has stopped working everytime ...,5,This microwave has stopped working everytime w...,na,0.000000,-1,na
3864,Alexa stopped working on the microwave a week...,1,Alexa stopped working on the microwave a week ...,na,0.000000,-1,na
3865,I received this as a Christmas gift and absol...,5,I received this as a Christmas gift and absolu...,na,0.243750,1,na


Unnamed: 0,labels,values,sentiments
0,waste,63,negative
1,eneryg,0,negative
2,environmental,1,positive
3,material,1,positive


In [9]:
# Data check

data_selected = data[data.comps_mention == df_chart.labels[2]] # check number accordingly
data_selected.reset_index(drop=True, inplace=True)

data_selected

Unnamed: 0,text,rating,summary,comps_mention,polarity,pol_count,sus_mention


## Zero-Shot-Classification

In [10]:
classifier = pipeline("zero-shot-classification", device=0)

No model was supplied, defaulted to facebook/bart-large-mnli (https://huggingface.co/facebook/bart-large-mnli)


### Specification Categorization
- With Sentiment Analysis 

In [11]:
display(specifications)
#Applying it into the dataframe base on original review
data['category'] = data['text'].apply(lambda x: classifier(x, candidate_labels=specifications))

data

['Power',
 'Interior',
 'Levels',
 'Dimensions',
 'Technology',
 'Weight',
 'Durability',
 'Packaging',
 'Delivery']



Unnamed: 0,text,rating,summary,comps_mention,polarity,pol_count,sus_mention,category
0,My boyfriend bought this for our house and ou...,1,,panel,0.000000,-1,na,{'sequence': ' My boyfriend bought this for ou...
1,This is a good small microwave for someone in...,5,This is a good small microwave for someone in ...,na,0.225000,1,na,{'sequence': ' This is a good small microwave ...
2,This microwave is not powerful and heats food...,1,This microwave is not powerful and heats food ...,na,0.050000,1,na,{'sequence': ' This microwave is not powerful ...
3,My mother has Macular Degeneration Disease Th...,5,My mother has Macular Degeneration Disease Thi...,na,0.312500,1,na,{'sequence': ' My mother has Macular Degenerat...
4,This microwave has been barely used at all an...,1,This microwave has been barely used at all and...,na,-0.216667,-1,na,{'sequence': ' This microwave has been barely ...
...,...,...,...,...,...,...,...,...
3862,I love this thing mostly cuz it came with Alexa,4,I love this thing mostly cuz it came with Alexa,na,0.500000,1,na,{'sequence': ' I love this thing mostly cuz it...
3863,This microwave has stopped working everytime ...,5,This microwave has stopped working everytime w...,na,0.000000,-1,na,{'sequence': ' This microwave has stopped work...
3864,Alexa stopped working on the microwave a week...,1,Alexa stopped working on the microwave a week ...,na,0.000000,-1,na,{'sequence': ' Alexa stopped working on the mi...
3865,I received this as a Christmas gift and absol...,5,I received this as a Christmas gift and absolu...,na,0.243750,1,na,{'sequence': ' I received this as a Christmas ...


In [12]:
#Replacing the category with the top matched category
for i in tqdm(range(data.shape[0])):
    if data['category'][i] not in specifications:
        data['category'][i] = data['category'][i].get('labels')[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['category'][i] = data['category'][i].get('labels')[0]
100%|██████████| 3867/3867 [00:30<00:00, 126.73it/s]


In [13]:
data_cat = data.copy()

In [14]:
candidate_results = list(0 for i in range(len(specifications)))


def count(category):
    for i in range(len(specifications)):
        if category == specifications[i]:
            candidate_results[i] = candidate_results[i] + 1

In [15]:
# Run the counting function
data_cat['category'].apply(lambda x: count(x))

candidate_results


[413, 688, 25, 68, 1995, 43, 530, 23, 82]

In [16]:
sentiment_list_func = []
for func in specifications:
    data_count = data[data.category == func]
    sen_ana = data_count['pol_count'].sum()
    if sen_ana > 0:
        sen_ana = 'positive'
    else:
        sen_ana = 'negative'
    sentiment_list_func += [sen_ana]
    # df_chart['sentiments'] = [df_chart['labels'] == comps]

# data['pol_count'].sum()

chart = {'labels': specifications,
        'values': candidate_results,
        'sentiments': sentiment_list_func}

df_chart = pd.DataFrame(chart, columns=['labels','values', 'sentiments'])

df_chart

Unnamed: 0,labels,values,sentiments
0,Power,413,positive
1,Interior,688,positive
2,Levels,25,negative
3,Dimensions,68,positive
4,Technology,1995,positive
5,Weight,43,positive
6,Durability,530,positive
7,Packaging,23,positive
8,Delivery,82,positive


### Sustainability Categorization 
- Sentiment Analysis included

In [17]:
#Categorying in terms of sustainability

#Applying it into the dataframe base on original review
data['sus_category'] = data['text'].apply(lambda x: classifier(x, candidate_labels=wiki_adjusted))

data



Unnamed: 0,text,rating,summary,comps_mention,polarity,pol_count,sus_mention,category,sus_category
0,My boyfriend bought this for our house and ou...,1,,panel,0.000000,-1,na,Technology,{'sequence': ' My boyfriend bought this for ou...
1,This is a good small microwave for someone in...,5,This is a good small microwave for someone in ...,na,0.225000,1,na,Technology,{'sequence': ' This is a good small microwave ...
2,This microwave is not powerful and heats food...,1,This microwave is not powerful and heats food ...,na,0.050000,1,na,Power,{'sequence': ' This microwave is not powerful ...
3,My mother has Macular Degeneration Disease Th...,5,My mother has Macular Degeneration Disease Thi...,na,0.312500,1,na,Technology,{'sequence': ' My mother has Macular Degenerat...
4,This microwave has been barely used at all an...,1,This microwave has been barely used at all and...,na,-0.216667,-1,na,Interior,{'sequence': ' This microwave has been barely ...
...,...,...,...,...,...,...,...,...,...
3862,I love this thing mostly cuz it came with Alexa,4,I love this thing mostly cuz it came with Alexa,na,0.500000,1,na,Technology,{'sequence': ' I love this thing mostly cuz it...
3863,This microwave has stopped working everytime ...,5,This microwave has stopped working everytime w...,na,0.000000,-1,na,Interior,{'sequence': ' This microwave has stopped work...
3864,Alexa stopped working on the microwave a week...,1,Alexa stopped working on the microwave a week ...,na,0.000000,-1,na,Technology,{'sequence': ' Alexa stopped working on the mi...
3865,I received this as a Christmas gift and absol...,5,I received this as a Christmas gift and absolu...,na,0.243750,1,na,Technology,{'sequence': ' I received this as a Christmas ...


In [18]:
#Replacing the category with the top matched category
for i in tqdm(range(data.shape[0])):
    if data['sus_category'][i] not in wiki_adjusted:
        data['sus_category'][i] = data['sus_category'][i].get('labels')[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['sus_category'][i] = data['sus_category'][i].get('labels')[0]
100%|██████████| 3867/3867 [00:30<00:00, 128.14it/s]


In [19]:
data_cat = data.copy()

candidate_results = list(0 for i in range(len(wiki_adjusted)))


def count(category):
    for i in range(len(wiki_adjusted)):
        if category == wiki_adjusted[i]:
            candidate_results[i] = candidate_results[i] + 1

In [20]:
# Run the counting function
data_cat['sus_category'].apply(lambda x: count(x))

candidate_results


[307, 2358, 5, 1197]

In [21]:
sentiment_list_func = []
for func in wiki_adjusted:
    data_count = data[data.category == func]
    sen_ana = data_count['pol_count'].sum()
    if sen_ana > 0:
        sen_ana = 'positive'
    else:
        sen_ana = 'negative'
    sentiment_list_func += [sen_ana]
    # df_chart['sentiments'] = [df_chart['labels'] == comps]

# data['pol_count'].sum()

chart = {'labels': wiki_adjusted,
        'values': candidate_results,
        'sentiments': sentiment_list_func}

df_chart = pd.DataFrame(chart, columns=['labels','values', 'sentiments'])

df_chart

Unnamed: 0,labels,values,sentiments
0,waste,307,negative
1,eneryg,2358,negative
2,environmental,5,negative
3,material,1197,negative


In [22]:
data.to_pickle('data_analysed.pkl')

data

Unnamed: 0,text,rating,summary,comps_mention,polarity,pol_count,sus_mention,category,sus_category
0,My boyfriend bought this for our house and ou...,1,,panel,0.000000,-1,na,Technology,eneryg
1,This is a good small microwave for someone in...,5,This is a good small microwave for someone in ...,na,0.225000,1,na,Technology,material
2,This microwave is not powerful and heats food...,1,This microwave is not powerful and heats food ...,na,0.050000,1,na,Power,material
3,My mother has Macular Degeneration Disease Th...,5,My mother has Macular Degeneration Disease Thi...,na,0.312500,1,na,Technology,material
4,This microwave has been barely used at all an...,1,This microwave has been barely used at all and...,na,-0.216667,-1,na,Interior,eneryg
...,...,...,...,...,...,...,...,...,...
3862,I love this thing mostly cuz it came with Alexa,4,I love this thing mostly cuz it came with Alexa,na,0.500000,1,na,Technology,material
3863,This microwave has stopped working everytime ...,5,This microwave has stopped working everytime w...,na,0.000000,-1,na,Interior,eneryg
3864,Alexa stopped working on the microwave a week...,1,Alexa stopped working on the microwave a week ...,na,0.000000,-1,na,Technology,material
3865,I received this as a Christmas gift and absol...,5,I received this as a Christmas gift and absolu...,na,0.243750,1,na,Technology,material
