<a href="https://colab.research.google.com/github/lebe1/text-oriented-data-science-project/blob/main/Data_Exploration_Sentemint.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment

## Contect to google drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
folder_path = '/content/drive/MyDrive/DOPP_Ex2_data/'
csv_path = '/content/drive/MyDrive/DOPP_Ex2_data/combined_reviews.csv'

## Imports

In [3]:
import pandas as pd
import json
import os
import nltk
from nltk.corpus import stopwords
import re
from tqdm import tqdm
import string
import plotly.express as px
from transformers import pipeline
import ast
import plotly.graph_objects as go

In [4]:
from tqdm import tqdm
tqdm.pandas()

## Load data

In [5]:
df = pd.read_csv(csv_path)

In [6]:
df.head(5)

Unnamed: 0,rating,reviewTime,reviewerID,reviewText,summary,unixReviewTime,category,reviewToken
0,5.0,2017-01-16,ASWLL1VJA7WOG,Great product... just what I wanted. Works gr...,Five Stars,1484524800,All_Beauty,"['great', 'product', 'want', 'works', 'great',..."
1,5.0,2008-12-08,A265K3A7V83112,"After seeing the popularity of this shoe, I de...",What can i say? chucks rock,1228694400,Clothing_Shoes_and_Jewelry,"['see', 'popularity', 'shoe', 'decide', 'test'..."
2,5.0,2013-02-08,A1D18EJF6LHYDV,I was nervousness about the scent because IVe ...,Smells great,1360281600,All_Beauty,"['nervousness', 'scent', 'ive', 'never', 'try'..."
3,5.0,2018-02-15,A25EOTX5I354I2,"I LOVE the smell. A bit expensive, so I cant b...",Five Stars,1518652800,Luxury_Beauty,"['love', 'smell', 'bit', 'expensive', 'buy', '..."
4,5.0,2013-11-11,A1DFZPQPCHBYTY,Found this stuff in Japan and wondered if I co...,Super lathery nice soap!,1384128000,All_Beauty,"['found', 'stuff', 'japan', 'wonder', 'could',..."


## Sentiment Analysis

In [7]:
from textblob import TextBlob

def analyze_with_textblob(tokens):
  return TextBlob(str(tokens)).sentiment.polarity

In [8]:
df['sentiment_score_text_blob'] = df['reviewText'].progress_apply(analyze_with_textblob)

100%|██████████| 12000/12000 [00:07<00:00, 1608.79it/s]


In [9]:
df[['reviewText','rating', 'sentiment_score_text_blob']].head(5)

Unnamed: 0,reviewText,rating,sentiment_score_text_blob
0,Great product... just what I wanted. Works gr...,5.0,0.75
1,"After seeing the popularity of this shoe, I de...",5.0,0.393269
2,I was nervousness about the scent because IVe ...,5.0,0.511111
3,"I LOVE the smell. A bit expensive, so I cant b...",5.0,0.0
4,Found this stuff in Japan and wondered if I co...,5.0,0.333333


In [10]:
fig = px.box(df, x="rating", y="sentiment_score_text_blob", color="rating", facet_col="category")
fig.show()

In [11]:
fig = px.box(df, x="category", y="sentiment_score_text_blob", color="category", facet_col="rating")
fig.show()

In [12]:
df['reviewTime'] = pd.to_datetime(df['reviewTime'])
df = df[(df['reviewTime'] >= '2016-01-01') & (df['reviewTime'] <= '2018-12-31')]
df['month'] = df['reviewTime'].dt.to_period('M').dt.to_timestamp()

summary_stats = df.groupby(['month', 'category']).agg(
    mean_sentiment=('sentiment_score_text_blob', 'mean'),
    std_dev=('sentiment_score_text_blob', 'std')
).reset_index()

summary_stats



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,month,category,mean_sentiment,std_dev
0,2016-01-01,All_Beauty,0.533733,0.349716
1,2016-01-01,Amazon_Fashion,0.040264,0.113851
2,2016-01-01,Clothing_Shoes_and_Jewelry,0.353529,0.298267
3,2016-01-01,Luxury_Beauty,0.250302,0.289401
4,2016-02-01,All_Beauty,0.452919,0.326995
...,...,...,...,...
117,2018-06-01,Amazon_Fashion,0.311477,0.306455
118,2018-06-01,Luxury_Beauty,0.272538,0.219270
119,2018-07-01,Amazon_Fashion,0.507143,0.196564
120,2018-07-01,Luxury_Beauty,0.031031,0.185305


In [13]:
summary_stats['max_std'] = summary_stats['mean_sentiment'] + summary_stats['std_dev']
summary_stats['min_std'] = summary_stats['mean_sentiment'] - summary_stats['std_dev']

for category in summary_stats['category'].unique():
    category_data = summary_stats[summary_stats['category'] == category]

    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=category_data['month'],
        y=category_data['mean_sentiment'],
        mode='lines+markers',
        name=f"{category} - Mean Sentiment",
        line=dict(width=2, color='blue'),
    ))

    fig.add_trace(go.Scatter(
        x=pd.concat([category_data['month'], category_data['month'][::-1]]),
        y=pd.concat([category_data['max_std'], category_data['min_std'][::-1]]),
        fill='toself',
        fillcolor='rgba(135, 206, 250, 0.3)',
        line=dict(width=0),
        hoverinfo="skip",
        showlegend=False,
    ))

    fig.update_layout(
        title=f"Sentiment Over Time for Category: {category}",
        xaxis_title="Month",
        yaxis_title="Sentiment Score",
        yaxis_range=[-1, 1],
        hovermode="x unified",
        template="plotly_white"
    )

    fig.show()

In [14]:
df['month'] = df['reviewTime'].dt.month_name()

fig = px.box(
    df,
    x='month',
    y='sentiment_score_text_blob',
    color='category',
    title="Boxplot of Sentiment Values by Month",
    labels={'month': 'Month', 'sentiment_score_text_blob': 'Sentiment Score'}
)

fig.update_layout(
    xaxis_categoryorder='array',
    xaxis_categoryarray=['January', 'February', 'March', 'April', 'May', 'June',
                         'July', 'August', 'September', 'October', 'November', 'December'],
    yaxis_range=[-1.2, 1.2]
)

fig.show()

In [15]:
df['month'] = pd.Categorical(df['month'],
                             categories=['January', 'February', 'March', 'April', 'May', 'June',
                                         'July', 'August', 'September', 'October', 'November', 'December'],
                             ordered=True)

for category in df['category'].unique():
    category_data = df[df['category'] == category]

    mean_sentiment_df = category_data.groupby('month')['sentiment_score_text_blob'].mean().reset_index()

    fig = px.box(
        category_data,
        x='month',
        y='sentiment_score_text_blob',
        title=f"Boxplot of Sentiment Values by Month for Category: {category}",
        labels={'month': 'Month', 'sentiment_score_text_blob': 'Sentiment Score'}
    )

    fig.add_scatter(
        x=mean_sentiment_df['month'],
        y=mean_sentiment_df['sentiment_score_text_blob'],
        mode='lines+markers',
        name='Mean Sentiment',
        line=dict(color='black', width=2)
    )


    fig.update_layout(
        xaxis_categoryorder='array',
        xaxis_categoryarray=['January', 'February', 'March', 'April', 'May', 'June',
                             'July', 'August', 'September', 'October', 'November', 'December']
    )

    fig.show()

















In [16]:
df['month'] = pd.Categorical(df['month'],
                             categories=['January', 'February', 'March', 'April', 'May', 'June',
                                         'July', 'August', 'September', 'October', 'November', 'December'],
                             ordered=True)

for category in df['category'].unique():
    category_data = df[df['category'] == category]

    mean_sentiment_df = category_data.groupby('month')['sentiment_score_text_blob'].mean().reset_index()
    mean_rating_df = category_data.groupby('month')['rating'].mean().reset_index()

    fig = px.box(
        category_data,
        x='month',
        y='sentiment_score_text_blob',
        title=f"Boxplot of Sentiment Values by Month for Category: {category}",
        labels={'month': 'Month', 'sentiment_score_text_blob': 'Sentiment Score'}
    )

    fig.add_scatter(
        x=mean_sentiment_df['month'],
        y=mean_sentiment_df['sentiment_score_text_blob'],
        mode='lines+markers',
        name='Mean Sentiment',
        line=dict(color='black', width=2)
    )

    fig.add_scatter(
        x=mean_rating_df['month'],
        y=mean_rating_df['rating'],
        mode='lines+markers',
        name='Mean Rating',
        line=dict(color='red', width=2)
    )

    fig.update_layout(
        xaxis_categoryorder='array',
        xaxis_categoryarray=['January', 'February', 'March', 'April', 'May', 'June',
                             'July', 'August', 'September', 'October', 'November', 'December']
    )

    fig.show()

























## Save data

In [17]:
output_path = '/content/drive/MyDrive/DOPP_Ex2_data/reviews_sentiment.csv'
df.to_csv(output_path, index=False)

## Transformer

In [18]:
stop

NameError: name 'stop' is not defined

In [None]:
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english"
)

In [None]:
def analyze_with_transformer(text):
    if not isinstance(text, str):
        return {'label': None, 'score': None}

    max_length = 512
    truncated_text = text[:max_length]

    try:
        result = sentiment_analyzer(truncated_text)[0]
    except Exception as e:
        print(f"Error processing text: {e}")
        return {'label': None, 'score': None}

    return result

In [None]:
df['sentiment_transformer'] = df['reviewText'].progress_apply(analyze_with_transformer)

In [None]:
labels = []
scores = []

for sentiment in df['sentiment_transformer']:

    if isinstance(sentiment, list) and len(sentiment) > 0 and isinstance(sentiment[0], dict):
        sentiment_dict = sentiment[0]
        labels.append(sentiment_dict.get('label', None))
        scores.append(sentiment_dict.get('score', None))

    else:
        labels.append(None)
        scores.append(None)

df['sentiment_label_transformer'] = labels
df['sentiment_score_transformer'] = scores

In [None]:
df[['reviewToken','reviewText','rating', 'sentiment_label_transformer', 'sentiment_score_transformer','sentiment_transformer']].head(15)

In [None]:
fig = px.box(df, x="category", y="rating", color="sentiment_label_transformer", title="Rating Boxplot by Category and Sentiment")
fig.show()

In [None]:
output_path = '/content/drive/MyDrive/DOPP_Ex2_data/reviews_sentiment.csv'
df.to_csv(output_path, index=False)