# Exploratory Data Analysis (EDA)
Project: **Bangla Product Review Sentiment Analysis**

Objectives of this notebook:
- Explore the raw data and gather statistics
- Explore the change after majority voting
- Word frequency analysis

## Load Dataset

In [1]:
import pandas as pd

In [2]:
df1 = pd.read_excel("data/data_1.xlsx")
df2 = pd.read_excel("data/data_2.xlsx")
df3 = pd.read_excel("data/data_3.xlsx")

In [3]:
df_mod = pd.read_excel("data/df_mod.xlsx")

## Output Class Ratio

In [4]:
print("\nDataset 1 Percent Value Count")
print(round(df1["sentiment"].value_counts(normalize=True)*100))
print("\nDataset 2 Percent Value Count")
print(round(df2["sentiment"].value_counts(normalize=True)*100))
print("\nDataset 3 Percent Value Count")
print(round(df3["sentiment"].value_counts(normalize=True)*100))
print("\nDataset Combined Percent Value Count")
print(round(df_mod["sentiment_majority"].value_counts(normalize=True)*100))
print("\nDataset 1 Value Count")
print(df1["sentiment"].value_counts())
print("\nDataset 2 Value Count")
print(df2["sentiment"].value_counts())
print("\nDataset 3 Value Count")
print(df3["sentiment"].value_counts())
print("\nDataset Combined Value Count")
print(df_mod["sentiment_majority"].value_counts())


Dataset 1 Percent Value Count
neutral     46.0
positive    39.0
negative    15.0
Name: sentiment, dtype: float64

Dataset 2 Percent Value Count
neutral     58.0
positive    30.0
negative    12.0
Name: sentiment, dtype: float64

Dataset 3 Percent Value Count
positive    50.0
neutral     32.0
negative    17.0
Name: sentiment, dtype: float64

Dataset Combined Percent Value Count
neutral     46.0
positive    41.0
negative    14.0
Name: sentiment_majority, dtype: float64

Dataset 1 Value Count
neutral     240
positive    206
negative     77
Name: sentiment, dtype: int64

Dataset 2 Value Count
neutral     303
positive    157
negative     63
Name: sentiment, dtype: int64

Dataset 3 Value Count
positive    264
neutral     169
negative     91
Name: sentiment, dtype: int64

Dataset Combined Value Count
neutral     241
positive    214
negative     73
Name: sentiment_majority, dtype: int64


## Word Frequency Analysis

In [5]:
import string
punc_list = string.punctuation
english_letters = string.ascii_letters
english_numbers = string.digits

In [6]:
punctuation_marks = list(set(['।', ',', ';', ':', '?', '!', "'", '.', '"', '-', '[', ']', '{', '}', '(', ')', '–', '—', '―', '~', ]
                        + list(punc_list)))
exclude_symbols = punctuation_marks + list(english_letters) + list(english_numbers)

In [7]:
def get_word_freq(sentiment, df):
    """
    This function will take dataframe and sentiment value and return dataframe with required stats such as
    word count, total word count and frequency
    Inputs: 
    sentiment = positive, neutral or negative
    df = full dataframe
    Outputs:
    df_word_freq = processed df with necessary stats
    """
    df_sentiment = df[(df['sentiment_majority'] == sentiment)]
    initial_concat_string = ' '.join(df_sentiment.review)
    for each_punctuation_mark in exclude_symbols:
        if each_punctuation_mark in initial_concat_string:
            initial_concat_string = initial_concat_string.replace(each_punctuation_mark, " ")
    word_freq = pd.DataFrame(pd.Series(initial_concat_string.split()).value_counts())
    word_freq.columns = ['frequency_'+sentiment]
    word_freq["word"] = word_freq.index
    word_freq = word_freq[["word", 'frequency_'+sentiment]]
    word_freq["frequency_"+sentiment] = word_freq["frequency_"+sentiment]/len(' '.join(df_sentiment.review).split())
    return word_freq

In [8]:
word_freq_positive = get_word_freq("positive", df_mod)
word_freq_positive.head(10)

Unnamed: 0,word,frequency_positive
ভালো,ভালো,0.019348
আমি,আমি,0.014925
খুব,খুব,0.014373
এবং,এবং,0.014373
সুন্দর,সুন্দর,0.012161
এর,এর,0.011056
একটি,একটি,0.011056
এই,এই,0.010503
১০,১০,0.00995
ধন্যবাদ,ধন্যবাদ,0.009397


In [9]:
word_freq_negative = get_word_freq("negative", df_mod)
word_freq_negative.head(10)

Unnamed: 0,word,frequency_negative
না,না,0.026521
ছিল,ছিল,0.01248
নষ্ট,নষ্ট,0.01248
আর,আর,0.01092
আমি,আমি,0.01092
কিন্তু,কিন্তু,0.00936
অনেক,অনেক,0.00936
নেই,নেই,0.0078
ও,ও,0.0078
করে,করে,0.0078


In [10]:
word_freq_neutral = get_word_freq("neutral", df_mod)
word_freq_neutral.head(10)

Unnamed: 0,word,frequency_neutral
না,না,0.015048
আমি,আমি,0.011856
আমার,আমার,0.010488
করে,করে,0.008664
আর,আর,0.008664
এর,এর,0.008664
এবং,এবং,0.007752
জন্য,জন্য,0.007296
হবে,হবে,0.00684
পারেন,পারেন,0.00684


In [11]:
word_freq_merged = word_freq_positive.merge(word_freq_negative, how = "outer").merge(word_freq_neutral, how = "outer")
word_freq_merged.head()

Unnamed: 0,word,frequency_positive,frequency_negative,frequency_neutral
0,ভালো,0.019348,0.00156,0.005016
1,আমি,0.014925,0.01092,0.011856
2,খুব,0.014373,0.00468,0.004104
3,এবং,0.014373,0.00468,0.007752
4,সুন্দর,0.012161,,0.000456


## Data Visualization

In [13]:
import plotly.express as px

In [14]:
import plotly.graph_objects as go
sentiment=['Positive', 'Negative', 'Neutral']

fig = go.Figure(data=[
    go.Bar(name='Dataset: 1', x=sentiment, y=[39, 15, 46]),
    go.Bar(name='Dataset: 2', x=sentiment, y=[30, 12, 58]),
    go.Bar(name='Dataset: 3', x=sentiment, y=[50, 17, 32]),
    go.Bar(name='Dataset: Majority Voting', x=sentiment, y=[41, 14, 46]),
])
# Change the bar mode
fig.update_layout(barmode='group')
fig.show()