In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from scipy import stats

In [None]:
# Pearson's Correlation Hypothesis Testing
# H0: There is no correlation between Amazon product ratings and their sentiment scores (correlation coefficient = 0)
# HA: There is a correlation between Amazon product ratings and their sentiment scores (correlation coefficient ≠ 0)

In [None]:
# Read the reviews CSV into `df`, then print the frame as plain text
df = pd.read_csv(filepath_or_buffer='amazon_reviews_compound_scores.csv')
print(df)

      Unnamed: 0  reviewerName  overall  \
0              0           NaN      4.0   
1              1          0mie      5.0   
2              2           1K3      4.0   
3              3           1m2      5.0   
4              4  2&amp;1/2Men      5.0   
...          ...           ...      ...   
4910        4910        ZM "J"      1.0   
4911        4911            Zo      5.0   
4912        4912     Z S Liske      5.0   
4913        4913      Z Taylor      5.0   
4914        4914           Zza      5.0   

                                             reviewText  reviewTime  day_diff  \
0                                            No issues.  2014-07-23       138   
1     Purchased this for my device, it worked as adv...  2013-10-25       409   
2     it works as expected. I should have sprung for...  2012-12-23       715   
3     This think has worked out great.Had a diff. br...  2013-11-21       382   
4     Bought it with Retail Packaging, arrived legit...  2013-07-13       513 

In [None]:
# Pearson correlation between the 'compound' and 'overall' columns.
# pearsonr's result unpacks into the coefficient followed by the p-value.
sentiment_scores = df['compound']
star_ratings = df['overall']
correlation_coefficient, p_value = stats.pearsonr(sentiment_scores, star_ratings)
print(correlation_coefficient, p_value)

0.3598429979836059 3.609849358229258e-150


In [None]:
# The correlation coefficient is about 0.36, indicating a weak positive linear relationship between the two variables.
# The p-value (about 3.6e-150) is far below the significance level of 0.05, so we reject the null hypothesis.
# Therefore, we have enough evidence to conclude that Amazon product ratings and customers' sentiment scores
# are related, although the relationship is weak, as indicated by the correlation coefficient.

In [None]:
import pandas as pd
from scipy.stats import chi2_contingency

In [None]:
# Chi Square Test
# Research question: Are there specific words in the review text that are more commonly associated with different rating levels?
# H0: There is no association between commonly used words and Amazon product ratings
# HA: There is an association between commonly used words and Amazon product ratings


In [None]:
# Hypothesis Testing for Chi Square
from scipy.stats import chi2_contingency

# Significance level used for the reject / fail-to-reject decision.
alpha = 0.05

# Keep only the reviews that have both a rating and processed text.
rated_reviews = df[['overall', 'processed_reviewText']].dropna()

# Build one row per (review, distinct word) pair.
# Whole, lower-cased tokens are compared rather than calling
# str.contains(word): that call interprets each word as a regular expression
# and matches substrings, so e.g. "far" would also be counted inside "farther".
# De-duplicating tokens per review keeps the cell meaning "number of reviews
# that contain the word", not "number of times the word was typed".
word_rating_pairs = (
    rated_reviews
    .assign(
        word=rated_reviews['processed_reviewText']
        .astype(str)
        .str.lower()
        .str.split()
        .map(lambda tokens: sorted(set(tokens)))
    )
    .explode('word')
    .dropna(subset=['word'])   # reviews whose text was empty / whitespace only
    .reset_index(drop=True)    # explode repeats index labels; make them unique again
)

# Contingency table: rows are words, columns are ratings, and each cell is the
# number of reviews with that rating that contain the word. A single crosstab
# pass replaces the per-word, per-rating scan of the whole text column.
contingency_table = pd.crosstab(word_rating_pairs['word'], word_rating_pairs['overall'])

# Kept for any later cell that expects the vocabulary as a list.
unique_words = contingency_table.index.tolist()

print("Contingency Table:")
print(contingency_table)

# Perform the Chi-Squared test
chi2, p, dof, expected = chi2_contingency(contingency_table)

# Print the results
print("\nChi-Squared Test Results:")
print(f"Chi2 Statistic: {chi2}")
print(f"P-value: {p}")
print(f"Degrees of Freedom: {dof}")
print("Expected Frequencies:")
print(expected)

# The chi-squared approximation is commonly considered reliable only when
# expected frequencies are at least 5; report how often that guideline fails
# so the p-value is not over-interpreted.
low_expected_share = (expected < 5).mean()
print(f"Share of cells with expected frequency < 5: {low_expected_share:.1%}")

if p < alpha:
    print("\nReject the null hypothesis")
else:
    print("\nFail to reject the null hypothesis")

Contingency Table:
              4.0  5.0  3.0  1.0  2.0
onlinewell      0    1    0    0    0
terms           1    6    0    1    0
falsely         0    1    0    0    0
far            50  296    6   18    5
housekeeping    0    1    0    0    0
...           ...  ...  ...  ...  ...
requiredthis    0    1    0    0    0
transmit        0    0    0    0    1
realised        1    0    0    0    0
naturally       0    1    0    2    0
conjunction     0    1    0    0    0

[9677 rows x 5 columns]

Chi-Squared Test Results:
Chi2 Statistic: 71111.6882332727
P-value: 0.0
Degrees of Freedom: 38704
Expected Frequencies:
[[0.1143088  0.74092682 0.03657296 0.08056099 0.02763043]
 [0.91447038 5.92741457 0.29258366 0.64448793 0.22104346]
 [0.1143088  0.74092682 0.03657296 0.08056099 0.02763043]
 ...
 [0.1143088  0.74092682 0.03657296 0.08056099 0.02763043]
 [0.34292639 2.22278046 0.10971887 0.24168297 0.0828913 ]
 [0.1143088  0.74092682 0.03657296 0.08056099 0.02763043]]

Reject the null hypothes

In [None]:
# The chi-squared statistic is very large (about 71,112 on 38,704 degrees of freedom), indicating a significant
# difference between the observed and expected frequencies in the contingency table. This suggests a statistically
# significant association between the words used in reviews and the Amazon product ratings, meaning that specific
# words tend to be associated with particular rating levels. This is further supported by the p-value, which is
# reported as 0.0. Since the p-value is less than the significance level of 0.05, we can reject the null
# hypothesis. Therefore, there is enough evidence to suggest that there is an association between specific words
# and Amazon product ratings. Caveat: many of the expected frequencies shown above are far below 5 (often below 1),
# so the chi-squared approximation is unreliable for this table and the result should be interpreted with caution.
