# Exploring Numeric Review Data

I'd like the employee review section to have a numeric summary of the source data at the top, then the subjective summary, then the links to the sources.

Design intentions:
- The numeric summary should not serve as a replacement for the sources (Glassdoor, Steam, etc)
- The user should be informed about potential sampling biases

Questions the data should answer:
- Is the company good or bad in general? Note: This is only answerable if there are reference points to compare against, which may not exist.
- Is the company improving or worsening?

Other considerations:
- Could I inject these summaries into the quote reducers to combat bias?

In [1]:
import glob

# Inputs:
data_folder = "../output/data"
website_folder = "../docs"

# Paths of all JSON files located directly inside data_folder
json_files = glob.glob(data_folder + "/*.json")
# print(json_files)

from unified import UnifiedResult
import json
from typing import Dict

# Parsed result of every data file, keyed by the path it was loaded from
results: Dict[str, UnifiedResult] = {}

for file_path in json_files:
    # Decode explicitly as UTF-8 so loading does not depend on the
    # platform's default locale encoding
    with open(file_path, 'r', encoding="utf-8") as file:
        data = json.load(file)
    # The JSON object's top-level keys are passed as keyword arguments
    results[file_path] = UnifiedResult(**data)

# print(results)

In [35]:
# Display the discovered data file paths (a notebook shows the value of a cell's last expression)
json_files

['../output/data/Center_CenterCard.json',
 '../output/data/Nabla.json',
 '../output/data/Ontra.json',
 '../output/data/Snap_Inc_.json',
 '../output/data/Vulcan_Inc_.json',
 '../output/data/Optimize_Health.json',
 '../output/data/Akili_Interactive_EndeavorRx.json',
 '../output/data/DoubleDown_Interactive.json',
 '../output/data/2K_Games.json',
 '../output/data/ThousandEyes.json',
 '../output/data/Imagine_Pediatrics.json',
 '../output/data/98point6.json',
 '../output/data/Verkada_Inc.json',
 '../output/data/Chewy.json',
 '../output/data/Qualtrics.json',
 '../output/data/Anduril.json',
 '../output/data/Block_Square.json',
 '../output/data/Pave_dev_Cashflow.json',
 '../output/data/Dolly.json',
 '../output/data/Hotel_Engine.json',
 '../output/data/Indeed.json',
 '../output/data/Hiya.json',
 '../output/data/Current.json',
 '../output/data/Veritone.json',
 '../output/data/Sudowrite.json',
 '../output/data/Built_In.json',
 '../output/data/Remitly.json',
 '../output/data/Adswerve.json',
 '../ou

In [40]:
# Every loaded result whose target company is the one used as the running example
example_results = [
    candidate
    for candidate in results.values()
    if candidate.target.company == "A Place for Mom"
]
# Use the first match; this raises IndexError when no data file matches
example = example_results[0]


In [41]:
# Display the selected example result for inspection
example



In [42]:
# Note: The customer experience results don't keep all of the raw data, so we can't easily generate what we want. The Glassdoor results have it though

In [66]:
from data_sources.glassdoor import GlassdoorResult
import numpy as np
from scipy import stats

# Could be useful: raw_reviews["lastReviewDateTime"]

def summarize_sampling_local(result: GlassdoorResult, alpha=0.05) -> str:
    """Summarize how the sampled Glassdoor reviews compare to the overall rating.

    The sampled reviews are deduplicated by ``reviewId``. Their mean overall
    rating is compared with the company-wide overall rating using a one-sample
    t-test, and the Pearson correlation between review date and rating is
    computed as a hint of whether ratings change over time.

    Args:
        result: Glassdoor data. Reads ``result.reviews`` (each review needs
            ``reviewId``, ``ratingOverall`` and a datetime ``reviewDateTime``)
            and ``result.raw_reviews["ratings"]`` (``overallRating`` and
            ``reviewCount``).
        alpha: Significance threshold applied to both p-values.

    Returns:
        A multi-line, human-readable text summary.

    Raises:
        ValueError: If fewer than two distinct reviews are available; neither
            statistic can be computed for such a sample.
    """
    # Deduplicate by review id (the last occurrence wins) and warn if anything was dropped
    unique_reviews = {review.reviewId: review for review in result.reviews}
    duplicate_count = len(result.reviews) - len(unique_reviews)
    if duplicate_count:
        print(f"Warning: {duplicate_count} duplicate reviews found, deduplicating")
    reviews = list(unique_reviews.values())

    # Fail early with a clear message instead of an opaque error from
    # min() or pearsonr further down
    if len(reviews) < 2:
        raise ValueError(
            f"Need at least 2 distinct reviews to summarize sampling, got {len(reviews)}"
        )

    sample_scores = np.array([review.ratingOverall for review in reviews])
    ratings = result.raw_reviews["ratings"]
    population_mean = ratings["overallRating"]
    # One-sample t-test: is the sample mean compatible with the overall rating?
    t_statistic, p_value = stats.ttest_1samp(sample_scores, population_mean)

    review_times = [review.reviewDateTime for review in reviews]
    min_date = min(review_times)
    max_date = max(review_times)

    # Dates as POSIX timestamps so they can be correlated numerically
    sample_dates = np.array([moment.timestamp() for moment in review_times])

    # Pearson (linear) correlation of dates and scores
    # NOTE(review): ratings are ordinal, so a rank correlation may fit better — confirm intent
    date_score_correlation, date_score_p_value = stats.pearsonr(sample_dates, sample_scores)

    return f"""
Overall stats
Mean: {population_mean:.1f}
Count: {ratings["reviewCount"]}
      
Sample stats
Mean: {sample_scores.mean():.1f}
Count: {len(sample_scores)}
Date range: {min_date.strftime('%Y-%m-%d')} to {max_date.strftime('%Y-%m-%d')}

Sample reliability
T-statistic: {t_statistic:.3f}
P-value: {p_value:.3f}
{"Sample is significantly different" if p_value < alpha else "Sample is not significantly different"}

Date-score correlation, from the sample
Correlation: {date_score_correlation:.3f}
P-value: {date_score_p_value:.3f}
{"Correlation is significant" if date_score_p_value < alpha else "Correlation is not significant"}
"""

# Try the summary on the example company's Glassdoor data
print(summarize_sampling_local(example.glassdoor_result))


Overall stats
Mean: 4.6
Count: 1320
      
Sample stats
Mean: 4.3
Count: 48
Date range: 2024-03-31 to 2024-10-07

Sample reliability
T-statistic: -1.495
P-value: 0.142
Sample is not significantly different

Date-score correlation, from the sample
Correlation: 0.237
P-value: 0.104
Correlation is not significant



In [50]:
# Inspect which top-level fields the raw Glassdoor payload provides
example.glassdoor_result.raw_reviews.keys()

dict_keys(['__typename', 'allReviewsCount', 'currentPage', 'filteredReviewsCount', 'lastReviewDateTime', 'numberOfPages', 'queryJobTitle', 'queryLocation', 'ratedReviewsCount', 'ratings', 'reviews', 'ratingCountDistribution'])

In [52]:
# Rating aggregates from the raw payload; summarize_sampling_local reads overallRating and reviewCount from here
example.glassdoor_result.raw_reviews["ratings"]

{'__typename': 'EmployerRatings',
 'businessOutlookRating': 0.93,
 'careerOpportunitiesRating': 4.6,
 'ceoRating': 0.97,
 'compensationAndBenefitsRating': 4.7,
 'cultureAndValuesRating': 4.7,
 'diversityAndInclusionRating': 4.7,
 'overallRating': 4.6,
 'ratedCeo': {'__typename': 'Ceo',
  'id': 1326324,
  'photoUrl({"size":"LARGE"})': 'https://media.glassdoor.com/people/sqll/240285/a-place-for-mom-ceo1715561313101.png',
  'name': 'Tatyana Zlotsky',
  'photoUrl({"size":"REGULAR"})': 'https://media.glassdoor.com/people/sql/240285/a-place-for-mom-ceo1715561313101.png',
  'title': 'CEO'},
 'recommendToFriendRating': 0.92,
 'reviewCount': 1320,
 'seniorManagementRating': 4.7,
 'workLifeBalanceRating': 4.7}

In [54]:
from core import init, Seed

# Call the project's init() from core before using the data-source modules below
init()

In [61]:
import data_sources.app_stores.steam as steam

# Target to look up; the positional arguments appear to be company and product — confirm against Seed.init
target = Seed.init("Singularity 6", "Palia", domain="singularity6.com")
# Locate the target's Steam store page
url = steam.find_steam_page(target)
# url

# Pull the Steam id out of the page URL
steam_id = steam.extract_steam_id(url)

# This will be near-instant if cached! Otherwise about 5 sec
steam_reviews = steam.get_reviews(steam_id, num_reviews=5000)
# NOTE: a bare name in the middle of a cell is not displayed by the notebook (only the last expression is)
steam_reviews

# Overall review totals for the game, used as the population the sample is compared against
review_summary_stats = steam.get_review_summary_stats(steam_id)
print(review_summary_stats)

# Compare the fetched sample with the overall totals using the library implementation
print(steam.summarize_sampling(steam_reviews, review_summary_stats))

num_reviews=1 review_score=8 review_score_desc='Very Positive' total_positive=8506 total_negative=1403 total_reviews=9909

Overall
- 85.8% positive
- Total: 9909

Sample
- 89.0% positive
- Total: 824
- Date range 2024-09-29 to 2024-10-30

Sample representativeness
- Chi-Square Statistic: 5.904
- p-value: 0.015
- The sample distribution is significantly different from the overall distribution



In [67]:
from typing import List
from collections import Counter
from datetime import datetime
import numpy as np
from scipy.stats import chi2_contingency

def summarize_sampling_local(reviews: List[steam.SteamReview], overall: steam.QuerySummary, alpha=0.05) -> str:
    """Describe how a sample of Steam reviews relates to the game's overall review totals.

    Builds a 2x2 contingency table (sample vs. overall, positive vs. negative)
    and runs a chi-square test on it, then computes the Pearson correlation
    between review creation time and the up/down vote.

    Args:
        reviews: Sampled reviews; each needs ``voted_up`` and a numeric
            ``timestamp_created``.
        overall: Overall totals; ``total_positive``, ``total_negative`` and
            ``total_reviews`` are read.
        alpha: p-value threshold below which the sample is called
            significantly different.

    Returns:
        A multi-line text report.
    """
    votes = [review.voted_up for review in reviews]
    vote_counts = Counter(votes)
    upvotes, downvotes = vote_counts[True], vote_counts[False]

    # Rows: sample, overall. Columns: positive, negative.
    contingency_table = np.array([
        [upvotes, downvotes],
        [overall.total_positive, overall.total_negative],
    ])
    chi2, p_value, _dof, _expected = chi2_contingency(contingency_table)

    significance = "significantly different" if p_value < alpha else "not significantly different"

    # Creation times are numeric timestamps; convert the extremes to local datetimes
    created_at = [review.timestamp_created for review in reviews]
    earliest = datetime.fromtimestamp(min(created_at))
    latest = datetime.fromtimestamp(max(created_at))

    # Correlation between when a review was written and whether it was positive
    date_score_correlation, date_score_p_value = stats.pearsonr(np.array(created_at), votes)

    return f"""
Overall
- {overall.total_positive / overall.total_reviews:.1%} positive
- Total: {overall.total_reviews}

Sample
- {upvotes / len(reviews):.1%} positive
- Total: {len(reviews)}
- Date range {earliest.strftime('%Y-%m-%d')} to {latest.strftime('%Y-%m-%d')}

Sample representativeness
- Chi-Square Statistic: {chi2:.3f}
- p-value: {p_value:.3f}
- The sample distribution is {significance} from the overall distribution

Date-score correlation
- Correlation: {date_score_correlation:.3f}
- p-value: {date_score_p_value:.3f}
"""

# Run the local variant on the Steam sample and totals fetched earlier
print(summarize_sampling_local(steam_reviews, review_summary_stats))


Overall
- 85.8% positive
- Total: 9909

Sample
- 89.0% positive
- Total: 824
- Date range 2024-09-29 to 2024-10-30

Sample representativeness
- Chi-Square Statistic: 5.904
- p-value: 0.015
- The sample distribution is significantly different from the overall distribution

Date-score correlation
- Correlation: 0.036
- p-value: 0.297



In [68]:
import data_sources.app_stores.apple as apple_app_store
from core import Seed

# Target given by a single name only
target = Seed.init("98point6")

# Find the App Store page and extract the app id from its URL
url = apple_app_store.find_app_store_page(target)
apple_app_store_id = apple_app_store.extract_apple_app_store_id(url)
print(apple_app_store_id)

# NOTE(review): the recorded run printed the id and then failed with
# "JSONDecodeError: Expecting value: line 1 column 1 (char 0)", presumably raised by this scrape call — confirm
apple_reviews = apple_app_store.scrape(apple_app_store_id)
apple_reviews

1157653928


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [69]:
import data_sources.app_stores.google_play as google_play

# Target given by a single name only
target = Seed.init("98point6")

# Find the Google Play page, extract the app id, then fetch app-level info and a sample of reviews
url = google_play.find_google_play_page(target)
google_play_id = google_play.extract_google_play_app_id(url)
app_info = google_play.scrape_app_info(google_play_id)
google_play_reviews = google_play.scrape_reviews(google_play_id)

# Compare the review sample with the app's overall ratings using the library implementation
print(google_play.summarize_sampling(app_info, google_play_reviews))


# 98point6
On-demand, 24/7, text-based care via secure, in-app messaging.

Overall
- 3.27
- 2888 ratings
- Approximate date range Jun 26, 2017 to Jun 5, 2024

Sample
- 1.83
- 100 reviews
- Date range 2022-01-10 to 2024-10-19

Sample representativeness
- K-S Statistic: 0.386
- p-value: 0.000
- The sample distribution is significantly different from the overall distribution



In [71]:
def summarize_sampling_local(app_info: google_play.GooglePlayAppInfo, reviews: List[google_play.GooglePlayReview], alpha=0.05) -> str:
    """Describe how a sample of Google Play reviews relates to the app's overall ratings.

    Runs a two-sample Kolmogorov-Smirnov test between the sampled review
    scores and the scores expanded from ``app_info.histogram``, and a Pearson
    correlation between review date and score (only its p-value is reported).

    Args:
        app_info: App-level data; ``title``, ``summary``, ``score``,
            ``ratings``, ``released``, ``lastUpdatedOn`` and ``histogram``
            are read.
        reviews: Sampled reviews; each needs ``score`` and a datetime ``at``.
        alpha: p-value threshold below which the sample is called
            significantly different.

    Returns:
        A multi-line text report.
    """
    scores = [review.score for review in reviews]
    sample_scores = np.array(scores)
    overall_scores = google_play.histogram_to_array(app_info.histogram)

    # Two-sample K-S test: sampled scores vs. the overall score distribution
    ks_statistic, p_value = stats.ks_2samp(sample_scores, overall_scores)
    significance = "significantly different" if p_value < alpha else "not significantly different"

    sample_mean = sum(scores) / len(reviews)

    review_dates = [review.at for review in reviews]
    reviews_min_date, reviews_max_date = min(review_dates), max(review_dates)

    # Correlate review time (as a POSIX timestamp) with score; the coefficient itself is not reported
    sample_dates = np.array([moment.timestamp() for moment in review_dates])
    _, date_score_p_value = stats.pearsonr(sample_dates, sample_scores)

    return f"""
# {app_info.title}
{app_info.summary}

Overall
- {app_info.score:.2f}
- {app_info.ratings} ratings
- Approximate date range {app_info.released} to {app_info.lastUpdatedOn}

Sample
- {sample_mean:.2f}
- {len(reviews)} reviews
- Date range {reviews_min_date.strftime('%Y-%m-%d')} to {reviews_max_date.strftime('%Y-%m-%d')}
- Date-score correlation: P={date_score_p_value:.3f}

Sample representativeness
- K-S Statistic: {ks_statistic:.3f}
- p-value: {p_value:.3f}
- The sample distribution is {significance} from the overall distribution
"""

# Run the local variant on the Google Play app info and review sample fetched earlier
print(summarize_sampling_local(app_info, google_play_reviews))


# 98point6
On-demand, 24/7, text-based care via secure, in-app messaging.

Overall
- 3.27
- 2888 ratings
- Approximate date range Jun 26, 2017 to Jun 5, 2024

Sample
- 1.83
- 100 reviews
- Date range 2022-01-10 to 2024-10-19
- Date-score correlation: P=0.776

Sample representativeness
- K-S Statistic: 0.386
- p-value: 0.000
- The sample distribution is significantly different from the overall distribution

