<a href="https://colab.research.google.com/github/kumpaten/masters-thesis-code/blob/main/GlassdoorData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so files stored on Drive can be
# addressed by path under that directory.
# NOTE(review): google.colab is presumably only available inside a Colab
# runtime — confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd

# Read the raw Glassdoor reviews export from the mounted Google Drive
df = pd.read_csv('/content/drive/My Drive/glassdoor_reviews.csv')

# Keep only the reviews that belong to the ten firms under study
target_firms = [
    "Apple", "Microsoft", "Google", "Cisco-Systems", "Oracle",
    "IBM", "SAP", "Facebook", "Accenture", "Salesforce"
]
is_target = df['firm'].isin(target_firms)
df_target = df.loc[is_target].copy()

# Parse the review date (unparseable values become NaT) and derive the year
df_target['date_review'] = pd.to_datetime(df_target['date_review'], errors='coerce')
df_target['Year'] = df_target['date_review'].dt.year

# Rating columns that are averaged per firm and year
numeric_cols = [
    'overall_rating',
    'work_life_balance',
    'culture_values',
    'diversity_inclusion',
    'career_opp',
    'comp_benefits',
    'senior_mgmt'
]

# Build the firm/year grouping once and reuse it for both statistics
group_keys = ['firm', 'Year']
by_firm_year = df_target.groupby(group_keys)

# Mean of every rating column within each firm/year group
aggregated_means = by_firm_year[numeric_cols].mean().reset_index()

# Number of rows (i.e., reviews) within each firm/year group
row_counts = by_firm_year.size().reset_index(name='row_count')

# Attach the row counts to the means, matching on firm and year
result = aggregated_means.merge(row_counts, on=group_keys)

# Show the combined table
print(result)


In [None]:
import pandas as pd

# Count the non-null overall_rating values (i.e., reviews) per firm and year
ratings_by_firm_year = df_target.groupby(['firm', 'Year'])['overall_rating']
review_counts = ratings_by_firm_year.count().reset_index(name='review_count')

# Long-format listing of the counts
print("Number of reviews for each company and each year:")
print(review_counts)

# Wide-format view: one row per firm, one column per year
counts_indexed = review_counts.set_index(['firm', 'Year'])['review_count']
review_pivot = counts_indexed.unstack('Year')
print("\nPivot table of review counts:")
print(review_pivot)


In [None]:
import pandas as pd
from google.colab import files

# Per firm and year: mean overall rating and number of non-null ratings
overall_by_firm_year = df_target.groupby(['firm', 'Year'])['overall_rating']
summary_df = overall_by_firm_year.agg(
    avg_overall_rating='mean',
    review_count='count',
).reset_index()

# Preview the first rows of the summary table
print(summary_df.head())

In [None]:
import pandas as pd
import re
from google.colab import files

# 1. Add rows for Facebook from 2008 to 2011 if missing.
#    A missing year gets a placeholder row with 0 reviews; its
#    avg_overall_rating of 0.0 is a filler value, not an observed rating.
company_to_fix = "Facebook"
missing_years = range(2008, 2012)  # 2008..2011 inclusive

existing_years = set(
    summary_df.loc[summary_df["firm"] == company_to_fix, "Year"].tolist()
)
placeholder_rows = [
    {
        "firm": company_to_fix,
        "Year": year,
        "avg_overall_rating": 0.0,
        "review_count": 0,
    }
    for year in missing_years
    if year not in existing_years
]
if placeholder_rows:
    summary_df = pd.concat(
        [summary_df, pd.DataFrame(placeholder_rows)], ignore_index=True
    )

# 2. Sort by company and year (optional, for clarity)
summary_df = summary_df.sort_values(["firm", "Year"], ignore_index=True)

# 3. Compute a company-specific average rating for each company.
#    This will serve as the prior (m_c) for that company.
#    Only years that actually have reviews contribute: the 0.0 placeholder
#    rows added above would otherwise drag the prior towards zero and bias
#    every smoothed rating of that company. A company with no reviewed year
#    at all gets a NaN prior (and therefore NaN smoothed ratings).
has_reviews = summary_df["review_count"] > 0
summary_df["firm_avg"] = (
    summary_df["avg_overall_rating"]
    .where(has_reviews)
    .groupby(summary_df["firm"])
    .transform("mean")
)

# 4. Choose a prior count C (how strongly we trust the company's average)
C = 50

# 5. Apply the Bayesian smoothing formula (company-specific)
#    smoothed_rating_{c,y} = ((review_count_{c,y} * avg_overall_rating_{c,y}) + (C * m_c)) / (review_count_{c,y} + C)
#    Rows without reviews contribute 0 to the numerator (their average may
#    be the 0.0 filler or NaN), so their smoothed rating equals the prior.
rating_sum = (
    summary_df["review_count"] * summary_df["avg_overall_rating"]
).where(has_reviews, 0.0)
summary_df["smoothed_rating"] = (
    (rating_sum + C * summary_df["firm_avg"]) /
    (summary_df["review_count"] + C)
)

# 6. Save the updated DataFrame to CSV in the current working directory
summary_df.to_csv("Glassdoor_yearly_review_summary_updated.csv", index=False)

print("The DataFrame now has missing Facebook rows added, is sorted, and uses company-specific smoothed ratings.")

# Display the first few rows of the summary
print(summary_df.head())

# Also save a copy of the summary to Google Drive
output_path = '/content/drive/My Drive/Glassdoor_yearly_review_summary_updated.csv'
summary_df.to_csv(output_path, index=False)