In [8]:
# Basic data manipulation and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn imports
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    cross_val_score,
    KFold
)
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    mean_squared_error,
    r2_score,
    roc_curve,
    roc_auc_score,
    classification_report,
    confusion_matrix
)

# Model imports
from sklearn.linear_model import (
    LinearRegression,
    LogisticRegression,
    Ridge,
    Lasso
)
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Imbalanced learning
from imblearn.over_sampling import SMOTE
from scipy.stats import ks_2samp
from scipy.stats import mannwhitneyu
from scipy import stats

In [2]:
num = pd.read_csv('rmpCapstoneNum.csv', header=None)
qual = pd.read_csv('rmpCapstoneQual.csv', header=None)
tag = pd.read_csv('rmpCapstoneTags.csv', header=None)

num_columns = [
    "Average Rating",            # The arithmetic mean of all individual quality ratings of this professor
    "Average Difficulty",        # The arithmetic mean of all individual difficulty ratings of this professor
    "Number of Ratings",         # Total number of ratings these averages are based on
    "Received a 'pepper'?",      # Boolean - judged as "hot" by the students
    "Proportion Retake",         # Proportion of students that said they would take the class again
    "Online Ratings Count",      # Number of ratings coming from online classes
    "Male Gender",               # Boolean – 1: determined with high confidence that professor is male
    "Female Gender"              # Boolean – 1: determined with high confidence that professor is female
]
num.columns = num_columns

qual_columns = [
    "Major/Field",  # Column 1: Major/Field
    "University",   # Column 2: University
    "US State"      # Column 3: US State (2-letter abbreviation)
]
qual.columns = qual_columns

tags_columns = [
    "Tough grader",              # Column 1
    "Good feedback",             # Column 2
    "Respected",                 # Column 3
    "Lots to read",              # Column 4
    "Participation matters",     # Column 5
    "Don't skip class",          # Column 6
    "Lots of homework",          # Column 7
    "Inspirational",             # Column 8
    "Pop quizzes!",              # Column 9
    "Accessible",                # Column 10
    "So many papers",            # Column 11
    "Clear grading",             # Column 12
    "Hilarious",                 # Column 13
    "Test heavy",                # Column 14
    "Graded by few things",      # Column 15
    "Amazing lectures",          # Column 16
    "Caring",                    # Column 17
    "Extra credit",              # Column 18
    "Group projects",            # Column 19
    "Lecture heavy"              # Column 20
]
tag.columns = tags_columns

In [3]:
# Ensure the datasets have the same number of records
assert len(num) == len(qual) == len(tag), "Datasets lengths do not match."

# Merge the datasets
merged_df = pd.concat([num, qual, tag], axis=1)

In [4]:
# Q1

In [None]:
# set the threshold to 5 ratings and exclude the professors with less than 5 ratings
print(len(merged_df))
merged_df = merged_df[merged_df['Number of Ratings'] >= 5]
print(len(merged_df))

merged_df = merged_df.dropna(subset='Average Rating')

89893
25368


In [7]:
female_rating = merged_df[merged_df['Male Gender'] == 0]['Average Rating']
male_rating = merged_df[merged_df['Male Gender'] == 1]['Average Rating']

In [9]:
# run welch's t-test
t_stat, p_value = stats.ttest_ind(male_rating, female_rating, 
                                 alternative='greater',  # one-sided test
                                 equal_var=False)  # Welch's t-test (unequal variances)

print(f"t-statistic: {t_stat:.4f}")
print(f"p-value: {p_value:.4f}")

# Print means for reference
print(f"\nMale rating mean: {male_rating.mean():.4f}")
print(f"Female rating mean: {female_rating.mean():.4f}")

t-statistic: 10.7285
p-value: 0.0000

Male rating mean: 3.9141
Female rating mean: 3.7866


In [10]:
# Q4
# For each tag column
for tag_column in tags_columns:
    # Create new normalized column
    merged_df[f'{tag_column}_normalized'] = merged_df[tag_column] / merged_df['Number of Ratings']
# create a list of normalized cols
normalized_columns = [f'{tag_column}_normalized' for tag_column in tags_columns]

In [19]:
result = []
for tags in normalized_columns:
    male = merged_df[merged_df['Male Gender'] == 1][tags]
    female = merged_df[merged_df['Male Gender'] == 0][tags]
    t_stat, p_value = stats.mannwhitneyu(male, female, 
                                 alternative='two-sided',  # one-sided test
                                 )  # Welch's t-test (unequal variances)
    result.append((tags, p_value))

In [20]:
# Sort the results by p-value
result.sort(key=lambda x: x[1])
result

[('Hilarious_normalized', 8.499323999196362e-206),
 ('Amazing lectures_normalized', 1.851816834600291e-44),
 ('Respected_normalized', 3.3533262258853186e-32),
 ('Lots of homework_normalized', 1.509977533642015e-25),
 ('Caring_normalized', 6.572070748455326e-23),
 ('Extra credit_normalized', 4.242282844390809e-22),
 ('Participation matters_normalized', 5.184174055707606e-21),
 ('Group projects_normalized', 2.242054641710866e-19),
 ('Graded by few things_normalized', 6.14348410119504e-14),
 ("Don't skip class_normalized", 1.9626175039896763e-10),
 ('Tough grader_normalized', 3.1725123597144213e-09),
 ('Lecture heavy_normalized', 1.830223017094474e-08),
 ('Inspirational_normalized', 1.3951632901818763e-05),
 ('Lots to read_normalized', 0.0019799102664822474),
 ('So many papers_normalized', 0.010550743119841204),
 ('Accessible_normalized', 0.034012681477221945),
 ('Good feedback_normalized', 0.034968507886676686),
 ('Clear grading_normalized', 0.03896696872117285),
 ('Pop quizzes!_normaliz