# Validation of Statistical Method for TPIR Contestant Fairness

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import chi2, chisquare

import warnings
warnings.filterwarnings('ignore')

---
## Clean up the DataFrame and drop duplicate data

In [2]:
# Load the workbook and reshape it into one row per round.
# The sheet is read without a header and transposed, so the first transposed
# row holds the field names (date, first, second, third, fourth, retail).
raw = pd.read_excel("./price_is_right.xlsx", header=None).transpose()
raw.columns = raw.iloc[0]
raw = raw.drop(index=0).reset_index(drop=True)
# The leading column is discarded here
# (presumably a label column with no bid data -- confirm against the workbook).
raw = raw.drop(columns=raw.columns[0])
raw['date'] = pd.to_datetime(raw['date'])

# De-duplicate BEFORE moving 'date' into the index: DataFrame.drop_duplicates
# ignores the index, so de-duplicating afterwards would also discard distinct
# rounds from different dates that happen to share identical bids and retail.
df = (
    raw.drop_duplicates()
       .set_index('date')
       .astype(int)
)

# Independent snapshot of the cleaned data, kept for later row-count comparisons
og_df = df.copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 978 entries, 1972-09-04 to 1996-05-03
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   first   978 non-null    int32
 1   second  978 non-null    int32
 2   third   978 non-null    int32
 3   fourth  978 non-null    int32
 4   retail  978 non-null    int32
dtypes: int32(5)
memory usage: 26.7 KB


---

## Drop rounds where contestants all overbid

In [3]:
# Flag the rounds in which every one of the contestants' bids exceeded 'retail'
bids_only = df.drop(columns='retail')
went_over = bids_only.gt(df['retail'], axis=0)
mask = went_over.all(axis=1)
len(df[mask])

25

There are 25 rounds in which all four contestants overbid. Let's drop every round recorded on those dates.

In [4]:
# Index labels (dates) of the rounds flagged by `mask`
overbid_dates = df.index[mask.to_numpy()]
# The drop is label-based: every row whose date matches a flagged date is
# removed, not only the flagged rows themselves.
df = df.drop(index=overbid_dates)
len(df)

929

In [5]:
# Number of rounds removed by the overbid filter: cleaned rows minus remaining rows
og_df.shape[0] - df.shape[0]

49

## Calculations to Determine Winner

In [6]:
# Gap between the retail price and each bid, computed as retail - bid.
# A positive gap means the bid was under retail; a negative gap is an overbid.
bid_columns = ['first', 'second', 'third', 'fourth']
diffs = -(df[bid_columns].sub(df['retail'], axis=0))
diffs.head()

Unnamed: 0_level_0,first,second,third,fourth
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1972-09-04,245,-305,445,-405
1981-02-12,470,445,195,444
1991-11-14,570,200,550,500
1986-09-30,447,599,664,349
1992-12-01,1001,1382,949,948


In [7]:
# Keep only the non-negative gaps; overbids (negative gaps) become missing values
is_valid_bid = diffs >= 0
valid_diffs = diffs.where(is_valid_bid, other=pd.NA)
valid_diffs.head()

Unnamed: 0_level_0,first,second,third,fourth
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1972-09-04,245.0,,445.0,
1981-02-12,470.0,445.0,195.0,444.0
1991-11-14,570.0,200.0,550.0,500.0
1986-09-30,447.0,599.0,664.0,349.0
1992-12-01,1001.0,1382.0,949.0,948.0


---
## Number of Winners on Highest & Lowest Bid

In [8]:
# The winner of each round is the position with the smallest remaining
# (non-overbid) gap to the retail price
winners = valid_diffs.idxmin(axis='columns')
winners.head()

date
1972-09-04     first
1981-02-12     third
1991-11-14    second
1986-09-30    fourth
1992-12-01    fourth
dtype: object

In [9]:
# Number of wins per bidding position, listed in bidding order
position_order = ['first', 'second', 'third', 'fourth']
wins_by_position = winners.value_counts()
winners_count = wins_by_position.reindex(position_order)
winners_count

first     169
second    185
third     212
fourth    363
dtype: int64

In [10]:
# Position that placed the highest bid in each round
all_bids = df.drop(columns='retail')
highest_bidder = all_bids.idxmax(axis=1)
# True for the rounds in which that highest bidder was also the winner
highest_bid_winner = (highest_bidder == winners)
highest_bid_winner.value_counts()

True     595
False    334
dtype: int64

In [11]:
# Count the rounds won by the highest bidder. Summing the boolean flags yields 0
# when there are none, whereas value_counts().loc[True] raises KeyError then.
num_highest_bid_winners = highest_bid_winner.sum()

In [12]:
# Position that placed the lowest bid in each round
all_bids = df.drop(columns='retail')
lowest_bidder = all_bids.idxmin(axis=1)
# True for the rounds in which that lowest bidder was also the winner
lowest_bid_winner = (lowest_bidder == winners)
lowest_bid_winner.value_counts()

False    850
True      79
dtype: int64

In [13]:
# Count the rounds won by the lowest bidder. Summing the boolean flags yields 0
# when there are none, whereas value_counts().loc[True] raises KeyError then.
num_lowest_bid_winners = lowest_bid_winner.sum()

In [14]:
# Report how often a round was won by its highest and by its lowest bidder,
# as a count and as a fraction of the rounds remaining in df
total_rounds = len(df)
highest_share = num_highest_bid_winners / total_rounds
lowest_share = num_lowest_bid_winners / total_rounds

print(f'Number of Highest Bid Winners: {num_highest_bid_winners}')
print(f'Pct of Highest Bid Winners: {highest_share:.2f}')
print(f'Number of Lowest Bid Winners: {num_lowest_bid_winners}')
print(f'Pct of Lowest Bid Winners: {lowest_share:.2f}')

Number of Highest Bid Winners: 595
Pct of Highest Bid Winners: 0.64
Number of Lowest Bid Winners: 79
Pct of Lowest Bid Winners: 0.09


---

## Pearson chi-squared goodness-of-fit test using SciPy

- Null hypothesis: each player has a probability of 0.25 of winning
- 3 degrees of freedom
- 1% significance level
- Compared to the critical value of 11.345

In [15]:
# Pearson chi-squared goodness-of-fit test of "every position wins equally often".
# chisquare and chi2 come from scipy.stats, imported in the notebook's import cell.

# Observed wins per position in bidding order; a position with no wins counts
# as 0 instead of NaN so the test statistic stays defined.
positions = ['first', 'second', 'third', 'fourth']
observed = winners.value_counts().reindex(positions, fill_value=0)

# Expected frequencies under the null hypothesis: an equal share per position.
# The category count is derived from the data so it cannot drift from `positions`.
total = observed.sum()
n_categories = len(observed)
expected = [float(total / n_categories)] * n_categories

# Run chi-squared test
chi2_stat, p_value = chisquare(f_obs=observed, f_exp=expected)

print("Observed:", observed.values)
print("Expected:", expected)
print(f"Chi-squared statistic: {chi2_stat:.3f}")
print(f"p-value: {p_value:.3e}")

# Significance level (e.g., 0.01 for 99% confidence)
alpha = 0.01
# Degrees of freedom: number of categories minus one
dof = n_categories - 1
# Critical value (right-tail)
critical_value = chi2.ppf(1 - alpha, dof)
print(f"Critical value at α = {alpha} and dof = {dof}: {critical_value:.3f}")

Observed: [169 185 212 363]
Expected: [232.25, 232.25, 232.25, 232.25]
Chi-squared statistic: 102.212
p-value: 5.198e-22
Critical value at α = 0.01 and dof = 3: 11.345


> Since the Pearson chi-squared statistic of 102.212 exceeds the critical value of 11.345, we reject the null hypothesis at the 1% significance level: the four bidding positions are not equally likely to win.

---
## Replacing the final bid with previous highest bid + 1

In [16]:
# Counterfactual: the fourth bid becomes $1 above the highest of the three
# earlier bids. This overwrites the recorded fourth bids in df, so the cells
# above must not be re-run after this one without rebuilding df.
earlier_bids = df[['first', 'second', 'third']]
df['fourth'] = earlier_bids.max(axis=1) + 1
df.head()

Unnamed: 0_level_0,first,second,third,fourth,retail
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1972-09-04,350,900,150,901,595
1981-02-12,625,650,900,901,1095
1991-11-14,1130,1500,1150,1501,1700
1986-09-30,1002,850,785,1003,1449
1992-12-01,950,569,1002,1003,1951


In [17]:
# Gap between the retail price and each bid (retail - bid), now using the
# replaced fourth bid
bid_columns = ['first', 'second', 'third', 'fourth']
diffs_2 = -(df[bid_columns].sub(df['retail'], axis=0))
# Blank out overbids (negative gaps), then discard the rounds in which no bid
# stayed at or under retail
valid_diffs2 = diffs_2.where(diffs_2 >= 0, other=pd.NA).dropna(how='all')
valid_diffs2.head()

Unnamed: 0_level_0,first,second,third,fourth
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1972-09-04,245.0,,445.0,
1981-02-12,470.0,445.0,195.0,194.0
1991-11-14,570.0,200.0,550.0,199.0
1986-09-30,447.0,599.0,664.0,446.0
1992-12-01,1001.0,1382.0,949.0,948.0


In [18]:
# Winner of each counterfactual round: the position with the smallest valid gap
winners2 = valid_diffs2.idxmin(axis=1)

# Wins per position in bidding order
positions = ['first', 'second', 'third', 'fourth']
observed2 = winners2.value_counts().reindex(positions)

# The denominator is taken from the data instead of a hard-coded 901, so the
# percentages stay correct if the number of valid rounds ever changes.
total_rounds = len(winners2)

print('Changing all Fourth Bids to $1 Higher Than Previous Higher Bid')
print(f'New Valid Round Total: {total_rounds}')
print('--------------------------------------------------------------')
for slot, position in enumerate(positions, start=1):
    print(f'Number of Winners From Position {slot}: {observed2.loc[position]}')

for slot, position in enumerate(positions, start=1):
    print(f'Percent of Winners From Position {slot}: {observed2.loc[position] / total_rounds:.3f}')

Changing all Fourth Bids to $1 Higher Than Previous Higher Bid
New Valid Round Total: 901
--------------------------------------------------------------
Number of Winners From Position 1: 85
Number of Winners From Position 2: 95
Number of Winners From Position 3: 116
Number of Winners From Position 4: 605
Percent of Winners From Position 1: 0.094
Percent of Winners From Position 2: 0.105
Percent of Winners From Position 3: 0.129
Percent of Winners From Position 4: 0.671
