# Validation of Statistical Method for TPIR Contestant Fairness

In [6]:
import numpy as np
import pandas as pd
from scipy.stats import chi2, chisquare

In [7]:
# Load the workbook and tidy it into one row per round, indexed by date.
# The sheet is read without a header and transposed; the first row of the
# transposed frame is then used as the column labels.
df = pd.read_excel("./price_is_right.xlsx", header=None).T
df.columns = df.iloc[0]
df = df.iloc[1:].reset_index(drop=True)  # discard the row that held the labels
df = df.assign(date=pd.to_datetime(df['date'])).set_index('date')
# The first remaining column is not used by the analysis below.
df = df.drop(columns=df.columns[0])
# NOTE(review): drop_duplicates compares column values only, not the date
# index, so rounds with identical bids and retail price on different dates
# collapse into one row -- confirm this is intended.
df = df.drop_duplicates()
df

Unnamed: 0_level_0,first,second,third,fourth,retail
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1972-09-04,350,900,150,1000,595
1981-02-12,625,650,900,651,1095
1991-11-14,1130,1500,1150,1200,1700
1986-09-30,1002,850,785,1100,1449
1992-12-01,950,569,1002,1003,1951
...,...,...,...,...,...
1992-03-17,625,850,950,725,1083
1994-04-24,528,1050,750,1300,1800
1996-05-02,450,550,1500,551,1288
1995-09-14,700,600,500,550,1005


In [8]:
# Cast every column to integers, then print the frame summary
# (row count, column dtypes, non-null counts).
df = df.astype({column: int for column in df.columns})
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 978 entries, 1972-09-04 to 1996-05-03
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   first   978 non-null    int32
 1   second  978 non-null    int32
 2   third   978 non-null    int32
 3   fourth  978 non-null    int32
 4   retail  978 non-null    int32
dtypes: int32(5)
memory usage: 26.7 KB


In [9]:
# Distance of every bid from the retail price, computed row by row as
#   retail - bid
# A negative value therefore means the bid went over the retail price.
bids = df[['first', 'second', 'third', 'fourth']]
diffs = bids.rsub(df['retail'], axis='index')

In [10]:
# Preview the first five rows of the differences
diffs.head(5)

Unnamed: 0_level_0,first,second,third,fourth
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1972-09-04,245,-305,445,-405
1981-02-12,470,445,195,444
1991-11-14,570,200,550,500
1986-09-30,447,599,664,349
1992-12-01,1001,1382,949,948


In [32]:
# Keep only bids that did not go over (difference >= 0); overbids become missing values
valid_diffs = diffs.where(diffs >= 0, other=pd.NA)
valid_diffs.head(5)

Unnamed: 0_level_0,first,second,third,fourth
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1972-09-04,245.0,,445.0,
1981-02-12,470.0,445.0,195.0,444.0
1991-11-14,570.0,200.0,550.0,500.0
1986-09-30,447.0,599.0,664.0,349.0
1992-12-01,1001.0,1382.0,949.0,948.0


In [22]:
# Discard rounds in which every contestant went over (rows that are entirely missing)
valid_diffs = valid_diffs.dropna(how='all')
valid_diffs.head()

Unnamed: 0_level_0,first,second,third,fourth
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1972-09-04,245.0,,445.0,
1981-02-12,470.0,445.0,195.0,444.0
1991-11-14,570.0,200.0,550.0,500.0
1986-09-30,447.0,599.0,664.0,349.0
1992-12-01,1001.0,1382.0,949.0,948.0


> Total Number of Valid Rounds (rounds with a winner)

In [23]:
# Row count of valid_diffs = rounds with at least one bid at or under retail
valid_diffs.shape[0]

953

> Number of rounds where every contestant overbid (rounds with no winner)

In [24]:
# Rounds present in df but removed from valid_diffs, i.e. rounds where all four bids went over
df.shape[0] - valid_diffs.shape[0]

25

> Number of Winners on Highest & Lowest Bid

In [33]:
# Per valid round, count how many of the four bids did not go over
# (non-missing entries). Missing values are detected with notna() rather
# than by inspecting the string form of each value, so the count does not
# depend on how a missing value happens to be printed.
bids_not_over = valid_diffs[['first', 'second', 'third', 'fourth']].notna().sum(axis=1)

# Nobody overbid: the closest bid is necessarily the highest bid of the round.
ct = int((bids_not_over == 4).sum())
# Exactly one bid did not go over (the other three are missing): that lone
# valid bid wins and is lower than every overbid, i.e. the lowest bid.
tot = int((bids_not_over == 1).sum())

print('Number of Highest Bid Winners: {}'.format(ct))
print('Number of Lowest Bid Winners: {}'.format(tot))

Number of Highest Bid Winners: 609
Number of Lowest Bid Winners: 82


> Total Number of Overbids and Total Bids (including invalid rounds)

In [53]:
# Count overbids and total bids over every round in `diffs` (rounds where
# all four bids went over are still included here).
# An overbid is a negative retail - bid difference; compare numerically
# instead of looking for a leading '-' in the string form of each value.
all_bid_diffs = diffs[['first', 'second', 'third', 'fourth']]
neg = int((all_bid_diffs < 0).sum().sum())
tot_tot = int(all_bid_diffs.size)  # rows x 4 bids

print('Number of Overbids: {}'.format(neg))
print('Number of Total Bids: {}'.format(tot_tot))

Number of Overbids: 706
Number of Total Bids: 3912


In [25]:
# The winner of a round is the position whose bid is closest to retail without
# going over: the column holding the smallest remaining (non-missing) difference.
winners = valid_diffs.idxmin(axis='columns')
winners.head(n=10)

date
1972-09-04     first
1981-02-12     third
1991-11-14    second
1986-09-30    fourth
1992-12-01    fourth
1990-04-23    fourth
1986-12-05     first
1991-09-10     third
1985-10-15    fourth
1976-03-11     first
dtype: object

> Number of Winners by Bidding Position

In [26]:
# Tally wins per bidding position, listed in bidding order
wins_by_position = winners.value_counts()
winners_count = wins_by_position.reindex(index=['first', 'second', 'third', 'fourth'])
winners_count

first     176
second    187
third     220
fourth    370
dtype: int64

> Pearson chi-squared goodness-of-fit test using Scipy.
    
    -Null hypothesis that each player has probability of 0.25 of winning
    -3 Degrees of Freedom
    -1% Significance Level
    -Compared to 11.345 critical value

In [32]:
# Pearson chi-squared goodness-of-fit test.
# H0: every bidding position is equally likely to win a round.

# Observed wins per position in a fixed order. fill_value=0 keeps a position
# that never won as a count of 0 instead of NaN, which would otherwise turn
# the test statistic into NaN.
positions = ['first', 'second', 'third', 'fourth']
observed = winners.value_counts().reindex(positions, fill_value=0)

# Expected frequencies under H0 (wins spread evenly over the positions)
total = observed.sum()
expected = [total / len(positions)] * len(positions)

# Run chi-squared test
chi2_stat, p_value = chisquare(f_obs=observed, f_exp=expected)

print("Observed:", observed.values)
print("Expected:", expected)
print(f"Chi-squared statistic: {chi2_stat:.3f}")
print(f"p-value: {p_value:.3e}")

# Significance level (e.g., 0.01 for 99% confidence)
alpha = 0.01
# Degrees of freedom: number of categories minus one
dof = len(observed) - 1
# Critical value (right-tail)
critical_value = chi2.ppf(1 - alpha, dof)
print(f"Critical value at α = {alpha} and dof = {dof}: {critical_value:.3f}")

Observed: [176 187 220 370]
Expected: [238.25, 238.25, 238.25, 238.25]
Chi-squared statistic: 101.544
p-value: 7.237e-22
Critical value at α = 0.01 and dof = 3: 11.345


> Since our calculated value for the Pearson chi-squared goodness-of-fit test of 101.544 exceeds the critical value of 11.345, we can reject the null hypothesis.

> Replacing the final (fourth) bid with $1 more than the highest of the previous three bids

In [26]:
# Hypothetical fourth bid: one more than the highest of the first three bids,
# computed for all rounds at once instead of looping over the rows.
# to_numpy() makes the assignment positional, like assigning a plain list.
# NOTE: this overwrites the original fourth bids stored in df, so cells that
# read df['fourth'] must not be re-run after this one without reloading df.
new_fourth = (df[['first', 'second', 'third']].max(axis=1) + 1).to_numpy()
df['fourth'] = new_fourth

In [27]:
# Recompute the differences from retail using df as it stands when this cell runs:
# retail - (first/second/third/fourth)
diffs_2 = df[['first', 'second', 'third', 'fourth']].rsub(df['retail'], axis=0)
# Overbids (negative differences) become missing values; rounds in which
# every bid went over are dropped.
valid_diffs2 = diffs_2.mask(diffs_2 < 0, other=pd.NA).dropna(how='all')
valid_diffs2.head(5)

Unnamed: 0_level_0,first,second,third,fourth
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1972-09-04,245.0,,445.0,
1981-02-12,470.0,445.0,195.0,194.0
1991-11-14,570.0,200.0,550.0,199.0
1986-09-30,447.0,599.0,664.0,446.0
1992-12-01,1001.0,1382.0,949.0,948.0


In [36]:
# Winner of each remaining round: the position with the smallest non-missing
# difference from retail.
winners2 = valid_diffs2.idxmin(axis=1)

# Wins per bidding position in bidding order; a position that never wins is
# reported as 0 instead of NaN so the total below stays a number.
observed2 = winners2.value_counts().reindex(['first', 'second', 'third', 'fourth'], fill_value=0)

print('Changing all Fourth Bids to $1 Higher Than Previous Higher Bid')
print('New Valid Round Total: {}'.format(observed2.sum()))
print('--------------------------------------------------------------')
for number, position in enumerate(observed2.index, start=1):
    print('Number of Winners From Position {}: {}'.format(number, observed2[position]))

Changing all Fourth Bids to $1 Higher Than Previous Higher Bid
New Valid Round Total: 924
--------------------------------------------------------------
Number of Winners From Position 1: 88
Number of Winners From Position 2: 96
Number of Winners From Position 3: 121
Number of Winners From Position 4: 619
