In [3]:
import pandas as pd
import numpy as np
from scipy import sparse
import matplotlib.pyplot as plt
import seaborn as sns

# Load the CSV files
bids_df = pd.read_csv('bids.csv')
sparse_attributes_df = pd.read_csv('sparse_attributes.csv')
items_df = pd.read_csv('items.csv')

# Display basic information about the loaded datasets
print("Bids dataset shape:", bids_df.shape)
print("Sparse attributes dataset shape:", sparse_attributes_df.shape)
print("Items dataset shape:", items_df.shape)

# Convert sparse attributes to a sparse matrix
# Assuming the columns are [row, col, value] in that order
row_indices = sparse_attributes_df.iloc[:, 0].values
col_indices = sparse_attributes_df.iloc[:, 1].values
values = sparse_attributes_df.iloc[:, 2].values

# Determine matrix dimensions
max_row = int(row_indices.max())
max_col = int(col_indices.max())

# Create sparse matrix using scipy's CSR format
sparse_matrix = sparse.csr_matrix(
    (values, (row_indices, col_indices)), 
    shape=(max_row + 1, max_col + 1)
)

print(f"\nSparse matrix shape: {sparse_matrix.shape}")
print(f"Number of non-zero elements: {sparse_matrix.nnz}")
print(f"Sparsity: {sparse_matrix.nnz / (sparse_matrix.shape[0] * sparse_matrix.shape[1]) * 100:.4f}%")

# Calculate log-bids
bids_df['log_bid_value'] = np.log(bids_df['bid_value'])

# ----- ENHANCED AUCTION SUMMARY STATISTICS -----
print("\n=== ENHANCED AUCTION SUMMARY STATISTICS ===")

# Basic auction metrics
n_auctions = bids_df['item_num'].nunique()
total_bids = len(bids_df)
avg_bids_per_auction = bids_df.groupby('item_num').size().mean()
median_bids_per_auction = bids_df.groupby('item_num').size().median()

print(f"\nBasic Auction Metrics:")
print(f"Total number of auctions: {n_auctions}")
print(f"Total number of bids: {total_bids}")
print(f"Average bids per auction: {avg_bids_per_auction:.2f}")
print(f"Median bids per auction: {median_bids_per_auction:.2f}")

# Compute auction-level bid statistics
auction_bid_counts = bids_df.groupby('item_num').size()
print("\nAuction Participation Distribution:")
print(f"Auctions with only 1 bid: {sum(auction_bid_counts == 1)} ({sum(auction_bid_counts == 1)/n_auctions*100:.2f}%)")
print(f"Auctions with 2-5 bids: {sum((auction_bid_counts >= 2) & (auction_bid_counts <= 5))} ({sum((auction_bid_counts >= 2) & (auction_bid_counts <= 5))/n_auctions*100:.2f}%)")
print(f"Auctions with 6-10 bids: {sum((auction_bid_counts >= 6) & (auction_bid_counts <= 10))} ({sum((auction_bid_counts >= 6) & (auction_bid_counts <= 10))/n_auctions*100:.2f}%)")
print(f"Auctions with >10 bids: {sum(auction_bid_counts > 10)} ({sum(auction_bid_counts > 10)/n_auctions*100:.2f}%)")
print(f"Maximum bids in a single auction: {auction_bid_counts.max()}")

# Bid spread within auctions
auction_stats = bids_df.groupby('item_num').agg({
    'bid_value': ['count', 'min', 'max', 'mean', 'std', lambda x: x.max() - x.min()],
    'log_bid_value': ['min', 'max', 'mean', 'std', lambda x: x.max() - x.min()]
})

# Rename the columns for better readability
auction_stats.columns = ['_'.join(col).strip() for col in auction_stats.columns.values]
auction_stats = auction_stats.rename(columns={
    'bid_value_count': 'num_bids',
    'bid_value_min': 'min_bid',
    'bid_value_max': 'max_bid',
    'bid_value_mean': 'mean_bid',
    'bid_value_std': 'std_bid',
    'bid_value_<lambda_0>': 'bid_range',
    'log_bid_value_min': 'min_log_bid',
    'log_bid_value_max': 'max_log_bid',
    'log_bid_value_mean': 'mean_log_bid',
    'log_bid_value_std': 'std_log_bid',
    'log_bid_value_<lambda_0>': 'log_bid_range'
})

# Calculate price spread as percentage of mean bid
auction_stats['price_spread_pct'] = (auction_stats['bid_range'] / auction_stats['mean_bid']) * 100

# Summarize auction-level statistics
print("\nAuction-Level Bid Statistics:")
print(auction_stats.describe().round(2))

# Calculate ratio of max bid to min bid to measure competition intensity
auction_stats['max_min_bid_ratio'] = auction_stats['max_bid'] / auction_stats['min_bid']

print("\nCompetition Intensity Metrics:")
print(f"Mean ratio of max to min bid: {auction_stats['max_min_bid_ratio'].mean():.2f}")
print(f"Median ratio of max to min bid: {auction_stats['max_min_bid_ratio'].median():.2f}")
print(f"Mean price spread (% of mean bid): {auction_stats['price_spread_pct'].mean():.2f}%")
print(f"Median price spread (% of mean bid): {auction_stats['price_spread_pct'].median():.2f}%")

# Analyze auctions by number of participants
# Merge with items data to get predicted participant counts
merged_data = pd.merge(bids_df, items_df, on='item_num', how='left')

# Group by predicted number of participants
participant_stats = merged_data.groupby('pred_n_participant').agg({
    'item_num': 'nunique',
    'bid_value': ['count', 'mean', 'median', 'std', 'max'],
    'log_bid_value': ['mean', 'median', 'std']
}).round(2)

# Rename columns for clarity
participant_stats.columns = ['_'.join(col).strip() for col in participant_stats.columns.values]
participant_stats = participant_stats.rename(columns={
    'item_num_nunique': 'num_auctions',
    'bid_value_count': 'total_bids',
    'bid_value_mean': 'mean_bid',
    'bid_value_median': 'median_bid',
    'bid_value_std': 'std_bid',
    'bid_value_max': 'max_bid',
    'log_bid_value_mean': 'mean_log_bid',
    'log_bid_value_median': 'median_log_bid',
    'log_bid_value_std': 'std_log_bid'
})

# Calculate average bids per auction for each participant count
participant_stats['avg_bids_per_auction'] = participant_stats['total_bids'] / participant_stats['num_auctions']

print("\nAuction Statistics by Predicted Number of Participants:")
print(participant_stats)

# Calculate winning bid statistics (assuming max bid is the winning bid)
winning_bids = bids_df.groupby('item_num')['bid_value'].max().reset_index()
winning_bids['log_winning_bid'] = np.log(winning_bids['bid_value'])

print("\nWinning Bid Statistics:")
print(winning_bids['bid_value'].describe().round(2))
print("\nLog-Winning Bid Statistics:")
print(winning_bids['log_winning_bid'].describe().round(2))

# Create a LaTeX table with the summary statistics for auctions
latex_table = f"""
\\begin{{table}}[htbp]
    \\centering
    \\caption{{Enhanced Auction Summary Statistics}}
    \\begin{{tabular}}{{lr}}
        \\hline
        \\textbf{{Metric}} & \\textbf{{Value}} \\\\
        \\hline
        Total number of auctions & {n_auctions:,} \\\\
        Total number of bids & {total_bids:,} \\\\
        Average bids per auction & {avg_bids_per_auction:.2f} \\\\
        Median bids per auction & {median_bids_per_auction:.2f} \\\\
        Maximum bids in a single auction & {auction_bid_counts.max()} \\\\
        \\hline
        \\multicolumn{{2}}{{c}}{{\\textbf{{Auction Participation}}}} \\\\
        \\hline
        Auctions with only 1 bid & {sum(auction_bid_counts == 1):,} ({sum(auction_bid_counts == 1)/n_auctions*100:.1f}\\%) \\\\
        Auctions with 2-5 bids & {sum((auction_bid_counts >= 2) & (auction_bid_counts <= 5)):,} ({sum((auction_bid_counts >= 2) & (auction_bid_counts <= 5))/n_auctions*100:.1f}\\%) \\\\
        Auctions with 6-10 bids & {sum((auction_bid_counts >= 6) & (auction_bid_counts <= 10)):,} ({sum((auction_bid_counts >= 6) & (auction_bid_counts <= 10))/n_auctions*100:.1f}\\%) \\\\
        Auctions with >10 bids & {sum(auction_bid_counts > 10):,} ({sum(auction_bid_counts > 10)/n_auctions*100:.1f}\\%) \\\\
        \\hline
        \\multicolumn{{2}}{{c}}{{\\textbf{{Competition Intensity}}}} \\\\
        \\hline
        Mean max/min bid ratio & {auction_stats['max_min_bid_ratio'].mean():.2f} \\\\
        Median max/min bid ratio & {auction_stats['max_min_bid_ratio'].median():.2f} \\\\
        Mean price spread (\\% of mean bid) & {auction_stats['price_spread_pct'].mean():.2f}\\% \\\\
        Median price spread (\\% of mean bid) & {auction_stats['price_spread_pct'].median():.2f}\\% \\\\
        \\hline
    \\end{{tabular}}
    \\label{{tab:auction_summary}}
\\end{{table}}
"""

print("\nLaTeX Table for Auction Summary Statistics:")
print(latex_table)

# Original code continues below
print("\n--- Original Analysis Results ---")

# Original statistics per auction (grouped by item_num)
original_auction_stats = bids_df.groupby('item_num').agg({
    'bid_value': ['count', 'min', 'max', 'mean', 'std'],
    'log_bid_value': ['mean', 'std']
})

print("\nStatistics per Auction (first 5 items):")
print(original_auction_stats.head())

# Original analyze relationship between number of participants and bid values
original_participant_bid_stats = merged_data.groupby('pred_n_participant').agg({
    'bid_value': ['count', 'mean', 'std', 'max'],
    'log_bid_value': ['mean', 'std']
})

print("\nBid Statistics by Number of Participants:")
print(original_participant_bid_stats)

# Original calculate correlation between number of participants and max bid
max_bids = merged_data.groupby(['item_num', 'pred_n_participant'])['bid_value'].max().reset_index()
correlation = max_bids['pred_n_participant'].corr(max_bids['bid_value'])
print(f"\nCorrelation between number of participants and maximum bid: {correlation:.4f}")

# Original print summary of the sparse attributes
print("\n=== Sparse Attributes Summary ===")
print(f"Number of unique words (features): {sparse_matrix.shape[1]}")
print(f"Number of items with attributes: {len(np.unique(row_indices))}")

Bids dataset shape: (41559, 2)
Sparse attributes dataset shape: (33857, 3)
Items dataset shape: (6983, 2)

Sparse matrix shape: (6984, 501)
Number of non-zero elements: 33857
Sparsity: 0.9676%

=== ENHANCED AUCTION SUMMARY STATISTICS ===

Basic Auction Metrics:
Total number of auctions: 6983
Total number of bids: 41559
Average bids per auction: 5.95
Median bids per auction: 5.00

Auction Participation Distribution:
Auctions with only 1 bid: 0 (0.00%)
Auctions with 2-5 bids: 3655 (52.34%)
Auctions with 6-10 bids: 2822 (40.41%)
Auctions with >10 bids: 506 (7.25%)
Maximum bids in a single auction: 22

Auction-Level Bid Statistics:
       num_bids  min_bid   max_bid  mean_bid  std_bid  bid_range  min_log_bid  \
count   6983.00  6983.00   6983.00   6983.00  6983.00    6983.00      6983.00   
mean       5.95    20.58     71.52     45.93    19.53      50.94         2.28   
std        2.79   132.89    331.61    191.81   114.70     272.07         1.02   
min        3.00     0.01      1.50      

In [2]:
auction_stats

Unnamed: 0_level_0,bid_value,bid_value,bid_value,bid_value,bid_value,log_bid_value,log_bid_value
Unnamed: 0_level_1,count,min,max,mean,std,mean,std
item_num,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
1,10,20.00,511.99,256.300000,208.849457,4.992998,1.313813
2,7,151.08,264.85,223.831429,41.982699,5.393636,0.207559
3,10,22.00,252.50,173.195000,70.293477,4.999617,0.726809
4,8,26.00,150.50,112.422500,39.872094,4.619290,0.573467
5,7,75.00,160.00,126.717143,29.730064,4.814676,0.262329
...,...,...,...,...,...,...,...
6996,10,1.00,88.00,55.355000,27.301897,3.649937,1.335520
6997,7,3.57,15.00,9.368571,4.150484,2.133104,0.522932
6998,6,4.00,15.00,8.428333,4.244622,2.026465,0.502981
6999,4,5.00,10.50,7.500000,2.345208,1.978014,0.314545
