In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
import matplotlib.pyplot as plt
import seaborn as sns

# Load the CSV files
bids_df = pd.read_csv('bids.csv')
sparse_attributes_df = pd.read_csv('sparse_attributes.csv')
items_df = pd.read_csv('items.csv')

# Display basic information about the loaded datasets
print("Bids dataset shape:", bids_df.shape)
print("Sparse attributes dataset shape:", sparse_attributes_df.shape)
print("Items dataset shape:", items_df.shape)

# Convert sparse attributes to a sparse matrix
# Assuming the columns are [row, col, value] in that order
row_indices = sparse_attributes_df.iloc[:, 0].values
col_indices = sparse_attributes_df.iloc[:, 1].values
values = sparse_attributes_df.iloc[:, 2].values

# Determine matrix dimensions
max_row = int(row_indices.max())
max_col = int(col_indices.max())

# Create sparse matrix using scipy's CSR format
sparse_matrix = sparse.csr_matrix(
    (values, (row_indices, col_indices)), 
    shape=(max_row + 1, max_col + 1)
)

print(f"\nSparse matrix shape: {sparse_matrix.shape}")
print(f"Number of non-zero elements: {sparse_matrix.nnz}")
print(f"Sparsity: {sparse_matrix.nnz / (sparse_matrix.shape[0] * sparse_matrix.shape[1]) * 100:.4f}%")

# Calculate log-bids
bids_df['log_bid_value'] = np.log(bids_df['bid_value'])

# 1. Report relevant summary statistics for auctions and log-bids
print("\n=== Auction Summary Statistics ===")

# Overall bid statistics
print("\nOverall Bid Statistics:")
print(f"Total number of bids: {len(bids_df)}")
print(f"Number of unique items with bids: {bids_df['item_num'].nunique()}")
print(f"Average bids per item: {bids_df.groupby('item_num').size().mean():.2f}")

# Bid value statistics
bid_stats = bids_df['bid_value'].describe()
log_bid_stats = bids_df['log_bid_value'].describe()

print("\nBid Value Statistics:")
for stat, value in bid_stats.items():
    print(f"{stat}: {value:.2f}")

print("\nLog-Bid Value Statistics:")
for stat, value in log_bid_stats.items():
    print(f"{stat}: {value:.2f}")

# Statistics per auction (grouped by item_num)
auction_stats = bids_df.groupby('item_num').agg({
    'bid_value': ['count', 'min', 'max', 'mean', 'std'],
    'log_bid_value': ['mean', 'std']
})

print("\nStatistics per Auction (first 5 items):")
print(auction_stats.head())

# Combine bids data with items data
merged_data = pd.merge(bids_df, items_df, on='item_num', how='left')

# Analyze relationship between number of participants and bid values
participant_bid_stats = merged_data.groupby('pred_n_participant').agg({
    'bid_value': ['count', 'mean', 'std', 'max'],
    'log_bid_value': ['mean', 'std']
})

print("\nBid Statistics by Number of Participants:")
print(participant_bid_stats)

# Calculate correlation between number of participants and max bid
max_bids = merged_data.groupby(['item_num', 'pred_n_participant'])['bid_value'].max().reset_index()
correlation = max_bids['pred_n_participant'].corr(max_bids['bid_value'])
print(f"\nCorrelation between number of participants and maximum bid: {correlation:.4f}")

# Print summary of the sparse attributes
print("\n=== Sparse Attributes Summary ===")
print(f"Number of unique words (features): {sparse_matrix.shape[1]}")
print(f"Number of items with attributes: {len(np.unique(row_indices))}")

Bids dataset shape: (41559, 2)
Sparse attributes dataset shape: (33857, 3)
Items dataset shape: (6983, 2)

Sparse matrix shape: (6984, 501)
Number of non-zero elements: 33857
Sparsity: 0.9676%

=== Auction Summary Statistics ===

Overall Bid Statistics:
Total number of bids: 41559
Number of unique items with bids: 6983
Average bids per item: 5.95

Bid Value Statistics:
count: 41559.00
mean: 56.72
std: 259.24
min: 0.01
25%: 11.00
50%: 20.32
75%: 50.01
max: 19000.00

Log-Bid Value Statistics:
count: 41559.00
mean: 3.19
std: 1.15
min: -4.61
25%: 2.40
50%: 3.01
75%: 3.91
max: 9.85

Statistics per Auction (first 5 items):
         bid_value                                         log_bid_value  \
             count     min     max        mean         std          mean   
item_num                                                                   
1               10   20.00  511.99  256.300000  208.849457      4.992998   
2                7  151.08  264.85  223.831429   41.982699      5.39363

In [2]:
auction_stats

Unnamed: 0_level_0,bid_value,bid_value,bid_value,bid_value,bid_value,log_bid_value,log_bid_value
Unnamed: 0_level_1,count,min,max,mean,std,mean,std
item_num,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
1,10,20.00,511.99,256.300000,208.849457,4.992998,1.313813
2,7,151.08,264.85,223.831429,41.982699,5.393636,0.207559
3,10,22.00,252.50,173.195000,70.293477,4.999617,0.726809
4,8,26.00,150.50,112.422500,39.872094,4.619290,0.573467
5,7,75.00,160.00,126.717143,29.730064,4.814676,0.262329
...,...,...,...,...,...,...,...
6996,10,1.00,88.00,55.355000,27.301897,3.649937,1.335520
6997,7,3.57,15.00,9.368571,4.150484,2.133104,0.522932
6998,6,4.00,15.00,8.428333,4.244622,2.026465,0.502981
6999,4,5.00,10.50,7.500000,2.345208,1.978014,0.314545
