In [1]:
import pandas as pd
import numpy as np
from lightgbm import early_stopping, log_evaluation

In [73]:
training_data = pd.read_csv('train.csv')  # 188533 rows, 13 columns - last column is price
test_data = pd.read_csv('test.csv')  # 125690 rows

training_data = training_data.fillna('unknown')
test_data = test_data.fillna('unknown')
def encode_columns(df):
    df['hp'] = df['engine'].str.extract(r'(\d+\.?\d*)HP').astype(float)
    quantiles = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
    df['hp_bin'] = pd.cut(df['hp'], bins=    bin_edges = df['hp'].quantile(quantiles).values
, labels=False, include_lowest=True) #bucket into 11 unique (was originally 348)
    df['hp_bin'].fillna(-1)

    df['cylinder'] = df['engine'].str.extract(r'(\d+\.?\d*) Cylinder').astype(float) #7 unique

    df = df.drop(columns=['engine'])
    
    # Extract 'got_V' from 'model' column and convert to binary indicator
    df['got_V'] = df['model'].str.extract(r'(\d+\.?\d*) V').notna().astype(int)
    
    return df


training_data = encode_columns(training_data)
test_data = encode_columns(test_data)

ValueError: Input array must be 1 dimensional

348

In [6]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

test_data['price'] = 0  # Add a dummy price column to test data
all_data = pd.concat([training_data, test_data], ignore_index=True)

categorical_features = ['brand', 'model', 'fuel_type', 'transmission', 'engine', 'ext_col', 'int_col', 'accident', 'clean_title']

# Fill missing categorical values
for nan_col in ['fuel_type', 'clean_title', 'brand', 'accident']:
    all_data[f"{nan_col} _nan"] = all_data[nan_col].isna()
all_data.fillna({'fuel_type': 'unknown', 'clean_title': 'unknown', 'brand': 'unknown', 'accident': 'unknown'}, inplace=True)
X_data = all_data.drop(columns=['price'])

for column in categorical_features:
    value_counts = X_data[column].value_counts()
    infrequent_values = value_counts[value_counts <= 169].index

    # Replace infrequent values with 'other'
    X_data[column] = X_data[column].replace(infrequent_values, 'other')

# Perform one-hot encoding on all updated categorical columns at once
X_data_one_hot = pd.get_dummies(X_data, columns=categorical_features)

# Replace the original DataFrame with the one-hot encoded version
X_data = X_data_one_hot


scaler = StandardScaler()
numerical = ['model_year', 'milage']
X_data[numerical] = pd.DataFrame(scaler.fit_transform(X_data[numerical]), columns=X_data[numerical].columns)

In [7]:
A = np.random.randn(X_data.shape[1], X_data.shape[1]) 
Q, R = np.linalg.qr(A)

In [11]:
X_data = X_data @ Q

In [5]:
X_data = X_data.drop(columns = 'id')
y_train = training_data['price']

X_data = X_data.to_numpy()
y_train = y_train.to_numpy()

In [13]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

fold_number = 3
num_models = 20

models = []
model_preds_train = np.zeros(shape=(num_models, 188533))
model_preds_test = np.zeros(shape=(num_models, 125690))

params = {
        'objective': 'regression',
        'metric': 'rmse',  # Root Mean Squared Error
        'num_leaves': 500,
        'max_depth': 20,
        'learning_rate': 0.01,
        'n_estimators': 20000,   #it literally always early stops before it reaches here so no worries about this i think LOL
        'subsample': 0.6,
        'colsample_bytree': 0.9,
        'reg_lambda': 0.4,
        'min_data_in_leaf': 50,
        'feature_fraction': 0.75,
        'verbose': -1
        }

for model_number in range(num_models):
    A = np.random.randn(X_data.shape[1], X_data.shape[1]) 
    Q, R = np.linalg.qr(A)
    X_data = X_data @ Q
    X_train = X_data[0:188533]
    fold_size = 188533 // fold_number
    indices = np.arange(188533)
    np.random.shuffle(indices)    
    train_predictions = np.zeros(188533)
    test_predictions = np.zeros(125690)

    print(f"Now training model number {model_number}")

    for fold_idx in range(fold_number):
        val_indices = indices[fold_idx* fold_size:(fold_idx+ 1) * fold_size]
        train_indices = np.concatenate([indices[:fold_idx* fold_size], indices[(fold_idx+ 1) * fold_size:]])

        X_train_fold, y_train_fold = X_train[train_indices], y_train[train_indices]
        X_val_fold, y_val_fold = X_train[val_indices], y_train[val_indices]

        train_data = lgb.Dataset(X_train_fold, label=y_train_fold)
        valid_data = lgb.Dataset(X_val_fold, label=y_val_fold, reference=train_data)

        model = lgb.train(
            params,
            train_data,
            num_boost_round=params['n_estimators'], 
            valid_sets=[valid_data],
            callbacks=[lgb.early_stopping(stopping_rounds=50)], 
        )

        train_predictions[val_indices] = model.predict(X_val_fold, num_iteration=model.best_iteration)
        test_predictions += model.predict(X_data[188533:314223], num_iterations=model.best_iteration) * (1/fold_number)
    train_predictions[0] = y_train[0]
    print(mean_squared_error(train_predictions, y_train) ** (1/2))
    if(mean_squared_error(train_predictions, y_train) ** (1/2) < 72500):
        model_preds_train[model_number] = train_predictions
        model_preds_test[model_number] = test_predictions

In [10]:
from sklearn.linear_model import LinearRegression
model_preds_train = model_preds_train.T
meta_model = LinearRegression()
meta_model.fit(model_preds_train, y_train)
model_preds_test = model_preds_test.T
y_pred = meta_model.predict(model_preds_test)

ValueError: Found input variables with inconsistent numbers of samples: [20, 188533]

0.0

In [8]:
for idx in range(len(y_pred)):
    if(y_pred[idx] < 8000):
        y_pred[idx] = 8000
id = np.array([idx for idx in range(188533, 314223)])
prediction = pd.DataFrame({'id' : id, 'price' : y_pred})

In [9]:
prediction.to_csv('09182024carsprediction1.csv', index = False)