In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the labelled training rows and the unlabelled test rows.
training_data = pd.read_csv('train.csv') #188533 rows, 13 columns - last column is price
test_data = pd.read_csv('test.csv') #125690 rows
# The test set has no target; 0 is only a placeholder so both frames share
# the same columns. It must never be used as a real price (see below).
test_data['price'] = 0
n_train = len(training_data)

# Train rows come first, so positions 0 .. n_train-1 are the labelled rows.
all_data = pd.concat([training_data, test_data], ignore_index = True)
#clean_title, accident, fuel_type have "NaN" types
all_data = all_data.drop(columns = ["id", "model", "ext_col", "int_col", "clean_title", "engine"])
columns_to_process = ['brand', 'fuel_type', 'transmission']
all_data['fuel_type'] = all_data['fuel_type'].fillna('unknown')

# Target (mean-price) encoding of the categorical columns.
# The means are computed from the TRAINING rows only: including the test rows
# would mix their placeholder price of 0 into every category mean and bias the
# encoding toward zero.
is_train = all_data.index < n_train
global_mean = all_data.loc[is_train, 'price'].mean()
for column in columns_to_process:
    mean_values = all_data.loc[is_train].groupby(column)['price'].mean()
    # Categories absent from the training rows (or missing keys) have no mean;
    # fall back to the overall training mean instead of leaving NaN behind.
    all_data[column] = all_data[column].map(mean_values).fillna(global_mean)

# Select features/target by name rather than by column position.
X_data = all_data.drop(columns = ['price'])
y_data = all_data['price']

# One-hot encode 'accident'; rows where it is missing get 0 in both columns.
X_data = pd.get_dummies(X_data, columns=['accident'], prefix='accident')
X_data = X_data.rename(columns={
    'accident_At least 1 accident or damage reported': 'accident1',
    'accident_None reported': 'accident0',
})
# Cast to float (not int): this turns the boolean dummies into 0/1 without
# truncating the fractional part of the mean-price encodings.
X_data = X_data.astype(float)

# Standardise using statistics of the training rows only, then apply the same
# transform to every row so train and test stay on a common scale.
scaler = StandardScaler()
scaler.fit(X_data.iloc[:n_train])
X_data = pd.DataFrame(scaler.transform(X_data), columns=X_data.columns)

In [3]:
# Split the stacked feature matrix back into its two parts. X_data / y_data
# hold the training rows first and the test rows last, so the row counts of
# the loaded frames give the split points (no hard-coded sizes).
n_train = len(training_data)
n_test = len(test_data)
X_train = X_data.head(n_train)
y_train = y_data.head(n_train)
X_test = X_data.tail(n_test)

In [154]:
from sklearn.linear_model import LinearRegression

# Row counts are taken from the loaded frames instead of being hard-coded, so
# the cell keeps working if the CSV files change size.
n_train = len(training_data)
n_test = len(test_data)

# Fit an ordinary least-squares baseline on the labelled (leading) rows.
model = LinearRegression()
X_train = X_data.head(n_train)
y_train = y_data.head(n_train)
model.fit(X_train, y_train)

# Predict prices for the unlabelled (trailing) rows.
X_test = X_data.tail(n_test)
y_pred = model.predict(X_test)

# Submission ids are the row positions that follow the training rows, i.e.
# n_train .. n_train + n_test - 1 (188533 .. 314222 for the row counts noted
# when the CSVs are loaded).
# NOTE(review): presumably identical to the `id` column of test.csv - confirm.
# Named `test_ids` so the builtin `id` is not shadowed.
test_ids = np.arange(n_train, n_train + n_test)
prediction = pd.DataFrame({'id' : test_ids, 'price' : y_pred})

In [None]:

def ReLU(x):
    """Rectified linear unit: element-wise maximum of ``x`` and zero.

    Accepts a scalar or an array; negative entries become 0, everything else
    is passed through unchanged.
    """
    floor = 0
    return np.maximum(x, floor)

def forwardpass(x, W1, b1, W2, b2):
    """Run one forward pass through the two-layer network.

    Parameters
    ----------
    x : array whose rows are input features (a column vector for one sample).
    W1, b1 : weights and bias of the hidden layer (``W1 @ x + b1``).
    W2, b2 : weights and bias of the output layer (``W2 @ A1 + b2``).

    Returns
    -------
    (Z1, A1, Z2, A2)
        Z1 : hidden pre-activation, A1 : ReLU(Z1),
        Z2 : output pre-activation (logits),
        A2 : softmax of Z2 taken over axis 0, so each column sums to 1.

    The callers in this notebook unpack four values; A2 is the output
    activation they expect.
    """
    Z1 = np.matmul(W1, x) + b1
    A1 = ReLU(Z1)
    Z2 = np.matmul(W2, A1) + b2
    # Softmax over the class axis. Subtracting the column maximum first leaves
    # the result unchanged but keeps np.exp from overflowing on large logits.
    shifted = Z2 - np.max(Z2, axis=0, keepdims=True)
    exp_scores = np.exp(shifted)
    A2 = exp_scores / np.sum(exp_scores, axis=0, keepdims=True)
    return Z1, A1, Z2, A2

def backprop(x, Z1, A1, W2, A2, y):
    """Compute the parameter gradients for one sample.

    Parameters
    ----------
    x  : input column vector used in the forward pass.
    Z1 : hidden pre-activation; A1 : hidden activation.
    W2 : output-layer weights.
    A2 : output activation. It is NOT modified by this function.
    y  : index (along axis 0 of A2) of the correct class.

    Returns
    -------
    (dLdb1, dLdW1, dLdb2, dLdW2)

    The output-layer error is ``A2`` with 1 subtracted at index ``y``, which
    is the gradient of a softmax + cross-entropy loss (assuming A2 holds
    softmax probabilities).
    """
    # Work on a copy: subtracting in place on A2 itself would silently
    # corrupt the caller's activations.
    dLdZ2 = np.array(A2, dtype=float, copy=True)
    dLdZ2[y] -= 1  # Subtract 1 from the correct class

    dLdW2 = np.matmul(dLdZ2, A1.T)
    dLdb2 = dLdZ2

    dLdA1 = np.matmul(W2.T, dLdZ2)
    dLdZ1 = dLdA1 * (Z1 > 0)  # Element-wise multiplication for ReLU derivative

    dLdW1 = np.matmul(dLdZ1, x.T)
    dLdb1 = dLdZ1

    return dLdb1, dLdW1, dLdb2, dLdW2

def gradient_descent(X, ys, W1, b1, W2, b2, num_iterations, alpha, rng=None):  # alpha = learning rate!
    """Train the network with single-sample stochastic gradient descent.

    Parameters
    ----------
    X : 2-D array indexed as ``X[:, idx]``, i.e. one sample per column.
    ys : labels, indexed as ``ys[idx]`` with the same sample index.
    W1, b1, W2, b2 : parameter arrays. They are updated IN PLACE (``-=``)
        and the same objects are returned.
    num_iterations : number of update steps; each step uses one random sample.
    alpha : learning rate.
    rng : optional ``numpy.random.Generator``. When given, samples are drawn
        from it so a run can be reproduced from a seed; when omitted the
        global ``np.random`` state is used, as before.

    Returns
    -------
    (b1, W1, b2, W2) - note the order differs from the parameter order.
    """
    # Both callables take (low, high) and draw from the half-open [low, high).
    draw_index = np.random.randint if rng is None else rng.integers
    for _ in range(num_iterations):
        idx = draw_index(0, X.shape[1])
        x = X[:, idx:idx+1]  # Keep x as a column vector
        y = ys[idx]
        Z1, A1, Z2, A2 = forwardpass(x, W1, b1, W2, b2)
        dLdb1, dLdW1, dLdb2, dLdW2 = backprop(x, Z1, A1, W2, A2, y)
        b1 -= dLdb1 * alpha
        W1 -= dLdW1 * alpha
        b2 -= dLdb2 * alpha
        W2 -= dLdW2 * alpha
    return b1, W1, b2, W2

def gety(x, W1, b1, W2, b2):
    """Predict the class index for ``x``.

    Runs a forward pass and returns the argmax of the output activation
    along axis 0 (one index per column of ``x``).
    """
    _z1, _a1, _z2, output_activation = forwardpass(x, W1, b1, W2, b2)
    predicted_class = np.argmax(output_activation, axis=0)
    return predicted_class

# NOTE(review): this cell has no execution count (In [None]), i.e. it has not
# been run as part of the saved notebook state. Several things below look
# inconsistent with the data prepared in the earlier cells - confirm before
# running.

# Initialize weights and biases
# All parameters are drawn from a standard normal distribution.
# W1 is (10, 784): forwardpass computes matmul(W1, x), so x needs 784 rows.
# W2 is (10, 10): ten hidden units feeding ten output rows.
# NOTE(review): 784 inputs / 10 outputs are hard-coded; verify they match the
# number of feature columns in X_train and the intended output of the model.
b1 = np.random.normal(0, 1, size=(10, 1))
W1 = np.random.normal(0, 1, size=(10, 784))
b2 = np.random.normal(0, 1, size=(10, 1))
W2 = np.random.normal(0, 1, size=(10, 10))

# Train the model
# gradient_descent reads X.shape[1] as the number of samples and slices
# X[:, idx:idx+1] to pick one sample.
# NOTE(review): X_train / y_train come from X_data.head(...) / y_data.head(...)
# in the earlier cells, so they are pandas objects with one sample per ROW and
# y_train holds prices; this call appears to expect a (features, samples)
# array and integer class labels - confirm.
# NOTE(review): no random seed is set, and 10**6 iterations run one sample each.
b1, W1, b2, W2 = gradient_descent(X_train, y_train, W1, b1, W2, b2, num_iterations=10**6, alpha=0.02)

# Test the model
# Counts how many samples have a predicted class (gety) equal to their label.
# NOTE(review): y_test is not assigned in any visible cell, and X_test is
# sliced column-wise like X above - confirm where labelled test data comes from.
numcorrect = 0
totalnum = X_test.shape[1]
for idx in range(totalnum):
    x = X_test[:, idx:idx+1]  # Keep x as a column vector
    y = y_test[idx]
    if gety(x, W1, b1, W2, b2) == y:
        numcorrect += 1

# Report the share of matching predictions as a percentage with two decimals.
print(f"Accuracy: {numcorrect / totalnum * 100:.2f}%")

In [156]:
# Build the submission frame from the current predictions in y_pred.
# Ids are the row positions that follow the training rows, i.e.
# n_train .. n_train + n_test - 1 (188533 .. 314222 for the row counts noted
# when the CSVs are loaded), derived from the frames instead of hard-coded.
# NOTE(review): presumably identical to the `id` column of test.csv - confirm.
# Named `test_ids` so the builtin `id` is not shadowed.
n_train = len(training_data)
n_test = len(test_data)
test_ids = np.arange(n_train, n_train + n_test)
prediction = pd.DataFrame({'id' : test_ids, 'price' : y_pred})

In [None]:
# Write the submission frame to a CSV file, leaving out the row index.
submission_path = '09052024carsprediction2.csv'
prediction.to_csv(submission_path, index=False)