In [1]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

seed = 1967


In [2]:
Subdirs: list[str] = [
    "Digital_Music", "Magazine_Subscriptions", "Musical_Instruments", "Patio_Lawn_and_Garden", "Software"
]



In [3]:
import csv

X_train: list[float, float, float, float] = []
y_train: list[float] = []
X_test: list[float, float, float, float] = []
y_test: list[float] = []

rows: list[str] = []

for subdir in Subdirs:
    file_path = f"data/{subdir}/1967/results.csv"
    with open(file_path, 'r') as file:
        # Read the CSV file
        reader = csv.reader(file)
        header = next(reader)  # Read the header row
        # Create a new list to store the filtered rows
        filtered_rows = []
        for row in reader:
            rows.append(row)  # Append the row to the rows list
    file.close()
    type_index = header.index('type')
    # Create training and test lists.
    # Training and test data are distringuished by the 'type' column, 'tr' for training and 'te' for test.
    # For training data, put O-score,L-scoreP,L-scoreM,L-scoreN into X_train, Y into y_train
    # For test data, put O-score,L-scoreP,L-scoreM,L-scoreN into X_test, Y into y_test
    for row in rows:
        if row[type_index] == 'tr':
            X_train.append([float(row[header.index('O-score')]), float(row[header.index('L-scoreP')]), float(row[header.index('L-scoreM')]), float(row[header.index('L-scoreN')])])
            y_train.append(float(row[header.index('Y')]))
        elif row[type_index] == 'te':
            X_test.append([float(row[header.index('O-score')]), float(row[header.index('L-scoreP')]), float(row[header.index('L-scoreM')]), float(row[header.index('L-scoreN')])])
            y_test.append(float(row[header.index('Y')]))


In [4]:

# Train a Random Forest Regressor
model = RandomForestRegressor(random_state=seed)
model.fit(X=X_train, y=y_train)

# Make predictions on the test set
Y_pred = model.predict(X= X_test)

# Evaluate the model
mse = mean_squared_error(y_true= y_test, y_pred=Y_pred)
r2 = r2_score(y_test, Y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R^2 Score: {r2:.2f}")

Mean Squared Error: 0.63
R^2 Score: 0.72


In [10]:
modifiedRows: list[str] = []

for row in rows:
    # for 'te' rows, replace the Y-pred value with that obtained from the model
    if row[type_index] == 'te':
        Xsample = [float(row[header.index('O-score')]), float(row[header.index('L-scoreP')]), float(row[header.index('L-scoreM')]), float(row[header.index('L-scoreN')])]
        y_pred = model.predict([Xsample])
        # Replace the Y value with the predicted value
        row[header.index('Y-pred')] = f"{y_pred[0]:.2f}"
    # Append the modified row to the rows list
    modifiedRows.append(row)

# Write the contents of all files into a new CSV file
with open('data/overall_results.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header)
    writer.writerows(modifiedRows)
file.close()


In [6]:
import pickle

# Save the model to a file
with open('random_forest_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

print("Model saved successfully.")

Model saved successfully.
