In [1]:
import os
import warnings
import sys
import pathlib
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import plotly
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
import random
from random import choices
from string import ascii_lowercase, digits
import datetime
from pathlib import Path
from functools import partial
from itertools import starmap
from dotenv import load_dotenv
import requests

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn import preprocessing

In [2]:
this_dir = pathlib.Path()
parent_dir = this_dir.resolve().parent
data_dir = this_dir / "matmul-data"

# create results directory, if not already existing
cwd = os.getcwd()
results_dir = os.path.join(cwd, r'matmul_results')
if not os.path.exists(results_dir):
    os.makedirs(results_dir)

# Implement a simple recommender algorithm as linear regression

# Data Prep

Matmul key:

0 = (1, 8)

1 = (2, 8)

2 = (3, 8)

3 = (4, 8)

4 = (8, 8)

In [3]:
hw_dict = {0 : (1,8), 1 : (2,8), 2 : (3,8), 3 : (4,8), 4 : (8,8)}

In [4]:
# Prepare data
def data_prep(filename):
# Read and log the input data 
    data_filepath = data_dir / filename
    data = pd.read_csv(data_filepath)    
    
    ## DATA PREPROCESSING STEPS ##
    # remove noisy features 
    df = data[['size', 'sparsity', 'min', 'max', 'hardware']]
    
    # Replace infty values with NaN
    df.replace(['inf', np.inf, -np.inf], np.nan, inplace=True)

    # Impute NaN values with mean
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
    sample_data = df.sample(frac=0.25).dropna()
    imp_mean.fit(df)
    df = pd.DataFrame(imp_mean.fit_transform(df), columns = df.columns)
    
    # Encode categorical features
#     for col_name in feature_cols.columns.tolist():
#         df[col_name] = df[col_name].astype('category').cat.codes
    
    return df

## Create data sets

In [5]:
train1 = data_prep("train.csv")
test1 = data_prep("test.csv")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.replace(['inf', np.inf, -np.inf], np.nan, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.replace(['inf', np.inf, -np.inf], np.nan, inplace=True)


In [6]:
df_5k = data_prep("size_5k+.csv")
train5k, test5k = train_test_split(df_5k, random_state=0, shuffle=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.replace(['inf', np.inf, -np.inf], np.nan, inplace=True)


In [7]:
df_3k = data_prep("size_3k+.csv") 
train3k, test3k = train_test_split(df_3k, random_state=0, shuffle=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.replace(['inf', np.inf, -np.inf], np.nan, inplace=True)


# Training

In [8]:
## DEFINE MODEL TRAINING FUNCTIONS##

def train_recommender(train, test, seed=None):
    
    target_column = "hardware"
    
    # Split train dataset
    train_X = train.drop(columns=[target_column])  # Features
    train_y = train[target_column]                # Target
    
    # Split test dataset
    test_X = test.drop(columns=[target_column])  # Features
    test_y = test[target_column]                # Target
    
    
    # Execute recommender system
    model = LinearRegression()
    model.fit(train_X, train_y)
    
    predictions = model.predict(test_X)
    
    return train_X, test_X, test_y, predictions

# Eval and plotting functions

In [9]:
# MODEL EVALUATION FUNCTIONS #

def eval_metrics(actual, pred):
        mse = mean_squared_error(actual,pred)
        mae = mean_absolute_error(actual, pred)
        return {"mse": mse, "mae": mae}

# matmul train/test results

In [10]:
train_X, test_X, test_y, predictions = train_recommender(train1, test1)

In [11]:
metrics = eval_metrics(test_y, predictions)
print(metrics)

{'mse': 2.000278375442955, 'mae': 1.2009972425038318}


# matmul 5k+ results

In [12]:
train_X, test_X, test_y, predictions = train_recommender(train5k, test5k)

In [13]:
metrics = eval_metrics(test_y, predictions)
print(metrics)

{'mse': 6.144628099173556, 'mae': 2.454545454545455}


# matmul 3k+ results

In [14]:
train_X, test_X, test_y, predictions = train_recommender(train3k, test3k)

In [15]:
metrics = eval_metrics(test_y, predictions)
print(metrics)

{'mse': 6.177123932261412, 'mae': 2.461840679572031}


# matmul 5k/3k

In [16]:
train_X, test_X, test_y, predictions = train_recommender(df_5k, df_3k)
metrics = eval_metrics(test_y, predictions)
print(metrics)

{'mse': 2.0, 'mae': 1.2}


# matmul 5k+3k

In [17]:
df_combined = pd.concat([df_5k, df_3k])

In [18]:
# Combine and split 5k+/3k+ data sets
train2, test2 = train_test_split(df_combined, random_state=0, shuffle=True)

In [19]:
train_X, test_X, test_y, predictions = train_recommender(train2, test2)
metrics = eval_metrics(test_y, predictions)
print(metrics)

{'mse': 1.942692512815108, 'mae': 1.1911644246913389}
