# Homework 12

https://scikit-learn.org/0.15/modules/scaling_strategies.html#incremental-learning

* Implement a mini batch functionality to train a regressor.
    - (Optional) Anyone who wants to do this in a pipeline can use: https://koaning.github.io/tokenwiser/api/pipeline.html

* Save the model, load it again, and test it on `X_test`. __Do NOT commit the pickle file.__

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
import pickle

In [2]:
def test_df():
    """Return a fixed hold-out split of the car-price data.

    Reads the full CSV, keeps a reproducible 5000-row sample
    (``random_state=100``) with a fresh 0..4999 index, and separates the
    'sellingprice' column from the remaining columns.

    Returns:
        (X, y): the sampled frame without 'sellingprice', and the
        'sellingprice' Series for the same rows.
    """
    held_out = (
        pd.read_csv('https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/car_prices/car_prices.csv', low_memory=False)
        .sample(5000, random_state=100)
        .reset_index(drop=True)
    )

    target = held_out['sellingprice']
    features = held_out.drop(columns='sellingprice')

    return features, target

def partial_df():
    """Yield an endless stream of random 100-row samples of the car-price data.

    Because this is a generator, the CSV is only read when the first sample
    is requested. Each sample is drawn independently and without a fixed
    seed, so the same row can show up in more than one sample. Every yielded
    frame has a fresh 0..99 index.
    """
    full_frame = pd.read_csv('https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/car_prices/car_prices.csv', low_memory=False)

    while True:
        chunk = full_frame.sample(100)
        yield chunk.reset_index(drop=True)
        
# Module-level generator of training samples; advance it with next(gen).
gen = partial_df()

In [3]:
# Hold-out split: X_test holds the feature columns, y_test the selling prices.
X_test, y_test = test_df()

In [4]:
# Each call to next(gen) returns a new random sample of the dataframe
# (not a sequential slice), so rows can repeat between calls.
next(gen)

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,mmr,sellingprice,saledate
0,2004,Chevrolet,Silverado 1500,LS,Crew Cab,,2gcek13t341359803,tx,2.6,204698.0,white,tan,westlake financial services,6225,6500,Tue Jan 13 2015 10:30:00 GMT-0800 (PST)
1,2012,Chevrolet,Malibu,LTZ,Sedan,automatic,1g1zg5e76cf118019,nc,3.4,61736.0,silver,black,bmw alphera/alphera financial services,11600,10200,Tue Jan 27 2015 01:15:00 GMT-0800 (PST)
2,2014,Lexus,IS 250,Base,Sedan,automatic,jthbf1d29e5027120,fl,4.4,7961.0,gray,—,zimmerman auto brokers inc,29700,31400,Tue Feb 17 2015 01:30:00 GMT-0800 (PST)
3,2013,Toyota,Corolla,LE,Sedan,automatic,2t1bu4ee4dc059105,tn,2.8,42239.0,—,gray,toyota financial services/avis budget group-open,11500,11300,Wed Feb 11 2015 02:30:00 GMT-0800 (PST)
4,2008,Pontiac,Grand Prix,Base,Sedan,automatic,2g2wp552981149735,tx,1.9,87406.0,white,black,flexco fleet services,5550,3500,Tue Feb 03 2015 02:30:00 GMT-0800 (PST)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2013,Hyundai,Elantra,Limited,sedan,,5npdh4ae1dh300523,pa,4.9,8150.0,gray,gray,hyundai motor finance,14600,15400,Fri Jun 05 2015 02:00:00 GMT-0700 (PDT)
96,2009,Ford,F-250 Super Duty,Lariat,crew cab,automatic,1ftsw21559ea39793,az,2.4,56912.0,gray,black,precision toyota of tucson,26300,22000,Wed May 27 2015 06:00:00 GMT-0700 (PDT)
97,2013,Ford,F-150,XLT,SuperCrew,automatic,1ftfw1ef5dfb62849,oh,4.9,43787.0,black,gray,exceptional motorcar llc,23700,29200,Tue Mar 03 2015 01:30:00 GMT-0800 (PST)
98,2008,Dodge,Ram Pickup 1500,ST,Quad Cab,,1d7hu18n08s516362,ga,2.7,94380.0,silver,gray,santander consumer,13550,13200,Tue Feb 24 2015 01:30:00 GMT-0800 (PST)


In [5]:
# Load the full car-price table and discard the columns the notebook treats
# as unnecessary ('vin', 'saledate', 'seller').
df = pd.read_csv(
    'https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/car_prices/car_prices.csv',
    low_memory=False,
).drop(columns=['vin', 'saledate', 'seller'])

In [6]:
# Drop every row that has a missing value in any of the remaining columns.
df = df.dropna()

In [7]:
import pandas as pd

def preprocess_data(df):
    """Turn a raw car-price frame into an all-numeric frame.

    Steps, in order:
      1. Drop rows that contain any missing value.
      2. Parse 'mmr' as a number; rows whose 'mmr' cannot be parsed are
         dropped and the column is stored as float.
      3. Replace each of the categorical text columns ('make', 'model',
         'trim', 'body', 'transmission', 'state', 'color', 'interior') with
         integer codes.
      4. One-hot encode any text columns that are still left.

    The caller's frame is never modified.

    NOTE: integer codes are assigned by order of first appearance within the
    frame passed in, so the same category can receive different codes in
    different calls (e.g. a training frame vs. a test frame).

    Args:
        df: DataFrame containing at least 'mmr' and the eight categorical
            columns listed above.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.

    Raises:
        KeyError: if 'mmr' or one of the categorical columns is missing.
    """
    categorical_columns = [
        'make', 'model', 'trim', 'body',
        'transmission', 'state', 'color', 'interior',
    ]

    # Work on an explicit copy so the column assignments below neither touch
    # the caller's frame nor trigger chained-assignment warnings.
    df = df.dropna().copy()

    # pd.to_numeric accepts ints, floats and numeric strings alike; a string
    # test such as str.isnumeric() would reject "6225.0" and silently discard
    # every row of a float-typed column.
    mmr = pd.to_numeric(df['mmr'], errors='coerce')
    parsed = mmr.notna()
    df = df.loc[parsed].copy()
    df['mmr'] = mmr[parsed].to_numpy(dtype=float)

    # Integer-encode the known categorical columns.
    for column in categorical_columns:
        df[column] = pd.factorize(df[column])[0]

    # One-hot encode whatever text columns remain; 'drop_first' avoids the
    # dummy variable trap.
    df = pd.get_dummies(df, drop_first=True)

    # Reset the index so positional batching lines up with the labels.
    return df.reset_index(drop=True)


In [8]:
def prepare_batches(df, batch_size, scaler_X=None, scaler_y=None):
    """Yield standardized (X, y) mini-batches from a numeric DataFrame.

    The frame is walked in consecutive row blocks of ``batch_size`` (the last
    block may be shorter). For every block the scalers are first updated with
    ``partial_fit`` and the block is then transformed with the statistics
    accumulated so far, so earlier batches are scaled with less information
    than later ones.

    Args:
        df: numeric DataFrame that includes a 'sellingprice' column; that
            column is the target, all other columns are features.
        batch_size: number of rows per batch; must be at least 1.
        scaler_X: optional StandardScaler for the features. Pass your own
            instance to keep the fitted scaler after iteration (e.g. to
            scale test data the same way). A new one is created if omitted.
        scaler_y: optional StandardScaler for the target, same idea; needed
            afterwards to map predictions back to prices.

    Yields:
        (X_batch_scaled, y_batch_scaled): a 2-D feature array and a 1-D
        target array for one batch.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1. Because this is a
            generator, the error surfaces when the first batch is requested.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be a positive integer, got {batch_size!r}')

    if scaler_X is None:
        scaler_X = StandardScaler()
    if scaler_y is None:
        scaler_y = StandardScaler()

    for start in range(0, len(df), batch_size):
        batch_df = df.iloc[start:start + batch_size]
        X_batch = batch_df.drop(['sellingprice'], axis=1).values
        # StandardScaler works on 2-D input, so the target becomes a column.
        y_batch = batch_df['sellingprice'].values.reshape(-1, 1)

        # Update the running statistics, then scale with them.
        scaler_X.partial_fit(X_batch)
        scaler_y.partial_fit(y_batch)

        yield scaler_X.transform(X_batch), scaler_y.transform(y_batch).ravel()


In [9]:
def train_regressor(X_train, y_train, batch_size, n_epochs, scaler_X=None, scaler_y=None):
    """Train an SGDRegressor on standardized data using mini-batches.

    Features and target are standardized once, then the regressor is updated
    with ``partial_fit`` on consecutive row blocks of ``batch_size`` for
    ``n_epochs`` passes over the data. Batches are sliced directly from the
    scaled arrays here; ``prepare_batches`` is not used because it expects a
    DataFrame with a 'sellingprice' column and applies its own scaling.

    Args:
        X_train: numeric feature matrix (DataFrame or 2-D array).
        y_train: target values (Series, list or 1-D array), one per row of
            ``X_train``.
        batch_size: rows per mini-batch; must be at least 1.
        n_epochs: number of passes over the training data.
        scaler_X: optional already-fitted scaler for the features; a
            StandardScaler is fitted on ``X_train`` if omitted.
        scaler_y: optional already-fitted scaler for the target; a
            StandardScaler is fitted on ``y_train`` if omitted.

    Returns:
        (regressor, scaler_X, scaler_y): the trained model and the scalers
        needed to scale new inputs and un-scale predictions.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1 or ``X_train`` and
            ``y_train`` have different lengths.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be a positive integer, got {batch_size!r}')

    # StandardScaler works on 2-D input, so the target becomes a column.
    y_column = np.asarray(y_train, dtype=float).reshape(-1, 1)
    if len(X_train) != len(y_column):
        raise ValueError(
            f'X_train has {len(X_train)} rows but y_train has {len(y_column)} values'
        )

    if scaler_X is None:
        scaler_X = StandardScaler()
        scaler_X.fit(X_train)
    X_train_scaled = scaler_X.transform(X_train)

    if scaler_y is None:
        scaler_y = StandardScaler()
        scaler_y.fit(y_column)
    y_train_scaled = scaler_y.transform(y_column).ravel()

    regressor = SGDRegressor(alpha=0.1, random_state=0)

    n_samples = len(y_train_scaled)
    for epoch in range(n_epochs):
        print(f'Epoch {epoch + 1}')
        for start in range(0, n_samples, batch_size):
            stop = start + batch_size
            regressor.partial_fit(X_train_scaled[start:stop], y_train_scaled[start:stop])

    return regressor, scaler_X, scaler_y


In [10]:
def test_regressor(X_test, y_test, regressor, scaler_X, scaler_y):
    """Evaluate a trained regressor and return its mean squared error.

    The features are run through ``preprocess_data`` here, so ``X_test``
    should be passed un-encoded. The target is attached to the frame before
    preprocessing so that any row ``preprocess_data`` removes is removed from
    features and target together.

    Args:
        X_test: DataFrame of raw feature columns.
        y_test: target values, one per row of ``X_test``.
        regressor: fitted model exposing ``predict``.
        scaler_X: fitted feature scaler used during training.
        scaler_y: fitted target scaler used during training.

    Returns:
        Mean squared error in the original (un-scaled) target units.

    Raises:
        ValueError: if ``X_test`` and ``y_test`` differ in length, or no rows
            are left after preprocessing.
    """
    # Preprocess features and target as one frame to keep rows aligned.
    combined = X_test.assign(sellingprice=np.asarray(y_test))
    processed = preprocess_data(combined)
    if processed.empty:
        raise ValueError('no test rows left after preprocessing')

    y_true = processed.pop('sellingprice').to_numpy(dtype=float)
    X_test_scaled = scaler_X.transform(processed)

    # Predict in scaled space, then map back to prices. inverse_transform
    # needs a 2-D array, so the 1-D predictions are reshaped to a column.
    y_pred_scaled = np.asarray(regressor.predict(X_test_scaled))
    y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()

    # Calculate the mean squared error
    mse = ((y_true - y_pred) ** 2).mean()

    return mse

In [None]:
batch_size = 100
n_epochs = 10
n_train_chunks = 10  # number of 100-row samples pulled from the generator

# Identifier / free-text columns: one-hot encoding them would create a
# different, very wide set of columns for every frame, so train and test
# would no longer share a feature layout.
unused_columns = ['vin', 'saledate', 'seller']

# --- Test data -------------------------------------------------------------
X_test, y_test = test_df()
X_test = X_test.drop(columns=unused_columns)
# Remove incomplete rows from features and target together so they stay
# aligned. X_test is left un-encoded because test_regressor runs
# preprocess_data on it itself.
complete_rows = X_test.notna().all(axis=1) & y_test.notna()
X_test = X_test[complete_rows].reset_index(drop=True)
y_test = y_test[complete_rows].reset_index(drop=True)

# --- Training data ---------------------------------------------------------
# NOTE: training samples are drawn from the same CSV as the test sample, so
# some rows may appear in both.
gen = partial_df()
train_df = pd.concat([next(gen) for _ in range(n_train_chunks)], ignore_index=True)
train_df = preprocess_data(train_df.drop(columns=unused_columns))
# The target must come from the training rows themselves and must not stay
# among the features.
y_train = train_df.pop('sellingprice')
X_train = train_df

regressor, scaler_X, scaler_y = train_regressor(X_train=X_train, y_train=y_train, batch_size=batch_size, n_epochs=n_epochs)
mse = test_regressor(X_test, y_test, regressor, scaler_X, scaler_y)
print(f'Test MSE: {mse:.2f}')