# Homework 12

https://scikit-learn.org/0.15/modules/scaling_strategies.html#incremental-learning

* Implement mini-batch functionality to train a regressor incrementally.
    - (Optional) If you would like to do this in a pipeline, see: https://koaning.github.io/tokenwiser/api/pipeline.html

* Save the model, load it again, and test it on `X_test`. __Do NOT commit the pickle file.__

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
def test_df():
    """Load the car-prices CSV and return a fixed 5000-row evaluation sample.

    Returns:
        tuple: ``(X, y)`` where ``y`` is the ``sellingprice`` column of the
        sampled rows and ``X`` is the same rows with that column removed.
    """
    url = 'https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/car_prices/car_prices.csv'

    # random_state pins the draw, so an unchanged CSV always yields the same rows.
    sampled = (
        pd.read_csv(url, low_memory=False)
        .sample(5000, random_state=100)
        .reset_index(drop=True)
    )

    # pop() removes the target column from the frame and hands it back.
    y = sampled.pop('sellingprice')
    X = sampled

    return X, y

def partial_df(batch_size=100, random_state=None):
    """Yield an endless stream of random mini-batches from the car-prices CSV.

    The CSV is read once, when the generator is first advanced. Every
    ``next()`` then draws a fresh random sample of ``batch_size`` rows from the
    full table, so the same row can appear in more than one batch.

    Args:
        batch_size: Number of rows in each yielded batch (default 100).
        random_state: ``None`` (default) keeps the original unseeded sampling.
            An integer seeds a single random state shared by all draws, giving
            a reproducible sequence of batches. Any other object is handed to
            ``DataFrame.sample`` unchanged.

    Yields:
        pandas.DataFrame: ``batch_size`` sampled rows with a reset index; the
        ``sellingprice`` column is still present.
    """
    df = pd.read_csv('https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/car_prices/car_prices.csv', low_memory=False)

    # An integer seed must become ONE shared RandomState: passing the raw
    # integer to every sample() call would return the identical rows each time.
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    while True:
        yield df.sample(batch_size, random_state=rng).reset_index(drop=True)

gen = partial_df()

In [3]:
# Fixed evaluation set: features (X_test) and `sellingprice` target (y_test).
X_test, y_test = test_df()

In [11]:
# Each call returns a new batch from the generator: a fresh random sample of
# rows from the full DataFrame (not a sequential slice).
next(gen)

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,mmr,sellingprice,saledate
0,2014,Ford,Focus,SE,sedan,automatic,1fadp3f27el275870,tx,4.2,11319.0,red,black,bank of the west,12000,11100,Thu Jun 11 2015 04:00:00 GMT-0700 (PDT)
1,2013,Ford,Mustang,V6,Coupe,,1zvbp8am1d5267562,pa,3.4,62089.0,white,black,the hertz corporation,13900,13500,Fri Feb 06 2015 01:00:00 GMT-0800 (PST)
2,2005,Dodge,Dakota,ST,Club Cab,automatic,1d7he22k05s329460,az,3.1,110851.0,white,gray,aps rent a car & leasing,4650,5000,Thu Feb 12 2015 03:00:00 GMT-0800 (PST)
3,2007,Pontiac,G6,Base,Sedan,automatic,1g2zg58n574222816,nc,,129290.0,white,gray,jackson's auto mart inc,3800,5000,Mon Dec 22 2014 10:00:00 GMT-0800 (PST)
4,2008,Chevrolet,Colorado,LT,Crew Cab,automatic,1gccs139088161050,wa,2.6,48568.0,white,black,wells fargo dealer services,12900,11800,Wed Jan 14 2015 05:30:00 GMT-0800 (PST)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2005,Chrysler,Sebring,GTC,Convertible,automatic,1c3el75r55n602727,ca,2.8,141586.0,white,gray,north county hyundai,1250,1500,Wed Feb 04 2015 04:30:00 GMT-0800 (PST)
96,2012,Ford,Explorer,XLT,SUV,,1fmhk8d85cga06931,nj,3.2,40877.0,black,black,"ford motor credit company,llc",24100,22800,Wed Jan 14 2015 01:00:00 GMT-0800 (PST)
97,2012,Ford,Edge,SEL,SUV,automatic,2fmdk3jc8cba27418,ca,3.8,36386.0,black,black,"ford motor credit company,llc pd",19650,22300,Thu Mar 05 2015 04:30:00 GMT-0800 (PST)
98,2014,Nissan,Altima,2.5 S,Sedan,automatic,1n4al3ap0ec283228,il,3,33386.0,black,black,nissan north america inc.,13300,13300,Thu May 28 2015 03:00:00 GMT-0700 (PDT)


generator

In [10]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import SGDRegressor
import pickle

# Preprocessing + regression pipeline.
# NOTE: "year_of_sale" is not a raw CSV column; preprocess_data() derives it
# from `saledate`, so frames must pass through it before reaching the pipeline.
numerical_columns = ['year', 'odometer', 'mmr', "year_of_sale"]
categorical_columns = [
    'make', 'model', 'trim', 'body', 'transmission',
    'state', 'condition', 'color', 'interior',
]

# Numeric features: fill gaps with the column mean, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical features: label gaps as 'missing', then one-hot encode.
# handle_unknown='ignore' keeps transform() from failing on categories that
# were not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its transformer; columns in neither list are
# not passed to either transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numerical_columns),
    ('cat', categorical_transformer, categorical_columns),
])

# Step names 'preprocessor' and 'regressor' are looked up by train_model().
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', SGDRegressor()),
])

In [14]:
from sklearn.metrics import mean_squared_error
import pandas as pd

def preprocess_data(df):
    """Return a copy of ``df`` with cleaned/derived numeric feature columns.

    * ``mmr`` is coerced to numeric; values that cannot be parsed become NaN.
    * ``year_of_sale`` is the 4th whitespace-separated token of ``saledate``
      (``'Thu Jun 11 2015 04:00:00 ...'`` -> ``2015.0``). Missing dates, dates
      with fewer than four tokens, and non-numeric tokens become NaN.

    The input frame is not modified, so the function can be called repeatedly
    on the same frame without side effects.

    Args:
        df: DataFrame containing at least the ``mmr`` and ``saledate`` columns.

    Returns:
        pandas.DataFrame: a new frame with ``mmr`` replaced and
        ``year_of_sale`` added (appended as the last column when new).
    """
    sale_year = df['saledate'].str.split().str[3]
    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(
        mmr=pd.to_numeric(df['mmr'], errors='coerce'),
        # Coerce like `mmr` so one malformed date cannot abort a whole batch;
        # astype(float) keeps the dtype stable even when every token parses.
        year_of_sale=pd.to_numeric(sale_year, errors='coerce').astype(float),
    )

def train_model(generator, pipeline, num_batches, log_every=1000):
    """Train ``pipeline`` incrementally on mini-batches taken from ``generator``.

    The first batch fits the whole pipeline (preprocessor and regressor).
    Every later batch is transformed with that already-fitted preprocessor and
    passed to the regressor's ``partial_fit``.

    NOTE: the preprocessor is never refitted, so its imputation values,
    scaling statistics and one-hot categories all come from the first batch
    only.

    Args:
        generator: Iterator yielding DataFrames that contain ``sellingprice``
            plus the columns ``preprocess_data`` and the pipeline require.
        pipeline: Pipeline with steps named ``'preprocessor'`` and
            ``'regressor'``; the regressor must implement ``partial_fit``.
        num_batches: Maximum number of batches to consume. If the iterator
            runs out earlier, training stops with the batches seen so far.
        log_every: Print the MSE on the incoming batch (measured before the
            model trains on it) at batches 1, 1 + log_every, 1 + 2*log_every,
            and so on. A falsy value disables logging. Default 1000.

    Returns:
        The same ``pipeline`` object, fitted in place.
    """
    from itertools import islice

    # islice caps the number of batches and ends cleanly on a finite iterator.
    for i, batch in enumerate(islice(generator, max(num_batches, 0))):
        y_batch = batch['sellingprice']
        X_batch = preprocess_data(batch.drop('sellingprice', axis=1))

        if i == 0:
            # Initial full fit: learns preprocessing statistics and
            # initializes the regressor.
            pipeline.fit(X_batch, y_batch)
            continue

        if log_every and (i - 1) % log_every == 0:
            # Score before updating, so this batch is still unseen data.
            predictions = pipeline.predict(X_batch)
            mse = mean_squared_error(y_batch, predictions)
            print(f"iter: {i}, mse: {mse}")

        # Steps are looked up after fit() because Pipeline may replace its
        # step objects while fitting.
        preprocessed_X_batch = pipeline.named_steps['preprocessor'].transform(X_batch)
        pipeline.named_steps['regressor'].partial_fit(preprocessed_X_batch, y_batch)

    return pipeline

# Consume 10,000 mini-batches from `gen`; the pipeline is fitted in place and
# returned, with the MSE of an incoming batch printed every 1000 iterations.
num_batches = 10000
trained_pipeline = train_model(gen, pipeline, num_batches)




iter: 1, mse: 5639403.776389187
iter: 1001, mse: 1695231.434333995
iter: 2001, mse: 3583291.1889450783
iter: 3001, mse: 1295823.943105204
iter: 4001, mse: 4733721.121986251
iter: 5001, mse: 1654886.6862616586
iter: 6001, mse: 1808989.7747339518
iter: 7001, mse: 1846746.5412570396
iter: 8001, mse: 2178279.746256203
iter: 9001, mse: 1771731.1749253783


In [16]:
from sklearn.metrics import mean_squared_error

# Round-trip the fitted pipeline through pickle to confirm the saved artifact
# can be restored and used. model.pkl is a local artifact: do not commit it.
with open('model.pkl', 'wb') as file:
    pickle.dump(trained_pipeline, file)

# pickle.load can execute arbitrary code; acceptable here only because the
# file was written by this notebook a moment ago.
with open('model.pkl', 'rb') as file:
    loaded_pipeline = pickle.load(file)

# Work on a copy so preprocess_data() cannot add columns to X_test itself.
X_test_processed = preprocess_data(X_test.copy())
predictions = loaded_pipeline.predict(X_test_processed)

# NOTE: X_test is sampled from the same CSV that the training batches are
# drawn from, so some test rows may also have been seen during training.
mse = mean_squared_error(y_test, predictions)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 2350264.0990094007
