<a href="https://colab.research.google.com/github/jakecho1108/Project1/blob/main/Project_1_Part_5%2B6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Modeling & preprocessing import
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer,make_column_transformer,make_column_selector
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer

In [2]:
# Load the raw sales data and clean it in one chain:
#   1. drop every column that contains at least one missing value,
#   2. collapse the inconsistent fat-content spellings onto 'Low Fat' / 'Regular',
#   3. remove the item identifier column.
df = (
    pd.read_csv('/content/sales_predictions.csv')
    .dropna(axis=1)
    .assign(
        Item_Fat_Content=lambda frame: frame['Item_Fat_Content'].replace(
            {'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'}
        )
    )
    .drop(columns='Item_Identifier')
)
df.head()

Unnamed: 0,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Tier 1,Supermarket Type1,3735.138
1,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Tier 3,Supermarket Type2,443.4228
2,Low Fat,0.01676,Meat,141.618,OUT049,1999,Tier 1,Supermarket Type1,2097.27
3,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,Tier 3,Grocery Store,732.38
4,Low Fat,0.0,Household,53.8614,OUT013,1987,Tier 3,Supermarket Type1,994.7052


In [3]:
# Summarise the cleaned frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Fat_Content           8523 non-null   object 
 1   Item_Visibility            8523 non-null   float64
 2   Item_Type                  8523 non-null   object 
 3   Item_MRP                   8523 non-null   float64
 4   Outlet_Identifier          8523 non-null   object 
 5   Outlet_Establishment_Year  8523 non-null   int64  
 6   Outlet_Location_Type       8523 non-null   object 
 7   Outlet_Type                8523 non-null   object 
 8   Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(3), int64(1), object(5)
memory usage: 599.4+ KB


In [4]:
# Encode fat content as an integer: Low Fat -> 0, Regular -> 1.
ItemFat = {'Low Fat': 0, 'Regular': 1}
# Cast explicitly instead of relying on replace() to turn the object column
# into integers: that implicit downcast is deprecated in recent pandas, and the
# column has to be numeric for a dtype-based numeric column selector to pick
# it up. The cast also raises if any label was left unmapped.
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(ItemFat).astype('int64')
df['Item_Fat_Content'].value_counts()

0    5517
1    3006
Name: Item_Fat_Content, dtype: int64

In [5]:
# Encode the outlet location tier as an ordered integer: Tier 1 -> 0, Tier 2 -> 1, Tier 3 -> 2.
LocationTypes = {'Tier 1': 0, 'Tier 2': 1, 'Tier 3': 2}
# Cast explicitly instead of relying on replace() to turn the object column
# into integers: that implicit downcast is deprecated in recent pandas, and the
# column has to be numeric for a dtype-based numeric column selector to pick
# it up. The cast also raises if any label was left unmapped.
df['Outlet_Location_Type'] = df['Outlet_Location_Type'].replace(LocationTypes).astype('int64')
df['Outlet_Location_Type'].value_counts()

2    3350
1    2785
0    2388
Name: Outlet_Location_Type, dtype: int64

In [6]:
# Name of the column the models will predict.
target = 'Item_Outlet_Sales'

# y holds the target values; X holds every remaining column as features.
# Both are independent copies of the data in df.
y = df[target].copy()
X = df.drop(columns=target).copy()
X.head()

Unnamed: 0,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Location_Type,Outlet_Type
0,0,0.016047,Dairy,249.8092,OUT049,1999,0,Supermarket Type1
1,1,0.019278,Soft Drinks,48.2692,OUT018,2009,2,Supermarket Type2
2,0,0.01676,Meat,141.618,OUT049,1999,0,Supermarket Type1
3,1,0.0,Fruits and Vegetables,182.095,OUT010,1998,2,Grocery Store
4,0,0.0,Household,53.8614,OUT013,1987,2,Supermarket Type1


In [7]:
# Hold out a test set; random_state=42 makes the split reproducible.
# No test_size is passed, so scikit-learn's default split proportion is used.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [8]:
# Selector for the object-dtype columns, which are treated as categorical features.
cat_selector = make_column_selector(dtype_include = 'object')
# Calling the selector on a frame lists the column names it matches.
cat_selector(X_train)

['Item_Type', 'Outlet_Identifier', 'Outlet_Type']

In [9]:
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode into a dense array.
impute_cat = SimpleImputer(strategy='most_frequent')
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`;
    # the original `sparse` keyword was deprecated there and later removed.
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    # Older scikit-learn releases only accept the original `sparse` keyword.
    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

cat_pipe = make_pipeline(impute_cat, encoder)
cat_pipe


In [10]:
# Sanity check: run the categorical pipeline on its own against the categorical training columns.
cat_pipe.fit_transform(X_train[cat_selector(X_train)])



array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

In [11]:
# Selector for the numeric columns; this includes Item_Fat_Content and
# Outlet_Location_Type, which were integer-encoded in earlier cells.
num_selector = make_column_selector(dtype_include='number')
# Calling the selector on a frame lists the column names it matches.
num_selector(X_train)

['Item_Fat_Content',
 'Item_Visibility',
 'Item_MRP',
 'Outlet_Establishment_Year',
 'Outlet_Location_Type']

In [12]:
# Numeric branch: standardise the numeric columns.
scaler = StandardScaler()
# Sanity check: fit and apply the scaler on the numeric training columns only.
scaler.fit_transform(X_train[num_selector(X_train)])

array([[-0.7403206 , -0.71277507,  1.82810922,  1.32784893,  1.08494779],
       [ 1.35076614, -1.29105225,  0.60336888,  1.32784893,  1.08494779],
       [ 1.35076614,  1.81331864,  0.24454056,  0.13618724, -1.38477667],
       ...,
       [-0.7403206 , -0.92052713,  1.52302674,  0.49368575, -0.14991444],
       [-0.7403206 , -0.2277552 , -0.38377708,  1.0895166 , -0.14991444],
       [-0.7403206 , -0.95867683, -0.73836105, -0.10214509, -1.38477667]])

In [13]:
# Combine both branches: categorical columns go through cat_pipe,
# numeric columns go through the scaler.
preprocessor = make_column_transformer((cat_pipe,cat_selector),
                                       (scaler,num_selector))
preprocessor

In [14]:
# Sanity check: fit the full preprocessor on the training features and inspect the transformed array.
preprocessor.fit_transform(X_train)



array([[ 0.        ,  0.        ,  0.        , ...,  1.82810922,
         1.32784893,  1.08494779],
       [ 0.        ,  0.        ,  0.        , ...,  0.60336888,
         1.32784893,  1.08494779],
       [ 0.        ,  0.        ,  0.        , ...,  0.24454056,
         0.13618724, -1.38477667],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  1.52302674,
         0.49368575, -0.14991444],
       [ 0.        ,  0.        ,  0.        , ..., -0.38377708,
         1.0895166 , -0.14991444],
       [ 0.        ,  0.        ,  0.        , ..., -0.73836105,
        -0.10214509, -1.38477667]])

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [16]:
# Chain preprocessing and a linear regression into one pipeline,
# then fit it on the training split only.
linreg_pipe = make_pipeline(preprocessor,LinearRegression())
linreg_pipe.fit(X_train, y_train)



In [17]:
# Linear regression predictions for the training and test splits.
y_hat_train = linreg_pipe.predict(X_train)
y_hat_test = linreg_pipe.predict(X_test)

In [18]:
def evaluate_model(y_true, y_pred, split='training'):
  """Print R^2, MAE, MSE and RMSE for one data split.

  Args:
    y_true: true target values (e.g. y_train or y_test)
    y_pred: predictions for the same rows, i.e. the result of model.predict(X)
    split: label for the data split being evaluated; it is inserted into the
      printed header (e.g. 'training', 'test')

  Returns:
    None. The metrics are only printed, each rounded to 3 decimal places.
  """
  r2 = r2_score(y_true, y_pred)
  mae = mean_absolute_error(y_true, y_pred)
  mse = mean_squared_error(y_true, y_pred)
  # Derive the RMSE from the MSE computed above rather than calling
  # mean_squared_error(..., squared=False): the `squared` keyword was
  # deprecated in scikit-learn 1.4 and removed in 1.6, where it raises a
  # TypeError. For a single target column the two give the same value.
  rmse = mse ** 0.5

  print(f'Results for {split} data:')
  print(f"  - R^2 = {round(r2,3)}")
  print(f"  - MAE = {round(mae,3)}")
  print(f"  - MSE = {round(mse,3)}")
  print(f"  - RMSE = {round(rmse,3)}")
  print()

In [19]:
# Report the linear regression metrics on both splits so train and test can be compared.
evaluate_model(y_train, y_hat_train,split='Linear Regression training')
evaluate_model(y_test, y_hat_test,split='Linear Regression testing')

Results for Linear Regression training data:
  - R^2 = 0.56
  - MAE = 847.154
  - MSE = 1301055.422
  - RMSE = 1140.638

Results for Linear Regression testing data:
  - R^2 = 0.566
  - MAE = 803.616
  - MSE = 1196985.954
  - RMSE = 1094.069



In [20]:
from sklearn.tree import DecisionTreeRegressor

In [21]:
# Chain preprocessing and a decision tree regressor into one pipeline and fit it
# on the training split; random_state=42 keeps the fitted tree reproducible.
# NOTE(review): `preprocessor` is the same object that was passed to linreg_pipe —
# confirm that sharing one preprocessor instance between both pipelines is intended.
dec_tree_pipe = make_pipeline(preprocessor,DecisionTreeRegressor(random_state = 42))
dec_tree_pipe.fit(X_train, y_train)

## Get predictions for training and test data
y_hat_train_dec = dec_tree_pipe.predict(X_train)
y_hat_test_dec = dec_tree_pipe.predict(X_test)



In [22]:
# Report the decision tree metrics on both splits so train and test can be compared.
evaluate_model(y_train, y_hat_train_dec,split='Decision Tree training')
evaluate_model(y_test, y_hat_test_dec,split='Decision Tree testing')

Results for Decision Tree training data:
  - R^2 = 1.0
  - MAE = 0.0
  - MSE = 0.0
  - RMSE = 0.0

Results for Decision Tree testing data:
  - R^2 = 0.216
  - MAE = 1028.277
  - MSE = 2162786.358
  - RMSE = 1470.641



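Linear Regression is the better model: on the test data it has a larger R^2 (0.566 vs. 0.216) and smaller MAE, MSE and RMSE than the Decision Tree. The Decision Tree fits the training data perfectly (R^2 = 1.0) but scores much worse on the test data, which indicates that it is overfitting.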

I understand this project is incomplete as of now. I'll resubmit by next week. 