### Library imports

In [15]:
import pandas as pd 
import numpy as np 
import logging
from functions.data_prep import DataPreparation
from functions.prediction import PredictProductivityModel

### Step 1: Data Cleaning & Transformations

In [16]:
# Load the raw garment-worker productivity dataset from the project's data folder.
df = pd.read_csv(filepath_or_buffer='data/garments_worker_productivity.csv')

In [17]:
# Inspect dtypes and non-null counts of the raw frame. In the output below,
# `wip` is the only column with missing values (691 of 1197 non-null), and
# `date` / `day` are still plain `object` columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1197 entries, 0 to 1196
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   date                   1197 non-null   object 
 1   quarter                1197 non-null   object 
 2   department             1197 non-null   object 
 3   day                    1197 non-null   object 
 4   team                   1197 non-null   int64  
 5   targeted_productivity  1197 non-null   float64
 6   smv                    1197 non-null   float64
 7   wip                    691 non-null    float64
 8   over_time              1197 non-null   int64  
 9   incentive              1197 non-null   int64  
 10  idle_time              1197 non-null   float64
 11  idle_men               1197 non-null   int64  
 12  no_of_style_change     1197 non-null   int64  
 13  no_of_workers          1197 non-null   float64
 14  actual_productivity    1197 non-null   float64
dtypes: f

In [18]:
# Wrap the raw frame in the project's DataPreparation helper; the following
# cells call its missing-data check, imputation and transformation methods.
df_prep = DataPreparation(df=df)


In [19]:
# Ask the helper which columns contain missing values and display them.
# The output below shows a single entry, 'wip'.
# NOTE(review): the return type is not visible here; it exposes `.values` and
# `.empty`, so presumably a pandas Index/Series of column names -- confirm.
missing_cols = df_prep.check_missing_data()
missing_cols.values

array(['wip'], dtype=object)

In [20]:
# `.empty` is False here because at least one column ('wip') still has missing values.
# NOTE(review): this re-runs the check; `missing_cols` from the previous cell
# presumably already holds the same result -- confirm check_missing_data() has
# no side effects before reusing it.
df_prep.check_missing_data().empty

False

In [21]:
# Impute the columns found above using the 'zero' option.
# NOTE(review): `df_imputed` is not referenced again in this notebook, yet the
# frame returned later by `transform_data()` has no nulls in `wip`, so the
# imputation presumably also updates state held inside `df_prep` -- confirm.
df_imputed = df_prep.impute_missing_data(missing_cols=missing_cols, impute_option='zero')

In [22]:
# Apply the helper's transformations. The call takes no arguments, so it works
# on whatever frame `df_prep` holds internally. Per the `.info()` output below,
# the result has 17 columns (two more than the raw data), `date` becomes
# datetime64[ns] and `day` becomes int32.
df_transformed = df_prep.transform_data()

In [23]:
# Verify the prepared frame: all 1197 rows kept, `wip` fully populated,
# and the converted dtypes for `date` and `day`.
df_transformed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1197 entries, 0 to 1196
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   date                   1197 non-null   datetime64[ns]
 1   quarter                1197 non-null   object        
 2   department             1197 non-null   object        
 3   day                    1197 non-null   int32         
 4   team                   1197 non-null   int64         
 5   targeted_productivity  1197 non-null   float64       
 6   smv                    1197 non-null   float64       
 7   wip                    1197 non-null   float64       
 8   over_time              1197 non-null   int64         
 9   incentive              1197 non-null   int64         
 10  idle_time              1197 non-null   float64       
 11  idle_men               1197 non-null   int64         
 12  no_of_style_change     1197 non-null   int64         
 13  no_

### Step 2: Predicting Productivity with XGBoost

Initialize an instance of the prediction model (`PredictProductivityModel`).

In [24]:
# Create the model with its default constructor arguments; the data, features
# and split date are supplied later through `fit()`.
model = PredictProductivityModel()

Create lag features (two lags each) for a few columns and inspect the resulting frame.

In [25]:
# Preview the lagged frame: two lags for each of the three listed columns.
# In the output below the frame grows from 17 to 23 columns and starts at
# index 2 (1195 rows instead of 1197).
(
    model
    .create_lag_features(
        data=df_transformed,
        features=['actual_productivity', 'targeted_productivity', 'over_time'],
        lags=2,
    )
    .info()
)

<class 'pandas.core.frame.DataFrame'>
Index: 1195 entries, 2 to 1196
Data columns (total 23 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   date                         1195 non-null   datetime64[ns]
 1   quarter                      1195 non-null   object        
 2   department                   1195 non-null   object        
 3   day                          1195 non-null   int32         
 4   team                         1195 non-null   int64         
 5   targeted_productivity        1195 non-null   float64       
 6   smv                          1195 non-null   float64       
 7   wip                          1195 non-null   float64       
 8   over_time                    1195 non-null   int64         
 9   incentive                    1195 non-null   int64         
 10  idle_time                    1195 non-null   float64       
 11  idle_men                     1195 non-null   int

In [26]:
# Train the model on the prepared frame, using the same lagged columns and lag
# count as the preview cell above, with `actual_productivity` as the target.
model.fit(
    data=df_transformed,
    features=['actual_productivity', 'targeted_productivity', 'over_time'],
    lags=2,
    target='actual_productivity',
    # NOTE(review): day/month order is ambiguous in this format. It is presumably
    # read month-first (4 March 2015), since rows dated 2015-03-11 carry
    # predictions in the output of `predict()` -- confirm how the model parses
    # it; an ISO date would remove the ambiguity.
    split_date='3/04/2015',
    date_column='date',
)

Call the `predict` method; it returns the DataFrame with a `predicted` column that is filled in for the test period.

In [27]:
# Generate predictions. The call takes no arguments, so it presumably relies on
# the split stored by `fit()` -- confirm. In the output below, `predicted` is
# empty for the early rows and populated for the latest dates.
model.predict()

Unnamed: 0,date,quarter,department,day,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,actual_productivity,month,num_week,predicted
0,2015-01-01,Quarter1,sweing,1,8,0.80,26.16,1108.0,7080,98,0.0,0,0,59.0,0.940725,1,1,
1,2015-01-01,Quarter1,finishing,1,1,0.75,3.94,0.0,960,0,0.0,0,0,8.0,0.886500,1,1,
2,2015-01-01,Quarter1,sweing,1,11,0.80,11.41,968.0,3660,50,0.0,0,0,30.5,0.800570,1,1,
3,2015-01-01,Quarter1,sweing,1,12,0.80,11.41,968.0,3660,50,0.0,0,0,30.5,0.800570,1,1,
4,2015-01-01,Quarter1,sweing,1,6,0.80,25.90,1170.0,1920,50,0.0,0,0,56.0,0.800382,1,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1192,2015-03-11,Quarter2,finishing,11,10,0.75,2.90,0.0,960,0,0.0,0,0,8.0,0.628333,3,11,0.579836
1193,2015-03-11,Quarter2,finishing,11,8,0.70,3.90,0.0,960,0,0.0,0,0,8.0,0.625625,3,11,0.617247
1194,2015-03-11,Quarter2,finishing,11,7,0.65,3.90,0.0,960,0,0.0,0,0,8.0,0.625625,3,11,0.658261
1195,2015-03-11,Quarter2,finishing,11,9,0.75,2.90,0.0,1800,0,0.0,0,0,15.0,0.505889,3,11,0.723004
