In [3]:
# Basic libraries
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# To show plots inline
%matplotlib inline


In [4]:
# Load the dataset
df = pd.read_csv('garments_worker_productivity.csv')

# Display first 5 rows
df.head()


Unnamed: 0,date,quarter,department,day,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,actual_productivity
0,1/1/2015,Quarter1,sweing,Thursday,8,0.8,26.16,1108.0,7080,98,0.0,0,0,59.0,0.940725
1,1/1/2015,Quarter1,finishing,Thursday,1,0.75,3.94,,960,0,0.0,0,0,8.0,0.8865
2,1/1/2015,Quarter1,sweing,Thursday,11,0.8,11.41,968.0,3660,50,0.0,0,0,30.5,0.80057
3,1/1/2015,Quarter1,sweing,Thursday,12,0.8,11.41,968.0,3660,50,0.0,0,0,30.5,0.80057
4,1/1/2015,Quarter1,sweing,Thursday,6,0.8,25.9,1170.0,1920,50,0.0,0,0,56.0,0.800382


In [5]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1197 entries, 0 to 1196
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   date                   1197 non-null   object 
 1   quarter                1197 non-null   object 
 2   department             1197 non-null   object 
 3   day                    1197 non-null   object 
 4   team                   1197 non-null   int64  
 5   targeted_productivity  1197 non-null   float64
 6   smv                    1197 non-null   float64
 7   wip                    691 non-null    float64
 8   over_time              1197 non-null   int64  
 9   incentive              1197 non-null   int64  
 10  idle_time              1197 non-null   float64
 11  idle_men               1197 non-null   int64  
 12  no_of_style_change     1197 non-null   int64  
 13  no_of_workers          1197 non-null   float64
 14  actual_productivity    1197 non-null   float64
dtypes: f

In [6]:
# Gives statistical summary for numeric columns
df.describe()


Unnamed: 0,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,actual_productivity
count,1197.0,1197.0,1197.0,691.0,1197.0,1197.0,1197.0,1197.0,1197.0,1197.0,1197.0
mean,6.426901,0.729632,15.062172,1190.465991,4567.460317,38.210526,0.730159,0.369256,0.150376,34.609858,0.735091
std,3.463963,0.097891,10.943219,1837.455001,3348.823563,160.182643,12.709757,3.268987,0.427848,22.197687,0.174488
min,1.0,0.07,2.9,7.0,0.0,0.0,0.0,0.0,0.0,2.0,0.233705
25%,3.0,0.7,3.94,774.5,1440.0,0.0,0.0,0.0,0.0,9.0,0.650307
50%,6.0,0.75,15.26,1039.0,3960.0,0.0,0.0,0.0,0.0,34.0,0.773333
75%,9.0,0.8,24.26,1252.5,6960.0,50.0,0.0,0.0,0.0,57.0,0.850253
max,12.0,0.8,54.56,23122.0,25920.0,3600.0,300.0,45.0,2.0,89.0,1.120437


In [7]:
# Count missing values in each column
df.isnull().sum()


date                       0
quarter                    0
department                 0
day                        0
team                       0
targeted_productivity      0
smv                        0
wip                      506
over_time                  0
incentive                  0
idle_time                  0
idle_men                   0
no_of_style_change         0
no_of_workers              0
actual_productivity        0
dtype: int64

In [8]:
# Show how many rows belong to each department
df['department'].value_counts()


department
sweing        691
finishing     257
finishing     249
Name: count, dtype: int64

In [7]:
# Show all unique day values
df['day'].unique()

array(['Thursday', 'Saturday', 'Sunday', 'Monday', 'Tuesday', 'Wednesday'],
      dtype=object)

In [8]:
df.corr(numeric_only=True)['actual_productivity'].sort_values(ascending=False)


actual_productivity      1.000000
targeted_productivity    0.421594
wip                      0.131147
incentive                0.076538
over_time               -0.054206
no_of_workers           -0.057991
idle_time               -0.080851
smv                     -0.122089
team                    -0.148753
idle_men                -0.181734
no_of_style_change      -0.207366
Name: actual_productivity, dtype: float64

In [23]:
df['wip'] = df['wip'].fillna(df['wip'].mean())


In [22]:
df.isnull().sum()

team                     0
targeted_productivity    0
smv                      0
wip                      0
over_time                0
incentive                0
idle_time                0
idle_men                 0
no_of_style_change       0
no_of_workers            0
actual_productivity      0
quarter_Quarter2         0
quarter_Quarter3         0
quarter_Quarter4         0
quarter_Quarter5         0
department_finishing     0
department_sweing        0
day_Saturday             0
day_Sunday               0
day_Thursday             0
day_Tuesday              0
day_Wednesday            0
dtype: int64

In [21]:
df = pd.get_dummies(df, columns=['quarter', 'department', 'day'], drop_first=True)

KeyError: "None of [Index(['quarter', 'department', 'day'], dtype='object')] are in the [columns]"

In [20]:
df = df.drop('date', axis=1)


KeyError: "['date'] not found in axis"

In [19]:
# Drop columns that are not useful
df = df.drop(['date', 'wip'], axis=1)  # Drop 'wip' and 'date'

# Encode categorical columns using one-hot encoding
df = pd.get_dummies(df, columns=['quarter', 'department', 'day'], drop_first=True)

# Split into input features and target
X = df.drop('actual_productivity', axis=1)
y = df['actual_productivity']

# Split into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


KeyError: "['date'] not found in axis"

In [12]:
# Splitting the dataset into input features and target variable
X = df.drop('actual_productivity', axis=1)
y = df['actual_productivity']


In [13]:
from sklearn.model_selection import train_test_split

# Separate features (X) and target (y)
X = df.drop('actual_productivity', axis=1)
y = df['actual_productivity']

# Split the data into training (80%) and testing (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

model_lr = LinearRegression()
model_lr.fit(X_train, y_train)
pred_test_lr = model_lr.predict(X_test)

mae = mean_absolute_error(y_test, pred_test_lr)
mse = mean_squared_error(y_test, pred_test_lr)
r2 = r2_score(y_test, pred_test_lr)

print("MAE:", mae)
print("MSE:", mse)
print("R² Score:", r2)


ValueError: could not convert string to float: '3/11/2015'

In [24]:
df.columns


Index(['team', 'targeted_productivity', 'smv', 'wip', 'over_time', 'incentive',
       'idle_time', 'idle_men', 'no_of_style_change', 'no_of_workers',
       'actual_productivity', 'quarter_Quarter2', 'quarter_Quarter3',
       'quarter_Quarter4', 'quarter_Quarter5', 'department_finishing ',
       'department_sweing', 'day_Saturday', 'day_Sunday', 'day_Thursday',
       'day_Tuesday', 'day_Wednesday'],
      dtype='object')

In [25]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Split features (X) and target (y)
X = df.drop('actual_productivity', axis=1)
y = df['actual_productivity']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model_lr = LinearRegression()
model_lr.fit(X_train, y_train)

# Predict
pred_test_lr = model_lr.predict(X_test)

# Evaluation
mae = mean_absolute_error(y_test, pred_test_lr)
mse = mean_squared_error(y_test, pred_test_lr)
r2 = r2_score(y_test, pred_test_lr)

print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)


Mean Absolute Error: 0.10845261337976426
Mean Squared Error: 0.02208718451910322
R^2 Score: 0.16816825663050794


In [26]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Initialize the XGBoost regressor
model_xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)

# Train the model
model_xgb.fit(X_train, y_train)

# Predict on test data
pred_test_xgb = model_xgb.predict(X_test)

# Evaluate the model
mae_xgb = mean_absolute_error(y_test, pred_test_xgb)
mse_xgb = mean_squared_error(y_test, pred_test_xgb)
r2_xgb = r2_score(y_test, pred_test_xgb)

# Show the results
print("📌 XGBoost Results")
print("Mean Absolute Error:", mae_xgb)
print("Mean Squared Error:", mse_xgb)
print("R² Score:", r2_xgb)


📌 XGBoost Results
Mean Absolute Error: 0.07382100874468538
Mean Squared Error: 0.013982853009167361
R² Score: 0.47338779255296004


In [27]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Initialize the model
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model_rf.fit(X_train, y_train)

# Predict
pred_test_rf = model_rf.predict(X_test)

# Evaluate
mae_rf = mean_absolute_error(y_test, pred_test_rf)
mse_rf = mean_squared_error(y_test, pred_test_rf)
r2_rf = r2_score(y_test, pred_test_rf)

print("🌲 Random Forest Results")
print("Mean Absolute Error:", mae_rf)
print("Mean Squared Error:", mse_rf)
print("R² Score:", r2_rf)


🌲 Random Forest Results
Mean Absolute Error: 0.07279212722966663
Mean Squared Error: 0.013961219628801836
R² Score: 0.4742025334489247


In [28]:
import pickle

# Save the best-performing model
with open('gwp.pkl', 'wb') as f:
    pickle.dump(model_rf, f)


In [30]:
features = ['team', 'targeted_productivity', 'smv', 'wip', 'over_time',
            'incentive', 'idle_time', 'idle_men', 'no_of_style_change', 'no_of_workers']
X = df[features]
y = df['actual_productivity']

# Handle missing values (e.g., wip has NaNs)
X = X.fillna(0)


from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model_rf = RandomForestRegressor()
model_rf.fit(X_train, y_train)

# Save model to a file
import joblib
joblib.dump(model_rf, 'model_rf_10features.pkl')


['model_rf_10features.pkl']