In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('/content/insurance.csv')

In [3]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
df.isnull().sum()

Unnamed: 0,0
age,0
sex,0
bmi,0
children,0
smoker,0
region,0
charges,0


In [5]:
from sklearn.preprocessing import LabelEncoder

In [6]:
lb = LabelEncoder()

In [7]:
df['sex']=lb.fit_transform(df['sex'])
df['smoker']=lb.fit_transform(df['smoker'])
df['region']=lb.fit_transform(df['region'])

In [8]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


In [9]:
x = df.drop(columns=['charges'])
y=df['charges']

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
x_train , x_test , y_train , y_test = train_test_split(x,y,test_size=0.2,
                                                       random_state=42)

In [12]:
from sklearn.tree import DecisionTreeRegressor

In [13]:
dt = DecisionTreeRegressor()

In [14]:
dt.fit(x_train , y_train)

In [15]:
y_pred = dt.predict(x_test)

In [16]:
from sklearn.metrics import r2_score

In [17]:
r2_score(y_test , y_pred)

0.7216173116340096

In [18]:
param_grid = {
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [21]:
from sklearn.model_selection import GridSearchCV

In [23]:
dtree_reg = DecisionTreeRegressor(random_state=42) # Initialize a decision tree regressor
grid_search = GridSearchCV(estimator=dtree_reg, param_grid=param_grid,
                           cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')
grid_search.fit(x_train, y_train)


Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [29]:
best_dtree_reg = grid_search.best_estimator_
best_dtree_reg

In [26]:
from sklearn.metrics import mean_squared_error

In [27]:
y_pred = best_dtree_reg.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")
print(f"Test RMSE: {rmse}")

Best parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
Test RMSE: 5127.997027573132


In [28]:
r2_score(y_test,y_pred)

0.8306179170540128

In [30]:
df = pd.read_csv('/content/supply_chain.csv')
df.head()

Unnamed: 0,Product type,SKU,Price,Availability,Number of products sold,Revenue generated,Customer demographics,Stock levels,Lead times,Order quantities,...,Location,Lead time,Production volumes,Manufacturing lead time,Manufacturing costs,Inspection results,Defect rates,Transportation modes,Routes,Costs
0,haircare,SKU0,69.808006,55,802,8661.996792,Non-binary,58,7,96,...,Mumbai,29,215,29,46.279879,Pending,0.22641,Road,Route B,187.752075
1,skincare,SKU1,14.843523,95,736,7460.900065,Female,53,30,37,...,Mumbai,23,517,30,33.616769,Pending,4.854068,Road,Route B,503.065579
2,haircare,SKU2,11.319683,34,8,9577.749626,Unknown,1,10,88,...,Mumbai,12,971,27,30.688019,Pending,4.580593,Air,Route C,141.920282
3,skincare,SKU3,61.163343,68,83,7766.836426,Non-binary,23,13,59,...,Kolkata,24,937,18,35.624741,Fail,4.746649,Rail,Route A,254.776159
4,skincare,SKU4,4.805496,26,871,2686.505152,Non-binary,5,3,56,...,Delhi,5,414,3,92.065161,Fail,3.14558,Air,Route A,923.440632


In [31]:
df.isnull().sum()

Unnamed: 0,0
Product type,0
SKU,0
Price,0
Availability,0
Number of products sold,0
Revenue generated,0
Customer demographics,0
Stock levels,0
Lead times,0
Order quantities,0


In [32]:
df.sample()

Unnamed: 0,Product type,SKU,Price,Availability,Number of products sold,Revenue generated,Customer demographics,Stock levels,Lead times,Order quantities,...,Location,Lead time,Production volumes,Manufacturing lead time,Manufacturing costs,Inspection results,Defect rates,Transportation modes,Routes,Costs
76,haircare,SKU76,69.1088,23,241,5328.375984,Male,38,1,22,...,Bangalore,25,985,24,64.323598,Pending,2.180037,Rail,Route A,997.41345


In [36]:
df.dtypes

Unnamed: 0,0
Product type,object
SKU,object
Price,float64
Availability,int64
Number of products sold,int64
Revenue generated,float64
Customer demographics,object
Stock levels,int64
Lead times,int64
Order quantities,int64


In [33]:
from sklearn.preprocessing import LabelEncoder

In [34]:
lb = LabelEncoder()

In [38]:
categorical = []

for i in df.select_dtypes(include = "object").columns:
    categorical.append(i)
for j in categorical:
    df[j] = lb.fit_transform(df[j])



In [39]:
df.head()

Unnamed: 0,Product type,SKU,Price,Availability,Number of products sold,Revenue generated,Customer demographics,Stock levels,Lead times,Order quantities,...,Location,Lead time,Production volumes,Manufacturing lead time,Manufacturing costs,Inspection results,Defect rates,Transportation modes,Routes,Costs
0,1,0,69.808006,55,802,8661.996792,2,58,7,96,...,4,29,215,29,46.279879,2,0.22641,2,1,187.752075
1,2,1,14.843523,95,736,7460.900065,0,53,30,37,...,4,23,517,30,33.616769,2,4.854068,2,1,503.065579
2,1,12,11.319683,34,8,9577.749626,3,1,10,88,...,4,12,971,27,30.688019,2,4.580593,0,2,141.920282
3,2,23,61.163343,68,83,7766.836426,2,23,13,59,...,3,24,937,18,35.624741,0,4.746649,1,0,254.776159
4,2,34,4.805496,26,871,2686.505152,2,5,3,56,...,2,5,414,3,92.065161,0,3.14558,0,0,923.440632


In [40]:
x = df.drop(columns=['Costs'])
y = df['Costs']

In [41]:
from sklearn.model_selection import train_test_split

In [42]:
x_train , x_test , y_train , y_test = train_test_split(x,y,test_size=0.2,
                                                       random_state=42)

In [43]:
from sklearn.tree import DecisionTreeRegressor

In [44]:
dt = DecisionTreeRegressor()

In [45]:
dt.fit(x_train , y_train)

In [46]:
y_pred = dt.predict(x_test)

In [47]:
from sklearn.metrics import r2_score

In [48]:
r2_score(y_test,y_pred)

-2.079410609539074

In [49]:
param_grid = {
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [50]:
from sklearn.model_selection import GridSearchCV

In [51]:
dtree_reg = DecisionTreeRegressor(random_state=42) # Initialize a decision tree regressor
grid_search = GridSearchCV(estimator=dtree_reg, param_grid=param_grid,
                           cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')
grid_search.fit(x_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [52]:
best_dtree_reg = grid_search.best_estimator_
best_dtree_reg

In [53]:
from sklearn.metrics import mean_squared_error

In [54]:
y_pred = best_dtree_reg.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")
print(f"Test RMSE: {rmse}")

Best parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10}
Test RMSE: 436.3426762303971


In [55]:
r2_score(y_test,y_pred)

-1.9212367571500653