**Random Forest Regression**

In [15]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [16]:
# Read the retail sales CSV (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer='business.retailsales.csv')

In [17]:
# Preview the first five rows of the raw frame
df.head(n=5)

Unnamed: 0,Product Type,Net Quantity,Gross Sales,Discounts,Returns,Total Net Sales
0,Art & Sculpture,34,14935.0,-594.0,-1609.0,12732.0
1,Basket,13,3744.0,-316.8,0.0,3427.2
2,Basket,12,3825.0,-201.6,-288.0,3335.4
3,Basket,17,3035.0,-63.25,0.0,2971.75
4,Art & Sculpture,47,2696.8,-44.16,0.0,2652.64


In [18]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1775 entries, 0 to 1774
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Product Type     1767 non-null   object 
 1   Net Quantity     1775 non-null   int64  
 2   Gross Sales      1775 non-null   float64
 3   Discounts        1775 non-null   float64
 4   Returns          1775 non-null   float64
 5   Total Net Sales  1775 non-null   float64
dtypes: float64(4), int64(1), object(1)
memory usage: 83.3+ KB


In [19]:
# Count missing values per column before any imputation
df.isnull().sum(axis=0)

Product Type       8
Net Quantity       0
Gross Sales        0
Discounts          0
Returns            0
Total Net Sales    0
dtype: int64

In [20]:
# Handling missing values
# Bucket the columns by dtype so each bucket can be imputed with its own strategy.
categorical_cols = df.select_dtypes(
    include="object",
).columns
numerical_cols = df.select_dtypes(
    include=["int64", "float64"],
).columns

In [21]:
# Numeric columns: fill gaps with the per-column mean.
numeric_means = df[numerical_cols].mean()
df[numerical_cols] = df[numerical_cols].fillna(numeric_means)

# Categorical columns: fill gaps with the per-column mode.
# mode() returns a frame (one row per tied mode), so the first row is taken.
category_modes = df[categorical_cols].mode().iloc[0]
df[categorical_cols] = df[categorical_cols].fillna(category_modes)

In [22]:
# Converting object columns to numerical using OneHotEncoder
# The fitted encoder is kept so the same category -> column mapping can be reused later.
encoder = OneHotEncoder()
encoded_matrix = encoder.fit_transform(df[categorical_cols])
# Convert the encoder output to a dense array so it can be wrapped in a DataFrame.
encoded_cols = encoded_matrix.toarray()

In [23]:
# Column labels for the one-hot output, derived from the categorical column names
feature_names = encoder.get_feature_names_out(input_features=categorical_cols)

In [24]:
# Rebuild df as: original numeric columns followed by the one-hot columns.
# pd.concat(axis=1) aligns rows on index labels, not on position, so the
# encoded frame is given df's own index explicitly. Without it the encoded
# frame gets a fresh 0..n-1 index and any non-default index on df would
# silently produce misaligned / NaN rows.
encoded_df = pd.DataFrame(encoded_cols, columns=feature_names, index=df.index)
df = pd.concat([df[numerical_cols], encoded_df], axis=1)

In [25]:
# Re-check missing values per column after imputation and encoding
df.isnull().sum(axis=0)

Net Quantity                     0
Gross Sales                      0
Discounts                        0
Returns                          0
Total Net Sales                  0
Product Type_Accessories         0
Product Type_Art & Sculpture     0
Product Type_Basket              0
Product Type_Christmas           0
Product Type_Easter              0
Product Type_Fair Trade Gifts    0
Product Type_Furniture           0
Product Type_Gift Baskets        0
Product Type_Home Decor          0
Product Type_Jewelry             0
Product Type_Kids                0
Product Type_Kitchen             0
Product Type_Music               0
Product Type_One-of-a-Kind       0
Product Type_Recycled Art        0
Product Type_Skin Care           0
Product Type_Soapstone           0
Product Type_Textiles            0
dtype: int64

In [26]:
# Splitting the data
x = df.iloc[:, 1:-1].values
y = df.iloc[:, -1].values

In [27]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [28]:
# Training the Random Forest Regression model on the whole dataset
regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
regressor.fit(x, y)

In [29]:
# Create a new input data point with 21 features
input_data = [[6.5, 88, 70, 0.842, 43, 0.37, 21, 31, 0.534, 0.654] + [0] * 11]

# Predict the result
prediction = regressor.predict(input_data)[0]

# Print the prediction
print(f"Predicted value: {prediction}")

Predicted value: 0.1


In [31]:
# Predicting the Test set results
np.set_printoptions(precision=2)
y_pred = regressor.predict(x_test)
# Side-by-side view: first column is the prediction, second is the actual value
print(np.column_stack((y_pred, y_test)))

[[0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.1 0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.1 0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.1 0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]
 [0.1 0. ]
 [0.  0. ]
 [0.  0. ]
 [0.  0. ]

In [32]:
# Coefficient of determination (R^2) of the predictions against the test-split targets
r2_score(y_test, y_pred)

0.7938385269121813