In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
# Custom decision stump implementation (a simple split on one feature)
class DecisionStump:
    """One-level regression tree: a single threshold split on one feature.

    After ``fit`` the stump predicts ``left_value`` for rows whose selected
    feature is ``<= threshold`` and ``right_value`` for every other row.
    All four attributes are ``None`` while the stump is unfitted.
    """

    def __init__(self):
        self.feature_index = None
        self.threshold = None
        self.left_value = None
        self.right_value = None

    def fit(self, X, y):
        """Pick the (feature, threshold) split with the lowest mean squared error.

        Every unique value of every feature column is tried as a threshold.
        Each side of the split predicts the mean of its targets; a side with
        no rows predicts 0. Ties keep the first split encountered.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of n_samples numeric targets.

        If no candidate split can be scored (for example when ``X`` has no
        rows), the stump is left unfitted.
        """
        X = np.asarray(X)
        y = np.asarray(y, dtype=float)
        _, n_features = X.shape

        # Track the winner locally and publish it once at the end so a refit
        # never leaves a mix of old and new state on the instance.
        best_mse = float('inf')
        best_split = (None, None, None, None)

        for feature_index in range(n_features):
            column = X[:, feature_index]  # hoisted: invariant across thresholds
            for threshold in np.unique(column):
                left_y = y[column <= threshold]
                right_y = y[column > threshold]

                # Both sides are empty only when the threshold is NaN; such a
                # split cannot be scored, so skip it.
                if left_y.size + right_y.size == 0:
                    continue

                left_mean = left_y.mean() if left_y.size else 0.0
                right_mean = right_y.mean() if right_y.size else 0.0

                # Residuals are laid out left side first, then right side.
                residuals = np.concatenate([left_y - left_mean, right_y - right_mean])
                mse = np.mean(residuals ** 2)

                if mse < best_mse:
                    best_mse = mse
                    best_split = (feature_index, threshold, left_mean, right_mean)

        (self.feature_index, self.threshold,
         self.left_value, self.right_value) = best_split

    def predict(self, X):
        """Return the stump's prediction for every row of ``X``.

        Args:
            X: 2-D array-like with the same column layout used in ``fit``.

        Returns:
            1-D NumPy array holding ``left_value`` or ``right_value`` per row.

        Raises:
            RuntimeError: if the stump has not been fitted.
        """
        if self.feature_index is None:
            raise RuntimeError("DecisionStump must be fitted before calling predict")
        X = np.asarray(X)
        goes_left = X[:, self.feature_index] <= self.threshold
        return np.where(goes_left, self.left_value, self.right_value)


# Custom Gradient Boosting class
class CustomGradientBoosting:
    """Gradient boosting regressor for squared error, built from decision stumps.

    The model starts from the mean of the training targets and then adds
    ``n_estimators`` stumps. Each stump is fitted to the residuals left by the
    current ensemble, and its output is scaled by ``learning_rate`` before
    being added to the running prediction.
    """

    def __init__(self, n_estimators=100, learning_rate=0.1):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.stumps = []
        # Set by fit(); None marks the model as unfitted.
        self.initial_prediction = None

    def fit(self, X, y):
        """Train the ensemble on ``X`` and ``y``, replacing any earlier fit.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of n_samples numeric targets.
        """
        X = np.asarray(X)
        y = np.asarray(y, dtype=float)

        # Baseline model: predict the target mean for every row.
        initial_prediction = np.mean(y)
        predictions = np.full(len(y), initial_prediction)

        # Build into a fresh list so that calling fit() again (e.g. re-running
        # the notebook cell) does not pile new stumps on top of stale ones.
        stumps = []
        for _ in range(self.n_estimators):
            residuals = y - predictions  # what the ensemble still gets wrong

            stump = DecisionStump()
            stump.fit(X, residuals)
            stumps.append(stump)

            # Shrink each correction by the learning rate before adding it.
            predictions += self.learning_rate * stump.predict(X)

        self.initial_prediction = initial_prediction
        self.stumps = stumps

    def predict(self, X):
        """Return predictions for every row of ``X``.

        Args:
            X: 2-D array-like with the same column layout used in ``fit``.

        Returns:
            1-D NumPy array: the initial mean plus the scaled output of each stump.

        Raises:
            RuntimeError: if the model has not been fitted.
        """
        if self.initial_prediction is None:
            raise RuntimeError("CustomGradientBoosting must be fitted before calling predict")

        X = np.asarray(X)
        predictions = np.full(X.shape[0], self.initial_prediction)

        for stump in self.stumps:
            predictions += self.learning_rate * stump.predict(X)

        return predictions




In [3]:
# Read the training and test tables from CSV files in the working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [4]:
def clean_sales_frame(frame, item_weight_fill):
    """Apply the shared cleaning steps to one sales table.

    The frame is modified in place and also returned.

    Args:
        frame: DataFrame with 'Item_Weight', 'Outlet_Size' and
            'Item_Fat_Content' columns.
        item_weight_fill: value used to replace missing 'Item_Weight' entries.

    Returns:
        The same DataFrame, with missing weights and outlet sizes filled and
        the fat-content spellings normalised.
    """
    frame['Item_Weight'] = frame['Item_Weight'].fillna(item_weight_fill)
    frame['Outlet_Size'] = frame['Outlet_Size'].fillna("Unknown")
    # Collapse the alternate spellings onto the two canonical labels.
    frame['Item_Fat_Content'] = frame['Item_Fat_Content'].replace({
        'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'
    })
    return frame


# Impute missing weights in BOTH tables with the training mean, so the test
# set is preprocessed with exactly the statistics the model was trained on
# rather than with a value that depends on the test batch itself.
item_weight_mean = train_data['Item_Weight'].mean()

train_data = clean_sales_frame(train_data, item_weight_mean)
test_data = clean_sales_frame(test_data, item_weight_mean)



In [5]:
# LabelEncoder instance for turning category labels into integer codes.
label_encoder = LabelEncoder()

In [6]:
# Encode categorical features as integer codes in both tables.
#
# Each column gets its own encoder, fitted on the TRAINING column only and then
# reused for the test column. Fitting a fresh encoder on the test data would
# assign codes from the test set's own categories, so the same label could get
# a different code than in training whenever the two category sets differ.
categorical_columns = [
    'Item_Type',
    'Outlet_Identifier',
    'Outlet_Location_Type',
    'Outlet_Type',
    'Item_Fat_Content',
    'Outlet_Size',
]

# Keep the fitted encoders so codes can be mapped back to labels later
# (encoder.inverse_transform) or applied to further data.
label_encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder().fit(train_data[column])
    label_encoders[column] = encoder

    train_data[column] = encoder.transform(train_data[column])
    # transform() raises ValueError for a label that never occurred in
    # training, which surfaces the mismatch instead of silently mis-encoding.
    test_data[column] = encoder.transform(test_data[column])

In [8]:
# Training feature matrix: the seven predictor columns, in this fixed order.
X_train = train_data.loc[:, [
    'Item_Type',
    'Item_MRP',
    'Outlet_Identifier',
    'Outlet_Location_Type',
    'Outlet_Type',
    'Item_Fat_Content',
    'Outlet_Size',
]].to_numpy()

# Regression target taken from the 'Item_Outlet_Sales' column.
y_train = train_data['Item_Outlet_Sales'].to_numpy()


In [9]:
# Test feature matrix: same seven columns, in the same order as X_train.
X_test = test_data.loc[:, [
    'Item_Type',
    'Item_MRP',
    'Outlet_Identifier',
    'Outlet_Location_Type',
    'Outlet_Type',
    'Item_Fat_Content',
    'Outlet_Size',
]].to_numpy()

In [17]:
# Ensemble of 2000 stumps; each stump's output is scaled by 0.01 before it is added.
custom_gb = CustomGradientBoosting(n_estimators=2000, learning_rate=0.01)

In [18]:
# Train the ensemble: every boosting round fits one new stump to the current residuals.
custom_gb.fit(X_train, y_train)

In [16]:
# Predict on the rows the model was trained on.
y_train_pred = custom_gb.predict(X_train)

# Root-mean-squared error between the actual and predicted training targets.
train_errors = y_train - y_train_pred
rmse = np.sqrt(np.mean(train_errors ** 2))
print(f'Custom Gradient Boosting RMSE on training data: {rmse}')

Custom Gradient Boosting RMSE on training data: 1171.4597970399616


In [None]:
y_test_pred = custom_gb.predict(X_test)

# test_data['Outlet_Identifier'] was overwritten with integer label codes when
# the features were encoded, so the original outlet IDs are re-read from the
# raw file for the submission. The rows line up positionally because no rows
# of test_data were dropped or reordered after loading.
raw_outlet_ids = pd.read_csv('test.csv', usecols=['Outlet_Identifier'])['Outlet_Identifier']
assert len(raw_outlet_ids) == len(test_data), "test.csv no longer matches test_data"

# Preparing predictions for submission
submission = pd.DataFrame({
    'Item_Identifier': test_data['Item_Identifier'],
    # .to_numpy() drops the index so the IDs are matched by position.
    'Outlet_Identifier': raw_outlet_ids.to_numpy(),
    'Item_Outlet_Sales': y_test_pred
})

# Save predictions to CSV
submission.to_csv('sales_predictions_custom_gb.csv', index=False)
print("Predictions saved to 'sales_predictions_custom_gb.csv'")
