In [None]:
# import sys
# !{sys.executable} -m pip install pandas

# import sys
# !{sys.executable} -m pip install --upgrade pip

In [None]:
# import sys
# !{sys.executable} -m pip install matplotlib scikit-learn


In [2]:
# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer

# Apply seaborn's "whitegrid" style so the figures drawn later in the
# notebook share one look.
sns.set(style="whitegrid")


## Data Overview

In [3]:
# Location of the sales CSV. The default is the original local path; set the
# SALES_DATA_CSV environment variable to run the notebook on another machine
# without editing this cell.
import os  # imported here so this cell is self-contained

DATA_PATH = os.environ.get(
    "SALES_DATA_CSV",
    r"C:\Users\fakisseh\Downloads\MACPROS\archive\sales_data_sample.csv",
)

# Load the raw sales data; the file is decoded as latin1.
df = pd.read_csv(DATA_PATH, encoding='latin1')

## Data Cleaning

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [4]:
# List every column with its non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2823 entries, 0 to 2822
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ORDERNUMBER       2823 non-null   int64  
 1   QUANTITYORDERED   2823 non-null   int64  
 2   PRICEEACH         2823 non-null   float64
 3   ORDERLINENUMBER   2823 non-null   int64  
 4   SALES             2823 non-null   float64
 5   ORDERDATE         2823 non-null   object 
 6   STATUS            2823 non-null   object 
 7   QTR_ID            2823 non-null   int64  
 8   MONTH_ID          2823 non-null   int64  
 9   YEAR_ID           2823 non-null   int64  
 10  PRODUCTLINE       2823 non-null   object 
 11  MSRP              2823 non-null   int64  
 12  PRODUCTCODE       2823 non-null   object 
 13  CUSTOMERNAME      2823 non-null   object 
 14  PHONE             2823 non-null   object 
 15  ADDRESSLINE1      2823 non-null   object 
 16  ADDRESSLINE2      302 non-null    object 


In [None]:
# Summary statistics (count, mean, spread, quartiles) for the frame.
df.describe()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Columns removed before modelling: order/contact identifiers and address
# fields (ADDRESSLINE2 is non-null in only 302 of 2823 rows).
COLUMNS_TO_DROP = [
    "ADDRESSLINE2",
    "TERRITORY",
    "ORDERNUMBER",
    "PHONE",
    "CONTACTLASTNAME",
    "CONTACTFIRSTNAME",
    "STATE",
    "POSTALCODE",
]

# `df` is overwritten here, so without errors="ignore" a second run of this
# cell would raise KeyError because the columns are already gone.
df = df.drop(columns=COLUMNS_TO_DROP, errors="ignore")

In [None]:
# Re-check the remaining columns and dtypes after the column drop.
df.info()

In [None]:
# Preview the frame again now that the unused columns are gone.
df.head()

## Data Preparation (Feature Engineering)

In [None]:
# Parse the order date once, store it back as a datetime column, and derive
# the day-of-month feature from the parsed values.
order_dates = pd.to_datetime(df["ORDERDATE"])
df["ORDERDATE"] = order_dates
df["DAY"] = order_dates.dt.day

In [None]:
# Input columns for the models: quantity/price fields, date parts, and the
# three categorical columns.
feature = [
    "QUANTITYORDERED",
    "PRICEEACH",
    "MSRP",
    "YEAR_ID",
    "MONTH_ID",
    "DAY",
    "QTR_ID",
    "PRODUCTLINE",
    "COUNTRY",
    "DEALSIZE",
]

In [None]:
# Keep only the model inputs plus the SALES target, then one-hot encode the
# categorical columns (drop_first=True omits the first level of each).
model_columns = feature + ["SALES"]
df_model = df[model_columns]
df_encoded = pd.get_dummies(
    df_model,
    columns=["PRODUCTLINE", "DEALSIZE", "COUNTRY"],
    drop_first=True,
)

In [None]:
# Separate the SALES target from the encoded predictor columns.
y = df_encoded["SALES"]
X = df_encoded.drop("SALES", axis=1)

## Train/Test Split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Model Training

### Baseline: Linear Regression

In [None]:
# Baseline model: linear regression with default settings.
linear = LinearRegression()

In [None]:
# Fit the baseline on the training split.
linear.fit(X_train,y_train)

### Stronger Model: Random Forest

In [None]:
# Random forest with 300 trees, no depth limit, and a fixed seed so the
# fitted model is reproducible.
rf_reg = RandomForestRegressor(n_estimators=300,max_depth=None,random_state=42)

In [None]:
# Fit the random forest on the same training split as the baseline.
rf_reg.fit(X_train,y_train)

## Evaluation

In [None]:
def eval_model(model, name, X_eval=None, y_eval=None):
    """Print MAE, RMSE and R² for a fitted model.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    name
        Label printed as the heading of the report.
    X_eval, y_eval
        Optional features and targets to score against. When both are
        omitted, the notebook-level ``X_test`` / ``y_test`` are used, which
        matches the previous behaviour. Passing them allows scoring on
        another split (for example the training data).

    Returns
    -------
    None
        The metrics are printed, not returned.

    Raises
    ------
    ValueError
        If only one of ``X_eval`` / ``y_eval`` is supplied.
    """
    # Features and targets must come from the same split.
    if (X_eval is None) != (y_eval is None):
        raise ValueError("X_eval and y_eval must be provided together")
    if X_eval is None:
        X_eval, y_eval = X_test, y_test

    pred = model.predict(X_eval)
    print(f"{name}")
    print("  MAE :", mean_absolute_error(y_eval, pred))
    # RMSE is the square root of the mean squared error.
    print("  RMSE:", np.sqrt(mean_squared_error(y_eval, pred)))
    print("  R²  :", r2_score(y_eval, pred), "\n")

# Report the same metrics for both fitted models, baseline first.
for fitted_model, label in (
    (linear, "Linear Regression"),
    (rf_reg, "Random Forest"),
):
    eval_model(fitted_model, label)

## Feature Importance

In [None]:
# Pair each importance score from the forest with its feature name and keep
# the 15 highest-scoring features.
importances = pd.Series(rf_reg.feature_importances_, index=X.columns)
top_feats = importances.sort_values(ascending=False).head(15)

# Horizontal bar chart: importance on the x-axis, feature name on the y-axis.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=top_feats, y=top_feats.index, ax=ax)
ax.set_title("Random Forest – Top Feature Importances")
ax.set_xlabel("Importance")
ax.set_ylabel("")
fig.tight_layout()
plt.show()
