In [1]:
import pandas as pd

In [2]:
# Read the raw transactions and parse the "Order Date" column into datetime
# values in one chain, so the `.dt` accessor can be used on it later.
# NOTE(review): no explicit date format is passed, so pandas infers it —
# confirm the inferred day/month order matches the file.
df = pd.read_csv("supermarket.csv").assign(
    **{"Order Date": lambda orders: pd.to_datetime(orders["Order Date"])}
)

In [3]:
# Build one row per customer: 2015 behaviour (features) next to 2016 revenue
# (target). Only customers present in every intermediate table survive the
# inner merges at the end.

# Calendar year of each row, derived from the "Order Date" column. Stored on
# df because both yearly slices below filter on it.
df["year"] = df["Order Date"].dt.year

# Reference date for the "days since ..." features: the last day of 2015.
year_end_2015 = pd.Timestamp("2015-12-31")

# ---- 2015 slice -----------------------------------------------------------
df_2015 = df.loc[df["year"].eq(2015)]
revenue_by_customer_2015 = df_2015.groupby("Customer ID")["Revenue"]

# Sum of Revenue over each customer's 2015 rows.
total_revenue_2015 = (
    revenue_by_customer_2015.sum().rename("revenue_2015").reset_index()
)

# Mean Revenue over each customer's 2015 rows.
# NOTE(review): this is an "average order value" only if every row is one
# order (not one line item) — confirm against the data.
average_order_value_2015 = (
    revenue_by_customer_2015.mean()
    .rename("average_order_value_2015")
    .reset_index()
)

# Number of 2015 rows per customer.
total_purchases_2015 = (
    df_2015.groupby("Customer ID")
    .size()
    .rename("total_purchases_2015")
    .reset_index()
)

# Latest 2015 order date per customer, expressed as whole days before the
# end of 2015.
most_recent_purchase_2015 = df_2015.groupby("Customer ID")["Order Date"].max()
days_since_purchase = (year_end_2015 - most_recent_purchase_2015).dt.days
df_days_since_purchase = days_since_purchase.rename(
    "days_since_last_purchase_2015"
).reset_index()

# ---- Customer tenure ------------------------------------------------------
# Earliest order date per customer across ALL rows of df (not just 2015),
# expressed as whole days before the end of 2015.
first_purchase = df.groupby("Customer ID")["Order Date"].min()
days_since_first_purchase = (year_end_2015 - first_purchase).dt.days
df_days_since_first_purchase = days_since_first_purchase.rename(
    "days_since_first_purchase_till_2015"
).reset_index()

# ---- 2016 slice (prediction target) ---------------------------------------
df_2016 = df.loc[df["year"].eq(2016)]

# Sum of Revenue over each customer's 2016 rows.
total_revenue_2016 = (
    df_2016.groupby("Customer ID")["Revenue"].sum().rename("revenue_2016").reset_index()
)

# ---- Assemble the modelling table -----------------------------------------
# Start from the 2016 target and inner-join each feature table in turn. The
# join order fixes the column order of customer_revenue:
# Customer ID, revenue_2016, then the five feature columns.
customer_revenue = total_revenue_2016
for feature_table in (
    total_revenue_2015,
    average_order_value_2015,
    total_purchases_2015,
    df_days_since_purchase,
    df_days_since_first_purchase,
):
    customer_revenue = customer_revenue.merge(
        feature_table, on="Customer ID", how="inner"
    )

In [4]:
# customer_revenue

In [5]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Predictors are every column except the customer key and the 2016 target.
# Dropping them by name (rather than slicing columns[2:]) keeps the selection
# correct even if the column order of customer_revenue changes, and raises a
# KeyError if either expected column is missing.
x_cols = customer_revenue.columns.drop(["Customer ID", "revenue_2016"])
X = customer_revenue[x_cols]
y = customer_revenue["revenue_2016"]

# Fixed random_state so the train/test split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)

# Recursive feature elimination wrapped around a plain linear regression.
# NOTE(review): customer_revenue carries five predictor columns, so
# n_features_to_select=5 keeps all of them and nothing is eliminated (every
# feature gets rank 1). Lower the value to obtain an actual selection.
rfe = RFE(estimator=LinearRegression(), n_features_to_select=5)
rfe.fit(X_train, y_train)

# Print each feature RFE kept together with its rank.
for feature_name, is_selected, rank in zip(
    X_train.columns, rfe.support_, rfe.ranking_
):
    if is_selected:
        print("Feature: {}, Rank: {}".format(feature_name, rank))

Feature: revenue_2015, Rank: 1
Feature: average_order_value_2015, Rank: 1
Feature: total_purchases_2015, Rank: 1
Feature: days_since_last_purchase_2015, Rank: 1
Feature: days_since_first_purchase_till_2015, Rank: 1


In [6]:
from sklearn.metrics import mean_squared_error

# Fit the regression only on the columns flagged in rfe.support_, so that
# rfe_model actually reflects the feature selection. With the current
# n_features_to_select=5 on five predictors the mask keeps every column, so
# this is the full feature set.
selected_features = X_train.columns[rfe.support_]

rfe_model = LinearRegression()
rfe_model.fit(X_train[selected_features], y_train)
rfe_predictions = rfe_model.predict(X_test[selected_features])

# Root-mean-squared error on the held-out split. scikit-learn documents the
# argument order as (y_true, y_pred).
print(mean_squared_error(y_test, rfe_predictions) ** 0.5)

22057.037433420915
