In [17]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from cslib import convert_to_ts

# Resolve the data directory. The default is the original hard-coded location;
# set the AI_WORKFLOW_DATA_DIR environment variable to use a different folder
# without editing this cell (an empty value falls back to the default).
data_dir = os.environ.get("AI_WORKFLOW_DATA_DIR") or "E:/My-AI-workflow-capstone/data"
csv_file = os.path.join(data_dir, 'data.csv')
os.makedirs("figures", exist_ok=True)  # Output folder for the plots saved below

# Load the raw CSV and make sure the invoice_date column holds datetimes
df_raw = pd.read_csv(csv_file)
df_raw['invoice_date'] = pd.to_datetime(df_raw['invoice_date'])

# Convert to a time series using cslib. country=None is passed to aggregate
# all data; the exact output schema is defined by cslib.convert_to_ts
# (the plots below rely on 'date' and 'revenue' columns being present).
df_ts = convert_to_ts(df_raw, country=None)

# Plot 1: total revenue over time, drawn as a line chart and saved to disk
fig, ax = plt.subplots(figsize=(8, 5))
revenue_by_date = df_ts.set_index('date')['revenue']
revenue_by_date.plot(ax=ax)
ax.set_title("Total Revenue Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Revenue")
fig.tight_layout()
fig.savefig("figures/total_revenue_time.png")
plt.close(fig)  # Release the figure once it has been written

# Plot 2: summed price per country, largest first, with countries on the y-axis
fig, ax = plt.subplots(figsize=(8, 5))
price_by_country = df_raw.groupby('country')['price'].sum()
country_revenue = price_by_country.sort_values(ascending=False)
sns.barplot(x=country_revenue.values, y=country_revenue.index, ax=ax)
ax.set_title("Revenue by Country")
ax.set_xlabel("Revenue")
ax.set_ylabel("Country")
fig.tight_layout()
fig.savefig("figures/revenue_by_country.png")
plt.close(fig)  # Release the figure once it has been written

# Plot 3: daily revenue from 2017-11-28 through 2019-07-31 (bar plot)
plt.figure(figsize=(8, 5))
# NOTE: despite its name, nov_dates spans the whole plotting window, not just
# November. The name is kept so any other cell that uses it keeps working.
nov_dates = pd.date_range("2017-11-28", "2019-07-31")
# Sum revenue per date once, then align to the plotting window. Dates with no
# rows in df_ts get 0, which is what summing an empty selection would give.
# This replaces re-filtering the whole frame once per date.
revenue_per_date = df_ts.groupby('date')['revenue'].sum()
daily_revenue = revenue_per_date.reindex(nov_dates, fill_value=0).tolist()
# The three colors are cycled across the bars by matplotlib.
plt.bar(nov_dates, daily_revenue, color=["#36A2EB", "#FF6384", "#FFCE56"])
plt.title("Daily Revenue from Nov 2017 to July 2019")
plt.xlabel("Date")
plt.ylabel("Revenue")
# Leave 10% headroom above the tallest bar. When no day has positive revenue,
# fall back to a fixed upper limit so the y-axis never collapses to (0, 0).
peak_revenue = max(daily_revenue, default=0)
plt.ylim(0, peak_revenue * 1.1 if peak_revenue > 0 else 1)
plt.tight_layout()
plt.savefig("figures/daily_revenue_from_2017_to_2019.png")
plt.close()

print("EDA plots saved in 'figures/' directory")

EDA plots saved in 'figures/' directory


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

def engineer_features(df):
    """Add lagged-revenue features computed within each country.

    For every row, the revenue found 1, 7, 30 and 90 rows earlier in the same
    ``country`` group is stored in ``prev_day_revenue``, ``prev_week_revenue``,
    ``prev_month_revenue`` and ``prev_3month_revenue`` respectively. The lags
    are positional (row offsets), so they only correspond to calendar lags
    when each country's rows are in chronological order with one row per day.

    Args:
        df: DataFrame with at least ``country`` and ``revenue`` columns.
            It is left unmodified.

    Returns:
        A new DataFrame containing the original columns plus the four lag
        columns, with every missing value in the frame replaced by 0. This
        includes lags that reach back before the start of a country's rows.

    Raises:
        KeyError: If ``country`` or ``revenue`` is not a column of ``df``.
    """
    lag_rows = {
        'prev_day_revenue': 1,
        'prev_week_revenue': 7,
        'prev_month_revenue': 30,
        'prev_3month_revenue': 90,
    }
    # Group once and reuse the grouping for every lag.
    revenue_by_country = df.groupby('country')['revenue']
    lagged = {column: revenue_by_country.shift(lag) for column, lag in lag_rows.items()}
    # assign() builds a new frame, so the caller's DataFrame does not silently
    # gain NaN-filled lag columns as a side effect of calling this function.
    return df.assign(**lagged).fillna(0)

# NOTE(review): feature_matrix is not created in the visible cells. Confirm it
# exists (with 'country', 'revenue' and 'date' columns) before this cell runs.
feature_matrix = engineer_features(feature_matrix)

# Hold out the most recent rows for testing: every row dated within 30 days of
# the latest date goes to the test set. The boundary day itself is included,
# so the test window spans 31 calendar days.
latest_date = feature_matrix['date'].max()
split_date = latest_date - pd.Timedelta(days=30)
is_before_split = feature_matrix['date'] < split_date
is_from_split = feature_matrix['date'] >= split_date
train = feature_matrix[is_before_split]
test = feature_matrix[is_from_split]

# Both splits use the same four lagged-revenue predictors and the same target
lag_features = [
    'prev_day_revenue',
    'prev_week_revenue',
    'prev_month_revenue',
    'prev_3month_revenue',
]
X_train = train[lag_features]
y_train = train['revenue']
X_test = test[lag_features]
y_test = test['revenue']

# Baseline: predict the training-set mean revenue for every test row.
# np.full_like would inherit y_test's dtype and silently truncate the mean if
# revenue were stored as integers, so build an explicit float array instead.
baseline_pred = np.full(len(y_test), y_train.mean(), dtype=float)
baseline_mse = mean_squared_error(y_test, baseline_pred)

# Linear Regression on the lagged-revenue features
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
lr_mse = mean_squared_error(y_test, lr_pred)

# Random Forest; random_state is fixed so repeated runs give the same model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_pred)

# Compare the test-set MSE of the three models side by side in a bar chart
model_names = ['Baseline', 'Linear Regression', 'Random Forest']
model_mses = [baseline_mse, lr_mse, rf_mse]
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(model_names, model_mses)
ax.set_title('Model Comparison: Mean Squared Error')
ax.set_ylabel('MSE')
fig.savefig('model_comparison.png')  # Written to the current working directory
plt.close(fig)