In [None]:
import io
import os
from pathlib import Path

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Directory holding the three input CSV files. Set the ZEOTAP_DATA_DIR
# environment variable to point the notebook at another location; the
# original local folder remains the default.
DATA_DIR = Path(os.environ.get("ZEOTAP_DATA_DIR", r"E:\assignment zeotap\all_tasks"))

customers = pd.read_csv(DATA_DIR / "Customers.csv")
products = pd.read_csv(DATA_DIR / "Products.csv")
transactions = pd.read_csv(DATA_DIR / "Transactions.csv")

In [None]:
# Inspect the column labels actually present in the transactions table
transaction_columns = transactions.columns
print(transaction_columns)

In [None]:
# Data types and non-null counts. info() prints its report itself and
# returns None, so it is called directly rather than wrapped in print().
customers.info()
print(customers.isnull().sum())  # Check missing values
print(transactions.describe())  # Summary statistics

In [None]:
# Display basic information about the datasets
def _info_report(frame):
    """Return the text report produced by ``frame.info()`` as a string."""
    buffer = io.StringIO()
    frame.info(buf=buffer)  # info() writes into `buf`; its own return value is None
    return buffer.getvalue()

# Keep each report as text so these names hold real content instead of None.
transactions_info = _info_report(transactions)
customers_info = _info_report(customers)
products_info = _info_report(products)
print(transactions_info, customers_info, products_info, sep="\n")

In [None]:
# Count the missing entries per column for each of the three tables,
# keyed by table name.
missing_values = {
    table_name: table.isnull().sum()
    for table_name, table in (
        ("Transactions", transactions),
        ("Customers", customers),
        ("Products", products),
    )
}

In [None]:
# Summary statistics (describe()) for each table, in the order
# transactions, customers, products.
transactions_stats, customers_stats, products_stats = (
    table.describe() for table in (transactions, customers, products)
)

In [None]:
# Keep a preview of the leading rows of each table (head() defaults to 5 rows)
preview_rows = 5
transactions_head = transactions.head(preview_rows)
customers_head = customers.head(preview_rows)
products_head = products.head(preview_rows)

In [None]:
# Show the collected overview objects. The .info() reports are printed by
# their own cell, so they are not repeated in this tuple.
(
    missing_values,
    transactions_stats,
    customers_stats,
    products_stats,
    transactions_head,
    customers_head,
    products_head,
)

In [None]:
# Fill missing values with appropriate strategies.
# A missing Quantity is treated as a single unit. A missing TotalValue is
# rebuilt as Price x Quantity (using the filled quantity) so it stays
# consistent with the units on the row instead of always equalling one unit's price.
filled_quantity = transactions["Quantity"].fillna(1)
transactions = transactions.fillna(
    {"Quantity": 1, "TotalValue": transactions["Price"] * filled_quantity}
)
customers = customers.fillna({"Region": "Unknown"})
products = products.fillna({"Category": "Misc"})

In [None]:
'''--- BUSINESS INSIGHT 1 
1. Customer Segmentation
K-Means clustering (with k fixed at 4) splits customers into four groups based on total spending and total quantity purchased. 
This segmentation highlights unique behavioral patterns, enabling tailored marketing strategies. 
High-value customers can be prioritized for premium services, while less-engaged groups may benefit from targeted promotions.'''

In [None]:
# Feature Engineering for Customer Segmentation
# Total spend and total units bought per customer.
customer_spending = transactions.groupby("CustomerID")[["TotalValue", "Quantity"]].sum()

# Left join keeps customers without any transaction; only their two spending
# columns are zero-filled so other customer attributes are not overwritten with 0.
customer_data = customers.merge(customer_spending, on="CustomerID", how="left").fillna(
    {"TotalValue": 0, "Quantity": 0}
)

In [None]:
# Standardize the two spending features before clustering
segmentation_columns = ['TotalValue', 'Quantity']
scaler = StandardScaler()
scaler.fit(customer_data[segmentation_columns])
scaled_features = scaler.transform(customer_data[segmentation_columns])

In [None]:
# Apply K-Means Clustering
# n_init is pinned explicitly: its default differs between scikit-learn
# releases, so leaving it implicit makes the segments environment-dependent
# (and triggers a FutureWarning on some versions).
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
customer_data['Cluster'] = kmeans.fit_predict(scaled_features)

In [None]:
# Per-cluster mean and total of spend and units purchased
cluster_aggregations = {"TotalValue": ["mean", "sum"], "Quantity": ["mean", "sum"]}
cluster_summary = customer_data.groupby("Cluster").agg(cluster_aggregations)
print(cluster_summary)

In [None]:
'''--- BUSINESS INSIGHT 2 
2. Sales Trends
Monthly sales trend analysis revealed significant revenue fluctuations. 
The visualization suggests a focus on peak months for campaigns and optimizing inventory during slower periods. 
Seasonal sales insights guide resource allocation and promotional planning effectively.'''

In [None]:
# Convert transaction date to datetime and use it as the index so the frame
# can be resampled by calendar period.
# Guarded on the column still being present: once it has been moved into the
# index, re-running this cell would otherwise raise a KeyError.
if "TransactionDate" in transactions.columns:
    transactions["TransactionDate"] = pd.to_datetime(transactions["TransactionDate"])
    transactions = transactions.set_index("TransactionDate")

In [None]:
# Resample sales data by month
# Only the additive measures are summed. Summing every column would also
# "add up" the string ID columns and the per-unit Price, which is meaningless.
# NOTE(review): the 'M' alias is kept for older pandas; newer releases
# prefer 'ME' for month-end - confirm against the installed version.
monthly_sales = transactions.resample('M')[["TotalValue", "Quantity"]].sum()

In [None]:
# Line chart of monthly revenue, one marker per month
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=monthly_sales, x=monthly_sales.index, y="TotalValue", marker="o", ax=ax)
ax.set_title("Monthly Sales Trend")
ax.set_xlabel("Date")
ax.set_ylabel("Revenue")
plt.show()

In [None]:
'''--- BUSINESS INSIGHT 3
3. Profitable Products:
Top 10 products contributing to revenue were identified through transaction data. 
Visual analysis highlights their demand and revenue contribution, offering insights for optimizing inventory and focusing marketing on these top sellers (products are ranked by total revenue; margins are not computed here).'''

In [None]:
# Total revenue per product, with ProductID kept as a regular column
product_sales = transactions.groupby("ProductID", as_index=False)["TotalValue"].sum()

In [None]:
# Merge with product details
# Start from just the key and the revenue total so that re-running this cell
# does not merge the product columns in a second time (which would produce
# suffixed duplicates such as ProductName_x / ProductName_y).
product_sales = product_sales[["ProductID", "TotalValue"]].merge(products, on="ProductID")

In [None]:
# Bar chart of the ten products with the largest total revenue
top_products = product_sales.nlargest(10, "TotalValue")
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=top_products, x="ProductName", y="TotalValue", palette="coolwarm", ax=ax)
ax.set_title("Top 10 Most Profitable Products")
plt.xticks(rotation=45)  # tilt the product names so they do not overlap
plt.show()

In [None]:
'''--- BUSINESS INSIGHT 4
4. Revenue Prediction:
Linear regression model trained to predict revenue using quantity and price as features.
Performance metrics (MAE, MSE, R²) are computed on the held-out test split and printed by the model-evaluation cell; use them to judge accuracy for future forecasting.'''

In [None]:
# Reuse the cleaned transactions frame instead of re-reading the CSV: a fresh
# read would discard the missing-value fills applied earlier in the notebook
# and repeat the hard-coded file path. The sales-trend section moved
# TransactionDate into the index, so restore it as a regular column here.
# The guard keeps the cell safe to re-run.
if transactions.index.name == "TransactionDate":
    transactions = transactions.reset_index()

In [None]:
# Make sure TransactionDate holds datetime values
parsed_dates = pd.to_datetime(transactions["TransactionDate"])
transactions = transactions.assign(TransactionDate=parsed_dates)

In [None]:
# Cast the three numeric columns to their intended types in one step.
# NOTE(review): the int cast raises if Quantity contains missing values.
numeric_dtypes = {"TotalValue": float, "Quantity": int, "Price": float}
transactions = transactions.astype(numeric_dtypes)

In [None]:
# Revenue over time: total TotalValue per distinct TransactionDate value
daily_revenue = transactions.groupby("TransactionDate")["TotalValue"].sum()

fig, ax = plt.subplots(figsize=(10, 6))
daily_revenue.plot(ax=ax, title="Daily Revenue Trend", marker="o", linestyle="-", color="b")
ax.set_xlabel("Date")
ax.set_ylabel("Total Revenue")
ax.grid()
plt.show()

In [None]:
# Histogram (30 bins, with KDE overlay) of revenue per transaction
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(transactions["TotalValue"], bins=30, kde=True, color="green", ax=ax)
ax.set_title("Revenue Distribution")
ax.set_xlabel("Revenue per Transaction")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Box plot of unit price for each purchased quantity
quantity_values = transactions["Quantity"]
price_values = transactions["Price"]

fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=quantity_values, y=price_values, palette="coolwarm", ax=ax)
ax.set_title("Price vs Quantity Distribution")
ax.set_xlabel("Quantity")
ax.set_ylabel("Price")
plt.show()

In [None]:
# Predictors (quantity and unit price) and target (transaction revenue)
feature_columns = ["Quantity", "Price"]
target_column = "TotalValue"
X = transactions[feature_columns]
y = transactions[target_column]

In [None]:
# Hold out 20% of the rows for evaluation; fixed seed for a repeatable split
split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Fit an ordinary least-squares model on the training split.
# fit() stays the cell's last expression, so the fitted estimator is echoed.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# Revenue predictions for the held-out rows
predictions = model.predict(X=X_test)

In [None]:
# Score the predictions against the held-out targets:
# mean absolute error, mean squared error and R-squared, in that order.
mae, mse, r2 = (
    metric(y_test, predictions)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

In [None]:
# Report each metric rounded to two decimals
print("Model Evaluation Metrics:")
for metric_label, metric_value in (
    ("Mean Absolute Error (MAE)", mae),
    ("Mean Squared Error (MSE)", mse),
    ("R-Squared (R²)", r2),
):
    print(f"{metric_label}: {metric_value:.2f}")

In [None]:
# Side-by-side box plots comparing the distribution of actual and predicted revenue
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=[y_test, predictions], palette=["blue", "orange"], ax=ax)
ax.set_xticks([0, 1])
ax.set_xticklabels(["Actual Revenue", "Predicted Revenue"])
ax.set_title("Actual vs Predicted Revenue")
plt.show()

In [None]:
'''--- BUSINESS INSIGHT 5
5. Customer-Product Interaction
A network graph of customer-product relationships shows high-frequency transactions. 
Strong links highlight customer loyalty towards specific products, guiding personalized offers and retention strategies.'''

In [None]:
# Start from an empty undirected graph; the cells below add customer and
# product nodes and then the purchase edges between them.
G: nx.Graph = nx.Graph()

In [None]:
# One node per distinct customer and per distinct product, each tagged
# with a "type" attribute naming which kind of node it is.
G.add_nodes_from(transactions["CustomerID"].unique(), type="customer")
G.add_nodes_from(transactions["ProductID"].unique(), type="product")

In [None]:
# Add edges (Transactions)
# An undirected Graph keeps a single edge per customer-product pair, so the
# quantities of repeat purchases are summed first; adding the rows one by one
# would leave only the last transaction's quantity as the weight.
pair_quantities = transactions.groupby(["CustomerID", "ProductID"])["Quantity"].sum()
G.add_weighted_edges_from(
    (customer_id, product_id, quantity)
    for (customer_id, product_id), quantity in pair_quantities.items()
)

In [None]:
# Draw Graph
plt.figure(figsize=(10, 8))
# Compute the spring layout with a fixed seed so the figure is identical on
# every run; without it the node positions are randomised each time.
node_positions = nx.spring_layout(G, seed=42)
nx.draw(G, pos=node_positions, with_labels=False, node_size=20, alpha=0.6, edge_color="gray")
plt.title("Customer-Product Network")
plt.show()