<a href="https://colab.research.google.com/github/haaris519/Python-Proti/blob/main/Over_Years.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# sales_data_analysis.py

"""
Sales data analysis script.

It includes data cleaning, exploration, KPI calculation,
data visualization, and report generation.

Tools used:
- pandas (data manipulation)
- numpy (numerical computation)
- matplotlib, seaborn (visualization)

Author: Shaik Haaris Saad
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

sns.set(style="darkgrid")


# Location of the raw sales export. Stop immediately with an explicit error
# when the file is absent instead of letting the CSV reader fail later.
data_path = '/content/sales_data_3_years.csv'
if os.path.exists(data_path):
    data = pd.read_csv(data_path)
else:
    raise FileNotFoundError(f"The file '{data_path}' was not found.")

print("\n--- DATA CLEANING ---")

# Fail fast, before mutating anything, if a column the cleaning steps below
# depend on is absent. Checking all three the same way means a missing
# 'Quantity' or 'Unit_Price' yields the same ValueError as a missing 'Date'
# rather than a bare KeyError further down.
for required_column in ('Date', 'Quantity', 'Unit_Price'):
    if required_column not in data.columns:
        raise ValueError(f"Missing '{required_column}' column in dataset.")

# Step 1: drop every row that has a missing value in any column.
initial_shape = data.shape
data.dropna(inplace=True)
print(f"Removed {initial_shape[0] - data.shape[0]} rows with missing values.")

# Step 2: parse dates. errors='coerce' turns unparseable entries into NaT,
# which the following dropna removes.
rows_before_coercion = data.shape[0]
data['Date'] = pd.to_datetime(data['Date'], errors='coerce')
data.dropna(subset=['Date'], inplace=True)

# Step 3: force quantity and price to numeric. Non-numeric entries become NaN
# and are dropped as well.
data['Quantity'] = pd.to_numeric(data['Quantity'], errors='coerce')
data['Unit_Price'] = pd.to_numeric(data['Unit_Price'], errors='coerce')
data.dropna(subset=['Quantity', 'Unit_Price'], inplace=True)

# Rows discarded by steps 2 and 3 are not counted in the message above, so
# report them separately. Nothing is printed when no such rows existed.
invalid_rows = rows_before_coercion - data.shape[0]
if invalid_rows:
    print(f"Removed {invalid_rows} rows with invalid dates or non-numeric quantity/price.")

# Revenue of each transaction line.
data['Total_Sale'] = data['Quantity'] * data['Unit_Price']


print("\n--- KEY METRICS ---")

# Compute the headline figures first, then print them in a fixed order:
# total revenue, mean revenue per row, and total quantity.
sale_amounts = data['Total_Sale']
total_sales = sale_amounts.sum()
avg_sale = sale_amounts.mean()
total_units = data['Quantity'].sum()

print(f"Total Sales: ${total_sales:,.2f}")
print(f"Average Sale per Transaction: ${avg_sale:,.2f}")
print(f"Total Units Sold: {total_units:,}")

print("\n--- TOP PRODUCTS ---")
# Total revenue per product, ranked from highest to lowest; keep the top five.
revenue_by_product = data.groupby('Product')['Total_Sale'].sum()
ranked_products = revenue_by_product.sort_values(ascending=False)
top_products = ranked_products.head(5)
print(top_products.to_string())

print("\n--- SALES OVER TIME ---")
# Tag every row with its calendar month, then total the revenue per month.
data['Month'] = data['Date'].dt.to_period('M')
sales_over_time = data.groupby('Month')['Total_Sale'].sum()

# Line chart of the monthly totals, written to disk rather than shown.
trend_fig = plt.figure(figsize=(12, 6))
trend_ax = sales_over_time.plot(marker='o')
trend_ax.set_title('Monthly Sales Trend', fontsize=16)
trend_ax.set_xlabel('Month', fontsize=12)
trend_ax.set_ylabel('Total Sales ($)', fontsize=12)
plt.xticks(rotation=45)
trend_fig.tight_layout()
trend_fig.savefig('sales_over_time.png')
plt.close()

# Bar chart of the five best-selling products, written to disk rather than shown.
products_fig = plt.figure(figsize=(10, 6))
products_ax = top_products.plot(kind='bar', color='skyblue')
products_ax.set_title('Top 5 Products by Sales', fontsize=16)
products_ax.set_xlabel('Product', fontsize=12)
products_ax.set_ylabel('Total Sales ($)', fontsize=12)
plt.xticks(rotation=30)
products_fig.tight_layout()
products_fig.savefig('top_products.png')
plt.close()


# The regional breakdown is optional: without a 'Region' column only a
# warning is printed and no chart is produced.
if 'Region' not in data.columns:
    print("Warning: 'Region' column not found. Skipping regional analysis.")
else:
    region_sales = data.groupby('Region')['Total_Sale'].sum()

    # Pie chart of each region's share of total revenue.
    region_fig = plt.figure(figsize=(8, 8))
    region_ax = region_sales.plot(kind='pie', autopct='%1.1f%%', startangle=140)
    region_ax.set_title('Sales Distribution by Region', fontsize=14)
    region_ax.set_ylabel('')  # hide the series name pandas puts on the y-axis
    region_fig.tight_layout()
    region_fig.savefig('sales_by_region.png')
    plt.close()

# Collect the formatted headline figures, keyed by their report label.
summary = {
    "Total Sales ($)": f"${total_sales:,.2f}",
    "Average Sale/Transaction ($)": f"${avg_sale:,.2f}",
    "Total Units Sold": f"{total_units:,}",
}

# Two-column table (label, formatted value) written without the row index.
summary_df = pd.DataFrame(
    {
        "Metric": list(summary.keys()),
        "Value": list(summary.values()),
    }
)
summary_df.to_csv('summary_report.csv', index=False)

print("\nSummary report saved as 'summary_report.csv'")
print("\nAll analysis complete. Charts saved as PNG files. You can now include these in your portfolio or report.")



--- DATA CLEANING ---
Removed 0 rows with missing values.

--- KEY METRICS ---
Total Sales: $5,955,289.66
Average Sale per Transaction: $4,962.74
Total Units Sold: 5,981

--- TOP PRODUCTS ---
Product
Mouse         1333061.77
Headphones    1171997.20
Monitor       1155440.96
Laptop        1148039.31
Keyboard      1146750.42

--- SALES OVER TIME ---

Summary report saved as 'summary_report.csv'

All analysis complete. Charts saved as PNG files. You can now include these in your portfolio or report.
