# Amazon Sales Data Analysis & Machine Learning
### Objectives:
1. Perform Exploratory Data Analysis (EDA) to uncover trends and insights.
2. Predict sales revenue using a regression model.
3. Segment customers using clustering techniques.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

## Load Dataset

In [None]:
# Colab-only upload helper; kept in this cell because the cell cannot run
# outside a Colab runtime anyway.
from google.colab import files
uploaded = files.upload()

# `uploaded` is keyed by file name. Fail with an explicit message when nothing
# was uploaded instead of an opaque IndexError on the lookup below.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select the sales CSV file.")

# Read the dataset (only the first uploaded file is used)
file_name = next(iter(uploaded))  # Get uploaded file name
df = pd.read_csv(file_name)

# Display basic information
print("Dataset Info:")
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() -- that would append a stray "None" line to the output.
df.info()

## Data Cleaning & Preprocessing

In [None]:
# Drop unnecessary columns (errors='ignore' tolerates exports that lack them)
df_cleaned = df.drop(columns=['Unnamed: 22', 'fulfilled-by'], errors='ignore')

# Convert Date column to datetime format; unparseable values become NaT
df_cleaned['Date'] = pd.to_datetime(df_cleaned['Date'], errors='coerce')

# Drop rows where Amount is missing.
# The explicit .copy() makes df_cleaned an independent frame: it is mutated by
# column assignment below (and gains a 'Customer_Segment' column later in the
# notebook), which would otherwise raise SettingWithCopyWarning on a filtered
# result in classic pandas.
df_cleaned = df_cleaned.dropna(subset=['Amount']).copy()

# Fill missing categorical values with 'Unknown'
categorical_cols = ['Courier Status', 'currency', 'ship-city', 'ship-state', 'ship-country']
df_cleaned[categorical_cols] = df_cleaned[categorical_cols].fillna('Unknown')

## Exploratory Data Analysis (EDA)

In [None]:
sns.set_style("whitegrid")

# Sales trend over time: sum of Amount for each distinct Date value
daily_sales = df_cleaned.groupby('Date')['Amount'].sum()

# Draw the aggregated series on an explicitly created axes object
fig, ax = plt.subplots(figsize=(12, 6))
daily_sales.plot(ax=ax, color='blue', linewidth=2)
ax.set_title("Sales Trend Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Total Sales Amount")
plt.xticks(rotation=45)  # acts on the current axes, i.e. `ax`
plt.show()

## Machine Learning: Sales Prediction (Regression)

In [None]:
# Modelling frame: Qty, Amount and a one-hot encoding of Fulfilment
# (drop_first=True omits the first dummy level).
df_ml = pd.get_dummies(
    df_cleaned[['Qty', 'Amount', 'Fulfilment']],
    columns=['Fulfilment'],
    drop_first=True,
)

# Target is Amount; every remaining column is a feature
y = df_ml['Amount']
X = df_ml.drop(columns=['Amount'])

# Hold out 20% of the rows for evaluation; fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit an ordinary least-squares model and score it on the held-out rows
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate Model
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Sales Prediction Model Performance:\nMAE: {mae:.2f}\nR² Score: {r2:.4f}")

## Machine Learning: Customer Segmentation (Clustering)

In [None]:
# Clustering input: the Qty and Amount columns, standardised with StandardScaler
df_cluster = df_cleaned[['Qty', 'Amount']]
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_cluster)

# Three-cluster k-means with a fixed seed; one label per row of df_cleaned
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
segment_labels = kmeans.fit_predict(df_scaled)
df_cleaned['Customer_Segment'] = segment_labels

# Plot Customer Segments
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=df_cleaned,
    x='Qty',
    y='Amount',
    hue='Customer_Segment',
    palette="Set1",
    ax=ax,
)
ax.set_title("Customer Segments Based on Purchase Behavior")
ax.set_xlabel("Quantity Purchased")
ax.set_ylabel("Amount Spent")
ax.legend(title="Segment")
plt.show()

## Insights & Recommendations:
- The sales trend shows seasonal spikes.
- Certain product categories are expected to contribute significantly to revenue; a per-category revenue breakdown should be added to the EDA to confirm this.
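- Orders can be segmented by quantity and amount as a proxy for customer purchase behavior, allowing for targeted marketing; aggregating per customer first would give true customer-level segments.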
- The regression model needs improvement by incorporating more features (e.g., category, discounts).