# Big Data Analysis Project

Dataset: Online Retail Transactions

This notebook covers preprocessing, exploratory data analysis (EDA), big data processing with Dask, and customer clustering based on RFM (Recency, Frequency, Monetary) features.

In [None]:

# Install required libraries (if needed)
!pip install dask[complete] scikit-learn matplotlib seaborn


In [None]:

import dask.dataframe as dd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


## Data Loading

In [None]:

# Load dataset lazily with Dask.
# Dask infers column dtypes from a sample at the start of the file only, so a
# value of a different kind in a later partition fails at compute time. The two
# identifier columns used in this notebook are therefore read as text: they are
# only grouped on / counted, never used arithmetically, and a text column can
# hold missing entries (CustomerID gaps are dropped during preprocessing).
# NOTE(review): if reading fails with a UnicodeDecodeError, the file probably
# needs an explicit `encoding=` argument -- confirm against the actual file.
df = dd.read_csv(
    '/mnt/data/data.csv',
    dtype={'CustomerID': str, 'InvoiceNo': str},
)
df.head()


## Data Preprocessing

In [None]:

# Stage 1: discard rows that are exact copies of another row
deduplicated = df.drop_duplicates()

# Stage 2: keep only rows that carry a customer identifier
with_customer = deduplicated.dropna(subset=['CustomerID'])

# Stage 3: parse the invoice timestamps into datetimes
df = with_customer.assign(
    InvoiceDate=dd.to_datetime(with_customer['InvoiceDate'])
)


## Feature Engineering (RFM)

In [None]:

# Compute RFM (Recency, Frequency, Monetary) features, one row per customer.
# The snapshot date is the latest invoice timestamp in the data; recency is the
# number of whole days between it and each customer's last invoice.
snapshot_date = df['InvoiceDate'].max().compute()

# Dask's groupby().agg() only accepts aggregation names (or dd.Aggregation
# objects), not arbitrary Python callables such as a lambda. So only the last
# purchase date is aggregated here; recency is derived from it afterwards on the
# computed pandas result.
# NOTE(review): Frequency counts invoice rows rather than distinct invoices, and
# Monetary sums UnitPrice without any quantity weighting -- confirm this is the
# intended definition.
customer_totals = df.groupby('CustomerID').agg({
    'InvoiceDate': 'max',
    'InvoiceNo': 'count',
    'UnitPrice': 'sum'
}).compute()

# Map each aggregate to its RFM name explicitly instead of relying on the
# positional order of the aggregated columns.
rfm = pd.DataFrame({
    'Recency': (snapshot_date - customer_totals['InvoiceDate']).dt.days,
    'Frequency': customer_totals['InvoiceNo'],
    'Monetary': customer_totals['UnitPrice'],
})
rfm.head()


## Clustering

In [None]:

# Cluster on the three RFM features only. Selecting the columns explicitly keeps
# this cell safe to re-run: once 'Cluster' has been added to `rfm` below, scaling
# the whole frame would feed the previous labels back in as a fourth feature.
rfm_features = rfm[['Recency', 'Frequency', 'Monetary']]

# Standardize each feature to zero mean and unit variance before clustering
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm_features)

# n_init is set explicitly because its default differs between scikit-learn
# releases; together with random_state this keeps the segmentation reproducible.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters = kmeans.fit_predict(rfm_scaled)

# Attach the segment label to each customer row
rfm['Cluster'] = clusters


## Evaluation

In [None]:

# Silhouette score of the cluster labels, evaluated on the scaled feature matrix
score = silhouette_score(X=rfm_scaled, labels=clusters)
print("Silhouette Score:", score)


## Visualization

In [None]:

# Number of customers per cluster. value_counts() orders by descending count, so
# sort by cluster label to give the bars a stable, readable order.
cluster_sizes = rfm['Cluster'].value_counts().sort_index()

# Explicit figure/axes so the title and axis labels are attached to this plot
fig, ax = plt.subplots()
cluster_sizes.plot(kind='bar', ax=ax)
ax.set_title('Customer Segments')
ax.set_xlabel('Cluster')
ax.set_ylabel('Number of customers')
plt.show()
