<a href="https://colab.research.google.com/github/kaisarfardin6620/Customer-Shopping-Behavior-Analysis-with-Machine-Learning-Dashboard./blob/main/Customer_behavior_and_shopping_trend.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install dataprep
!pip install sketch

In [None]:
# Silence warnings first so import-time warnings of the libraries below are hidden too.
import warnings
warnings.filterwarnings('ignore')

# Core numerics and plotting
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

# Automated EDA helpers
import sketch
from dataprep.eda import plot, plot_correlation

# scikit-learn
from sklearn.feature_selection import chi2
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,mean_squared_error, r2_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR

# Notebook display configuration
%matplotlib inline
pd.set_option('display.max_columns', 50)

In [None]:
# Mount Google Drive at /content/drive so the dataset path below resolves.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the shopping-behaviour CSV from the mounted Drive into `df`,
# the frame every later cell works on.
df = pd.read_csv('/content/drive/MyDrive/Dataset/shopping_behavior_updated.csv')

In [None]:
# Column labels of the loaded frame.
df.columns

In [None]:
# (row count, column count)
df.shape

In [None]:
# Missing-value count per column.
df.isnull().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Five randomly drawn rows (no seed is passed, so the rows differ between runs).
df.sample(5)

In [None]:
# Dtypes, non-null counts and memory usage.
df.info()

In [None]:
# First five rows.
df.head(5)

In [None]:
# Last five rows.
df.tail(5)

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Number of distinct values per column.
df.nunique()

In [None]:
# How often each item appears in 'Item Purchased'.
df['Item Purchased'].value_counts()

In [None]:
# Walk over every column and dump its label followed by its distinct values.
for column_name in df.columns:
    distinct_values = df[column_name].unique()
    print(column_name)
    print(distinct_values)

In [None]:
df.dtypes

In [None]:
numerical_column = df.select_dtypes(include=np.number).columns
categorical_column = df.select_dtypes(exclude=np.number).columns
print(numerical_column)
print(categorical_column)

In [None]:
# Pairwise scatter plots / histograms of the numeric columns.
# pairplot is a figure-level seaborn function that creates its own figure, so a
# preceding plt.figure() only produced a stray empty canvas and has been dropped.
sns.pairplot(df)
plt.show()

In [None]:
# dataprep.eda overview of the whole frame (called with no column arguments).
plot(df)

In [None]:
# Correlation heatmap of the numeric columns.
# The frame also holds string columns; DataFrame.corr() raises a TypeError on
# those in pandas >= 2.0, so the numeric subset is selected explicitly (this
# works on every pandas version).
numeric_df = df.select_dtypes(include=np.number)
plt.figure(figsize=(12, 10))
sns.heatmap(numeric_df.corr(), annot=True)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Each cell below asks dataprep.eda.plot for a two-column report of `df`.
# Age vs. purchase amount.
plot(df, 'Age', 'Purchase Amount (USD)')

In [None]:
# Age vs. review rating.
plot(df, 'Age', 'Review Rating')

In [None]:
# Age vs. number of previous purchases.
plot(df, 'Age', 'Previous Purchases')

In [None]:
# Product category vs. item purchased.
plot(df, 'Category', 'Item Purchased')

In [None]:
# Age vs. item purchased.
plot(df, 'Age', 'Item Purchased')

In [None]:
# Gender vs. item purchased.
plot(df, 'Gender', 'Item Purchased')

In [None]:
# Item purchased vs. size.
plot(df, 'Item Purchased', 'Size')

In [None]:
# Product category vs. colour.
plot(df, 'Category', 'Color')

In [None]:
# Gender vs. colour.
plot(df, 'Gender', 'Color')

In [None]:
# Gender vs. size.
plot(df, 'Gender', 'Size')

In [None]:
# Location vs. gender.
plot(df, 'Location', 'Gender')

In [None]:
# Gender vs. season.
plot(df, 'Gender', 'Season')

In [None]:
# Gender vs. payment method.
plot(df, 'Gender', 'Payment Method')

In [None]:
# Gender vs. whether a discount was applied.
plot(df, 'Gender', 'Discount Applied')

In [None]:
# Product category vs. review rating.
plot(df, 'Category', 'Review Rating')

In [None]:
# Item purchased vs. purchase frequency.
plot(df, 'Item Purchased', 'Frequency of Purchases')

In [None]:
# Product category vs. purchase frequency.
plot(df, 'Category', 'Frequency of Purchases')

In [None]:
# Interactive scatter: previous purchases (x) against purchase amount (y),
# coloured by review rating, with the discount flag shown on hover.
px.scatter(
    df,
    x='Previous Purchases',
    y='Purchase Amount (USD)',
    color='Review Rating',
    hover_data=['Discount Applied'],
).show()

In [None]:
# Reshape four numeric columns to long form (one row per customer/variable pair)
# so a single box plot can compare them side by side for each gender.
data = df.melt(
    id_vars='Gender',
    value_vars=['Age', 'Purchase Amount (USD)', 'Review Rating', 'Previous Purchases'],
)
px.box(
    data,
    x='Gender',
    y='value',
    color='variable',
    title='Gender Analysis',
).show()

In [None]:
# Integer-encode selected categorical columns into a copy of the frame,
# keeping one fitted encoder per column so the codes can be mapped back later.
categorical_features = ['Gender', 'Category', 'Season', 'Subscription Status']
label_encoders = {col: LabelEncoder() for col in categorical_features}
df_encoded = df.copy()

for col, le in label_encoders.items():
    df_encoded[col] = le.fit_transform(df[col])

In [None]:
X_categorical = df_encoded[['Gender', 'Category', 'Season']]
y_classification = df_encoded['Subscription Status']
chi_scores = chi2(X_categorical, y_classification)

In [None]:
feature_scores = pd.DataFrame({"Feature": X_categorical.columns, "Chi2 Score": chi_scores[0]})
print(feature_scores)

In [None]:
# Columns treated as numeric by the histogram, boxplot and PCA cells.
# NOTE(review): 'Frequency of Purchases' is only label-encoded in a later cell;
# if it is still non-numeric here it will not get a histogram — confirm.
numerical_features = ['Age', 'Purchase Amount (USD)', 'Frequency of Purchases', 'Previous Purchases', 'Review Rating']
# One histogram panel per column, 15 bins each.
df[numerical_features].hist(figsize=(12, 8), bins=15, color='skyblue', edgecolor='black')
plt.suptitle("Distribution of Numerical Features")
plt.show()

In [None]:
# Side-by-side box plots of the columns listed in numerical_features,
# with tilted x tick labels so the long column names stay readable.
plt.figure(figsize=(10, 6))
box_ax = sns.boxplot(data=df[numerical_features])
box_ax.set_title("Boxplot of Numerical Features")
plt.xticks(rotation=45)
plt.show()

In [None]:
# One bar chart per categorical column showing how many rows fall in each level.
categorical_features = ['Gender', 'Category', 'Season', 'Subscription Status']
for feature in categorical_features:
    plt.figure(figsize=(8, 4))
    count_ax = sns.countplot(data=df, x=feature, palette="viridis")
    count_ax.set_title(f"Count Plot of {feature}")
    plt.show()

In [None]:
# Fresh encoder, used in the next cell for the purchase-frequency column.
le = LabelEncoder()

In [None]:
# Overwrite the frequency column in `df` with integer codes so it can be fed to
# the scaler / PCA / KMeans / model cells that follow.
# NOTE(review): LabelEncoder numbers the classes in sorted label order, which
# presumably does not match how often purchases happen — confirm whether an
# explicit ordinal mapping is wanted before treating the codes as a magnitude.
# NOTE(review): this mutates `df` in place; cells above that display or plot this
# column behave differently if re-run after this cell.
df['Frequency of Purchases'] = le.fit_transform(df['Frequency of Purchases'])

In [None]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[numerical_features])

In [None]:
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

In [None]:
df['PCA1'], df['PCA2'] = X_pca[:, 0], X_pca[:, 1]

In [None]:
plt.figure(figsize=(8, 6))
sns.scatterplot(x='PCA1', y='PCA2', data=df, hue='Subscription Status', palette="Set2")
plt.title("PCA Scatter Plot")
plt.show()

In [None]:
print(f"Explained Variance Ratio: {pca.explained_variance_ratio_}")

In [None]:
df['Age_Purchase'] = df['Age'] * df['Purchase Amount (USD)']
df['Frequency_Purchase'] = df['Frequency of Purchases'] / (df['Previous Purchases'] + 1)

In [None]:
print(df[['Age', 'Purchase Amount (USD)', 'Age_Purchase', 'Frequency_Purchase']].head())

In [None]:
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(df[['Age', 'Previous Purchases', 'Frequency of Purchases']])

In [None]:
print("Original Features:", df[['Age', 'Previous Purchases', 'Frequency of Purchases']].shape)
print("Polynomial Features:", X_poly.shape)

In [None]:
numerical_features = ['Age', 'Purchase Amount (USD)', 'Frequency of Purchases', 'Previous Purchases']

In [None]:
scaler = StandardScaler()
X_normalized = scaler.fit_transform(df[numerical_features])

In [None]:
inertia = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_normalized)
    inertia.append(kmeans.inertia_)

In [None]:
plt.figure(figsize=(8, 5))
plt.plot(range(1, 11), inertia, marker='o')
plt.title("Elbow Method")
plt.xlabel("Number of Clusters")
plt.ylabel("Inertia")
plt.show()

In [None]:
kmeans = KMeans(n_clusters=3, random_state=42)
df['Cluster'] = kmeans.fit_predict(X_normalized)

In [None]:
# Cluster assignments plotted over the first two standardised clustering
# features (columns 0 and 1 of X_normalized).
plt.figure(figsize=(8, 6))
cluster_points = plt.scatter(
    x=X_normalized[:, 0],
    y=X_normalized[:, 1],
    c=df['Cluster'],
    s=50,
    cmap='viridis',
)
plt.title("Customer Clusters")
plt.xlabel("Normalized Feature 1")
plt.ylabel("Normalized Feature 2")
plt.colorbar(cluster_points, label="Cluster")
plt.show()

In [None]:
# Five random rows, now including the PCA, engineered and 'Cluster' columns added above.
df.sample(5)

In [None]:
X_classification = df[['Age', 'Purchase Amount (USD)', 'Frequency of Purchases', 'Previous Purchases', 'Review Rating']]
y_classification = df['Subscription Status']
y_classification = y_classification.map({'Yes': 1, 'No': 0})

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_classification, y_classification, test_size=0.2, random_state=42)

In [None]:
# NOTE(review): y_train is the 0/1-mapped 'Subscription Status' target, so this
# fits an ordinary least-squares regressor to class labels. `lr` is not used by
# any later cell visible here — confirm whether this cell is still needed.
lr = LinearRegression()
lr.fit(X_train, y_train)

In [None]:
# Logistic-regression baseline for subscription status.
# The default max_iter=100 is easily exhausted on unscaled features, and the
# resulting ConvergenceWarning would be hidden by the notebook-wide
# warnings.filterwarnings('ignore'), so the iteration budget is raised explicitly.
lg = LogisticRegression(max_iter=1000)
lg.fit(X_train, y_train)

In [None]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [None]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

In [None]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

In [None]:
svm = SVC()
svm.fit(X_train, y_train)

In [None]:
y_pred = lg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
print("Logistic Regression Accuracy:", accuracy)
print("Logistic Regression Classification Report:\n", classification_rep)
print("Logistic Regression Confusion Matrix:\n", confusion_mat)

In [None]:
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
print("Randomforest Accuracy:", accuracy)
print("Randomforest Classification Report:\n", classification_rep)
print("Randomforest Confusion Matrix:\n", confusion_mat)

In [None]:
y_pred = dt.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
print("DecisionTree Accuracy:", accuracy)
print("DecisionTree Classification Report:\n", classification_rep)
print("DecisionTree Confusion Matrix:\n", confusion_mat)

In [None]:
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
print("KNN Accuracy:", accuracy)
print("KNN Classification Report:\n", classification_rep)
print("KNN Confusion Matrix:\n", confusion_mat)

In [None]:
y_pred = svm.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
print("SVM Accuracy:", accuracy)
print("SVM Classification Report:\n", classification_rep)
print("SVM Confusion Matrix:\n", confusion_mat)

In [None]:
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

In [None]:
y_pred = linear_model.predict(X_test)
print("Mean Squared Error (MSE):", mean_squared_error(y_test, y_pred))
print("R-Squared (R2):", r2_score(y_test, y_pred))


In [None]:
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

In [None]:
y_pred = rf_model.predict(X_test)
print("Mean Squared Error (MSE):", mean_squared_error(y_test, y_pred))
print("R-Squared (R2):", r2_score(y_test, y_pred))


In [None]:
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)

In [None]:
y_pred = dt_model.predict(X_test)
print("Mean Squared Error (MSE):", mean_squared_error(y_test, y_pred))
print("R-Squared (R2):", r2_score(y_test, y_pred))


In [None]:
knn_model = KNeighborsRegressor(n_neighbors=5)
knn_model.fit(X_train, y_train)

In [None]:
y_pred = knn_model.predict(X_test)
print("Mean Squared Error (MSE):", mean_squared_error(y_test, y_pred))
print("R-Squared (R2):", r2_score(y_test, y_pred))


In [None]:
svm_model = SVR(kernel='linear')
svm_model.fit(X_train, y_train)

In [None]:
y_pred = svm_model.predict(X_test)
print("Mean Squared Error (MSE):", mean_squared_error(y_test, y_pred))
print("R-Squared (R2):", r2_score(y_test, y_pred))