In [97]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from plotly.offline import iplot,plot #plotly.offline(generate graph as images)
from plotly.subplots import make_subplots
from warnings import filterwarnings

In [98]:
# Disable filter warning
filterwarnings('ignore')

In [99]:
# Reading dataset: load 'shopinowDS.csv' (path relative to the notebook) into `data`.
data = pd.read_csv('shopinowDS.csv')
# Preview five random rows. The fixed random_state makes the preview
# reproducible, so a Restart & Run All shows the same rows every time.
data.sample(5, random_state=42)

Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
2981,2982,21,Female,Pants,Clothing,36,Delaware,L,Pink,Winter,4.9,No,Standard,No,No,37,Venmo,Annually
2154,2155,46,Male,Scarf,Accessories,74,Wyoming,M,Violet,Spring,3.4,No,Store Pickup,No,No,33,Cash,Every 3 Months
1424,1425,66,Male,Jacket,Outerwear,23,Connecticut,S,Indigo,Fall,3.5,No,Standard,Yes,Yes,34,Bank Transfer,Every 3 Months
100,101,62,Male,Sunglasses,Accessories,98,South Dakota,M,Maroon,Fall,2.7,Yes,Express,Yes,Yes,31,Cash,Fortnightly
303,304,20,Male,Sandals,Footwear,60,Oregon,M,Turquoise,Summer,3.3,Yes,Next Day Air,Yes,Yes,49,Venmo,Weekly


In [100]:
# Get info: print the column names, non-null counts and dtypes of `data`
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3900 entries, 0 to 3899
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Customer ID             3900 non-null   int64  
 1   Age                     3900 non-null   int64  
 2   Gender                  3900 non-null   object 
 3   Item Purchased          3900 non-null   object 
 4   Category                3900 non-null   object 
 5   Purchase Amount (USD)   3900 non-null   int64  
 6   Location                3900 non-null   object 
 7   Size                    3900 non-null   object 
 8   Color                   3900 non-null   object 
 9   Season                  3900 non-null   object 
 10  Review Rating           3900 non-null   float64
 11  Subscription Status     3900 non-null   object 
 12  Shipping Type           3900 non-null   object 
 13  Discount Applied        3900 non-null   object 
 14  Promo Code Used         3900 non-null   

In [101]:
# Missing values: count the null cells in each column (summed down the rows)
data.isnull().sum(axis=0)

Customer ID               0
Age                       0
Gender                    0
Item Purchased            0
Category                  0
Purchase Amount (USD)     0
Location                  0
Size                      0
Color                     0
Season                    0
Review Rating             0
Subscription Status       0
Shipping Type             0
Discount Applied          0
Promo Code Used           0
Previous Purchases        0
Payment Method            0
Frequency of Purchases    0
dtype: int64

In [102]:
# Number of rows that exactly repeat an earlier row (the first occurrence is not counted)
data.duplicated(keep='first').sum()

0

In [103]:
# Boolean form of the duplicate check: True if any row repeats an earlier one
data.duplicated(keep='first').any()

False

In [104]:
# Clean dataset: remove the 'Customer ID' column.
# Rebinding `data` (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: running it again after the column is gone is a
# no-op rather than a KeyError.
data = data.drop(columns=['Customer ID'], errors='ignore')

In [105]:
# Print the column names, non-null counts and dtypes of `data` as it stands at this point
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3900 entries, 0 to 3899
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Age                     3900 non-null   int64  
 1   Gender                  3900 non-null   object 
 2   Item Purchased          3900 non-null   object 
 3   Category                3900 non-null   object 
 4   Purchase Amount (USD)   3900 non-null   int64  
 5   Location                3900 non-null   object 
 6   Size                    3900 non-null   object 
 7   Color                   3900 non-null   object 
 8   Season                  3900 non-null   object 
 9   Review Rating           3900 non-null   float64
 10  Subscription Status     3900 non-null   object 
 11  Shipping Type           3900 non-null   object 
 12  Discount Applied        3900 non-null   object 
 13  Promo Code Used         3900 non-null   object 
 14  Previous Purchases      3900 non-null   

In [106]:
# Age: number of rows for each age value, ordered by age (ascending index)
age_counts_unsorted = data['Age'].value_counts()
data_age = age_counts_unsorted.sort_index()

In [107]:
# data_age is sorted by age, so its first/last entries are the lowest/highest age.
lowest_age, lowest_age_repeat = data_age.index[0], data_age.iloc[0]
highest_age, highest_age_repeat = data_age.index[-1], data_age.iloc[-1]
print(f"Least Age Purchase items '{lowest_age}' with repeat '{lowest_age_repeat}'")
print(f"Most Age Purchase items '{highest_age}' with repeat '{highest_age_repeat}'")

Least Age Purchase items '18' with repeat '69'
Most Age Purchase items '70' with repeat '67'


In [108]:
# Line chart of the per-age counts (x = age from the Series index, y = count).
# The Series is passed in wide form, where Plotly Express names the y column
# 'value' (lower-case). The previous key 'Value' never matched, so the axis
# label was not being replaced.
iplot(px.line(data_age,
              labels={'value':'Repeat'},
              template='plotly_dark',
              markers=True,
              title='Ages purchased the items'
              ))

In [109]:
# Gender: number of rows per gender, most frequent first
gender_column = data['Gender']
data_gender = gender_column.value_counts()

In [110]:
# Pie chart of the gender split; slice names come from the Series index so
# they always line up with the counts. Title typo 'Femal' corrected.
iplot(px.pie(values=data_gender,
             names=data_gender.index,
             template='plotly_dark',
             title='Male vs Female Purchased items'
             ).update_traces(textinfo='label+percent'))

In [111]:
# Mean age by gender: average of the 'Age' column within each gender group
age_by_gender = data.groupby('Gender')['Age']
data_gender_age = age_by_gender.mean()

In [112]:
# Bar chart of the mean age per gender.
# A pie chart draws each value as a share of the total, which is meaningless
# for averages (the percentages described nothing real). Bars compare the two
# means directly, using the same px.bar pattern as the other bar charts here.
iplot(px.bar(data_gender_age.round(2),
             labels={'value':'Mean age'},
             color=data_gender_age.index,
             template='plotly_dark',
             text_auto=True,
             title='Mean of age of people purchased items'))

In [113]:
# Items purchased: row count per item, arranged from least to most frequent
item_counts = data['Item Purchased'].value_counts()
data_items_purchased = item_counts.sort_values(ascending=True)

In [114]:
# Bar chart of the per-item counts, one colour per item.
# In wide form the plotted column is named 'value'; the previous key 'values'
# did not match anything, so the axis label was never replaced.
iplot(px.bar(data_items_purchased,
             labels={'value':'count'},
             color=data_items_purchased.index,
             template='plotly_dark',
             text_auto=True,
             title='Count of items purchased'))

In [115]:
# Category: number of rows per category, most frequent first
category_column = data['Category']
data_category = category_column.value_counts()

In [116]:
# Pie chart of the category counts; slices are labelled with name and percentage.
category_pie = px.pie(
    names=data_category.index,
    values=data_category,
    title='Count of items sold by category',
    template='plotly_dark',
)
category_pie.update_traces(textinfo='label+percent')
iplot(category_pie)

In [126]:
# Review for category: mean review rating of each item within each category,
# rounded to two decimals (result is indexed by Category, then Item Purchased)
ratings_by_category_item = data.groupby(['Category','Item Purchased'])['Review Rating']
data_reviwew_category = ratings_by_category_item.mean().round(2)

In [127]:
# Distinct categories in order of first appearance; reused by later loops.
category = data['Category'].unique().tolist()
colors = ['#FF5733', '#3498db', '#27ae60', '#f39c12']
# One line chart per category showing the mean review rating of its items.
# enumerate replaces the hand-maintained counter, and the modulo wraps around
# the palette so having more categories than colours no longer raises IndexError.
for j, c in enumerate(category):
    iplot(px.line(data_reviwew_category.get(c),
                  template='plotly_dark',
                  color_discrete_sequence=[colors[j % len(colors)]],
                  markers=True,
                  labels={'value':'Review Rating'},
                  title=f'Review rating for {c} category'
                  ))

In [137]:
# Purchased amount by category: total 'Purchase Amount (USD)' per category, to 2 decimals
data_purchased_amount_category = (
    data.groupby('Category')['Purchase Amount (USD)']
        .sum()
        .round(2)
)

In [138]:
# Bar chart of the total purchase amount per category, one colour per category.
# A title is added so the figure stands on its own, like the other charts here.
iplot(px.bar(data_purchased_amount_category,
             labels={'value':'Purchase amount'},
             template='plotly_dark',
             text_auto=True,
             color=data_purchased_amount_category.index,
             title='Sum of Purchase Amount (USD) by Category'))

In [141]:
# Purchased amount by category and item: total 'Purchase Amount (USD)' for each
# (Category, Item Purchased) pair, rounded to 2 decimals
amount_by_category_item = data.groupby(['Category','Item Purchased'])['Purchase Amount (USD)']
data_purchased_amount_category_item = amount_by_category_item.sum().round(2)

In [190]:
# One bar chart per category: total purchase amount of each item in that category.
for c in category:
    # Select the per-item totals for this category once and reuse them below.
    amount_per_item = data_purchased_amount_category_item.get(c)
    amount_fig = px.bar(
        amount_per_item,
        color=amount_per_item.index,
        labels={'value':'Amount purchase'},
        text_auto=True,
        template='plotly_dark',
        title=f'Sum of Purchase Amount (USD) for Items Purchased for Category {c}',
    )
    iplot(amount_fig)

In [164]:
# Location: row count per location, highest first, keeping only the first 25 entries
location_counts = data['Location'].value_counts()
data_location = location_counts.sort_values(ascending=False)[:25]

In [165]:
# Horizontal bar chart of the most frequent locations, one colour per location.
# `labels` must be a dict mapping column name -> display text. The previous
# {'value:Count'} was a one-element set, which Plotly Express silently ignored,
# so the axis was never relabelled. A title is added to match the other charts.
iplot(px.bar(data_location,
             orientation='h',
             labels={'value':'Count'},
             color=data_location.index,
             template='plotly_dark',
             title='Most frequent purchase locations'))

In [179]:
# Size: for every (Category, Item Purchased) pair, count the rows per size, then
# pivot the sizes into columns (0 where a size never occurs) and flatten the index
size_counts = data.groupby(['Category','Item Purchased'])['Size'].value_counts()
data_size = size_counts.unstack(fill_value=0).reset_index()

In [180]:
# Group the per-item size table by category so each category can be fetched with get_group().
# The name is rebound from a DataFrame to a GroupBy object, so calling .groupby
# on it a second time would fail. The isinstance guard makes re-running this
# cell a harmless no-op.
if isinstance(data_size, pd.DataFrame):
    data_size = data_size.groupby('Category')

In [181]:
# One line chart per category: how often each size was bought for every item.
for c in category:
    sizes_for_category = data_size.get_group(c)
    # Take the size columns from the table itself rather than hard-coding
    # ['L','M','S','XL'], so a missing or additional size does not break the plot.
    size_columns = [col for col in sizes_for_category.columns
                    if col not in ('Category', 'Item Purchased')]
    iplot(px.line(sizes_for_category,
                  labels={'value':'count'},
                  x='Item Purchased',
                  y=size_columns,
                  template='plotly_dark',
                  markers=True,
                  title=f'Count of sizes purchased per item for Category {c}'))

In [183]:
# Colors: count rows per colour for every (Category, Item Purchased) pair,
# flatten to a table with a 'count' column, then group that table by category.
# The explicit rename pins the column name that the scatter plot colours by.
# pandas >= 2.0 already calls it 'count'; older versions name it after the
# counted column, which makes reset_index() fail on the duplicate 'Color' name.
data_color = (
    data.groupby(['Category','Item Purchased'])['Color']
        .value_counts()
        .rename('count')
        .reset_index()
        .groupby('Category')
)

In [184]:
# One scatter plot per category: item on x, colour name on y, marker shade = row count.
for c in category:
    colors_for_category = data_color.get_group(c)
    color_fig = px.scatter(
        colors_for_category,
        x='Item Purchased',
        y='Color',
        color='count',
        template='plotly_dark',
        title=f'Top Colors of Items Purchased for Category {c}',
    )
    iplot(color_fig)

In [186]:
# Season: number of rows per season, most frequent first
season_column = data['Season']
data_season = season_column.value_counts()

In [187]:
# Pie chart of the season split.
# Slice names are taken from the Series index instead of a hand-written list.
# value_counts() orders by frequency, so a fixed list only matched by
# coincidence and would mislabel slices as soon as the ranking changed.
iplot(px.pie(values=data_season,
             names=data_season.index,
             template='plotly_dark',
             title='Season',
).update_traces(textinfo='label+percent'))

In [188]:
# Subscription status: number of rows per status value, most frequent first
subscription_column = data['Subscription Status']
data_subscription_status = subscription_column.value_counts()

In [189]:
# Pie chart of the subscription-status split.
# Names come from the Series index: value_counts() orders by frequency, so the
# hard-coded ['No','Yes'] would swap the labels if 'Yes' ever became more common.
iplot(px.pie(values=data_subscription_status,
             names=data_subscription_status.index,
             template='plotly_dark',
             title='Subscription Status',
).update_traces(textinfo='label+percent'))

In [192]:
# Shipping type: number of rows per shipping option, most frequent first
shipping_column = data['Shipping Type']
data_shipping_type = shipping_column.value_counts()

In [193]:
# Bar chart of the shipping-type counts, one colour per shipping option.
shipping_fig = px.bar(
    data_shipping_type,
    color=data_shipping_type.index,
    labels={'value':'Count'},
    text_auto=True,
    template='plotly_dark',
    title="Shipping Type",
)
iplot(shipping_fig)

In [194]:
# Discount applied: number of rows per value, most frequent first
discount_column = data['Discount Applied']
data_discount_applied = discount_column.value_counts()

In [195]:
# Pie chart of the discount-applied split.
# Names come from the Series index: value_counts() orders by frequency, so the
# hard-coded ['No','Yes'] would swap the labels if 'Yes' ever became more common.
iplot(px.pie(values=data_discount_applied,
             names=data_discount_applied.index,
             template='plotly_dark',
             title='Discount Applied',
).update_traces(textinfo='label+percent'))

In [196]:
# Promo code used: number of rows per value, most frequent first
promo_column = data['Promo Code Used']
data_promo_code_used = promo_column.value_counts()

In [197]:
# Pie chart of the promo-code split.
# Names come from the Series index: value_counts() orders by frequency, so the
# hard-coded ['No','Yes'] would swap the labels if 'Yes' ever became more common.
iplot(px.pie(values=data_promo_code_used,
             names=data_promo_code_used.index,
             template='plotly_dark',
             title='Promo Code Used',
).update_traces(textinfo='label+percent'))

In [198]:
# Payment method: number of rows per payment method, most frequent first
payment_column = data['Payment Method']
data_payment_method = payment_column.value_counts()

In [199]:
# Bar chart of the payment-method counts, one colour per method.
payment_fig = px.bar(
    data_payment_method,
    color=data_payment_method.index,
    labels={'value':'Count'},
    text_auto=True,
    template='plotly_dark',
    title="Payment Method",
)
iplot(payment_fig)

In [200]:
# Frequency of purchases: number of rows per frequency label, most frequent first
frequency_column = data['Frequency of Purchases']
data_frequency_purchased = frequency_column.value_counts()

In [201]:
# Bar chart of the purchase-frequency counts, one colour per frequency label.
frequency_fig = px.bar(
    data_frequency_purchased,
    color=data_frequency_purchased.index,
    labels={'value':'Count'},
    text_auto=True,
    template='plotly_dark',
    title="Frequency of Purchases",
)
iplot(frequency_fig)