# **1 - Import libraries**

In [1]:
# Import libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
%matplotlib inline

import os

# Walk the Kaggle input directory and print the full path of every file found,
# so the exact dataset location can be copied into the load step.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, fname) for fname in filenames):
        print(full_path)

/kaggle/input/retail-sales-analysis/Historical_Retail_Sales_Data.csv


# **2 - Load dataset**

In [2]:
# Read the CSV file into a DataFrame; fields in this file are separated by ';'.
DATA_PATH = '/kaggle/input/retail-sales-analysis/Historical_Retail_Sales_Data.csv'
df = pd.read_csv(DATA_PATH, sep=';')

In [3]:
# Preview the first rows to confirm the columns were parsed as expected
df.head()

Unnamed: 0,Transaction,Date,Item,Amount,Store Location,Country
0,T9218,01/06/24,Butter,13.03,Los Angeles,United States
1,T6324,01/06/24,Banana,14.54,Houston,United States
2,T3826,01/06/24,Cheese,7.42,Los Angeles,United States
3,T9061,01/06/24,Eggs,9.47,New York,United States
4,T2675,01/06/24,Apple,6.12,San Francisco,United States


In [4]:
# Report the DataFrame's dimensions as a (rows, columns) tuple
df.shape

(915, 6)

In [5]:
# Summarize each column's name, non-null count and dtype, plus overall memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 915 entries, 0 to 914
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Transaction     915 non-null    object 
 1   Date            915 non-null    object 
 2   Item            915 non-null    object 
 3   Amount          915 non-null    float64
 4   Store Location  915 non-null    object 
 5   Country         915 non-null    object 
dtypes: float64(1), object(5)
memory usage: 43.0+ KB


# **3 - Clean Data**

In [6]:
# Count rows that are exact duplicates (in every column) of an earlier row
df.duplicated().sum()

0

In [7]:
# Count missing values in each column
df.isnull().sum()

Transaction       0
Date              0
Item              0
Amount            0
Store Location    0
Country           0
dtype: int64

# **4 - Analysis and Data Visualization**

In [8]:
# Count the non-null 'Item' entries recorded for each store location and
# return the result as a two-column DataFrame (Store Location, Item).
items_bystorelocations = (
    df.groupby('Store Location')['Item']
    .count()
    .reset_index()
)
items_bystorelocations

Unnamed: 0,Store Location,Item
0,Chicago,172
1,Houston,181
2,Los Angeles,224
3,New York,174
4,San Francisco,164


In [9]:
# Bar chart: one bar per store location, bar height = item count.
fig = px.bar(
    data_frame=items_bystorelocations,
    x='Store Location',
    y='Item',
    title='Items by Store Locations',
    height=400,
)
fig.show()

In [10]:
# Sum the 'Amount' column for each store location and return the result
# as a two-column DataFrame (Store Location, Amount).
amount_bystorelocations = (
    df.groupby('Store Location')['Amount']
    .sum()
    .reset_index()
)
amount_bystorelocations

Unnamed: 0,Store Location,Amount
0,Chicago,1629.63
1,Houston,1802.23
2,Los Angeles,2474.25
3,New York,1839.59
4,San Francisco,1711.29


In [11]:
# Bar chart: one bar per store location, bar height = summed amount.
fig = px.bar(
    data_frame=amount_bystorelocations,
    x='Store Location',
    y='Amount',
    title='Amount by Store Locations',
    height=400,
)
fig.show()

In [12]:
# Sum the 'Amount' column for each distinct item and return the result
# as a two-column DataFrame (Item, Amount).
amount_byitems = (
    df.groupby('Item')['Amount']
    .sum()
    .reset_index()
)
amount_byitems

Unnamed: 0,Item,Amount
0,Apple,1079.49
1,Banana,1278.99
2,Bread,1220.56
3,Butter,1233.89
4,Cheese,1107.81
5,Eggs,1196.24
6,Milk,1168.16
7,Orange,1171.85


In [13]:
# Pie chart: each slice is an item, sized by that item's summed amount.
fig = px.pie(
    data_frame=amount_byitems,
    names='Item',
    values='Amount',
    title='Percentage of Amount by Items',
)
fig.show()

In [14]:
# Sum the 'Amount' column for each transaction ID; rows that share an ID
# are combined into a single total.
amount_byTransaction = (
    df.groupby('Transaction')['Amount']
    .sum()
    .reset_index()
)
amount_byTransaction

Unnamed: 0,Transaction,Amount
0,T1038,2.23
1,T1045,8.16
2,T1049,0.85
3,T1057,15.49
4,T1058,0.99
...,...,...
870,T9915,9.58
871,T9945,18.16
872,T9960,19.97
873,T9976,8.93


In [15]:
# Order the per-transaction totals from largest to smallest amount.
sorted_Transaction = amount_byTransaction.sort_values('Amount', ascending=False)

# Keep the ten transactions with the highest totals.
top_10_Transaction = sorted_Transaction.head(n=10)

top_10_Transaction

Unnamed: 0,Transaction,Amount
829,T9501,42.65
338,T4423,36.93
437,T5431,34.25
353,T4548,33.47
770,T8805,32.43
117,T2212,31.77
563,T6707,29.73
779,T8910,28.74
448,T5624,28.42
247,T3429,27.45


In [16]:
# Bar chart: the ten transactions with the highest totals, one bar each.
fig = px.bar(
    data_frame=top_10_Transaction,
    x='Transaction',
    y='Amount',
    title='Top 10 Transaction',
    height=400,
)
fig.show()