# Pandas analysis part 3


## Setup

In [26]:
import pandas as pd

## Data

### Import data

In [27]:
df = pd.read_csv('https://raw.githubusercontent.com/kirenz/lab-competitive/main/code/ecommerce.csv')

### Data structure

In [28]:
df.head()

Unnamed: 0,eshop_name,date,annual_revenue,time_on_site,average_rating,social_media_followers,average_response_time
0,E-ShopA,2020-01-31,13.35,1.09,4.17,173.76,2.35
1,E-ShopA,2020-02-29,10.74,0.56,4.79,52.69,2.58
2,E-ShopA,2020-03-31,11.91,0.57,2.92,141.79,1.54
3,E-ShopA,2020-04-30,16.38,2.44,3.68,190.57,1.92
4,E-ShopA,2020-05-31,6.52,2.07,2.67,129.49,1.49


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   eshop_name              108 non-null    object 
 1   date                    108 non-null    object 
 2   annual_revenue          108 non-null    float64
 3   time_on_site            108 non-null    float64
 4   average_rating          108 non-null    float64
 5   social_media_followers  108 non-null    float64
 6   average_response_time   108 non-null    float64
dtypes: float64(5), object(2)
memory usage: 6.0+ KB


### Data corrections

In [30]:
df['date'] = pd.to_datetime(df['date'])
df['eshop_name'] = df['eshop_name'].astype('category')

## Task 1

Average Revenue by E-Shop

- Show the average revenue for all shops
- Use the name `average_revenue`

In [31]:
# Average Revenue by E-Shop
df_avg_revenue = df.groupby('eshop_name')['annual_revenue'].mean().reset_index().rename(columns={'annual_revenue':'average_revenue'})

df_avg_revenue

Unnamed: 0,eshop_name,average_revenue
0,E-ShopA,33.483333
1,E-ShopB,29.222778
2,E-ShopC,30.625


## Task 2

E-Shop with the Highest Average Rating

- Only show the E-Shop with the highest average rating
- Use the name `average_rating`

In [32]:
df_best_rating = df.groupby('eshop_name')['average_rating'].mean().reset_index().sort_values('average_rating', ascending=False).head(1)

df_best_rating

Unnamed: 0,eshop_name,average_rating
0,E-ShopA,4.339167


## Task 3 

E-Shop Performance Over Time 

- Show the annual revenue per E-Shop by year
- Use the names `year` and `total_revenue`

In [33]:
df['year'] = df['date'].dt.year

df_revenue_by_year = df.groupby(['eshop_name', 'year'])['annual_revenue'].sum().reset_index().rename(columns={'annual_revenue':'total_revenue'})
df_revenue_by_year

Unnamed: 0,eshop_name,year,total_revenue
0,E-ShopA,2020,196.02
1,E-ShopA,2021,377.07
2,E-ShopA,2022,632.31
3,E-ShopB,2020,189.2
4,E-ShopB,2021,308.77
5,E-ShopB,2022,554.05
6,E-ShopC,2020,200.91
7,E-ShopC,2021,363.94
8,E-ShopC,2022,537.65


## Task 4

Maximum Social Media Followers

- Show the maximum amount of social media followers for every E-shop in a descending order.
- Use the name `max_followers`

In [34]:
df_most_followers = df.groupby('eshop_name')['social_media_followers'].max().reset_index().rename(columns={'social_media_followers':'max_followers'}).sort_values('max_followers', ascending=False).head(1)

df_most_followers

Unnamed: 0,eshop_name,max_followers
2,E-ShopC,1529.19


## Task 5

Monthly Time on Site overview

- Show a monthly overview of the average time on site for every E-shop (order by E-shop and month). We don't care about the years.
- Use the names `month` and `average_time_on_site`

In [35]:
df['month'] = df['date'].dt.month
df_user_growth = df.groupby(['eshop_name', 'month'])['time_on_site'].mean().reset_index().rename(columns={'time_on_site':'average_time_on_site'})

df_user_growth

Unnamed: 0,eshop_name,month,average_time_on_site
0,E-ShopA,1,5.266667
1,E-ShopA,2,3.813333
2,E-ShopA,3,5.186667
3,E-ShopA,4,6.67
4,E-ShopA,5,5.36
5,E-ShopA,6,7.266667
6,E-ShopA,7,6.11
7,E-ShopA,8,3.676667
8,E-ShopA,9,7.25
9,E-ShopA,10,7.033333
