In [None]:
# Import Libraries and Dependencies
import pandas as pd

### Read in files

In [None]:
# Load each month's sales CSV, using the "order_ID" column as the DataFrame index.
jan, feb, mar = (
    pd.read_csv(csv_path, index_col='order_ID')
    for csv_path in (
        'Resources/Jan2019_sales.csv',
        'Resources/Feb2019_sales.csv',
        'Resources/March2019_sales.csv',
    )
)

In [None]:
# Preview the first five rows of the January sales DataFrame.
jan.head(5)

In [None]:
# Preview the first five rows of the February sales DataFrame.
feb.head(5)

In [None]:
# Preview the first five rows of the March sales DataFrame.
mar.head(5)

### Check the data types of each DataFrame

In [None]:
# Print the January DataFrame summary (columns, non-null counts, dtypes) to check its data types.
jan.info()

In [None]:
# Print the February DataFrame summary (columns, non-null counts, dtypes) to check its data types.
feb.info()

In [None]:
# Print the March DataFrame summary (columns, non-null counts, dtypes) to check its data types.
mar.info()

### Combine the sales data by rows.

In [None]:
# Stack the January, February, and March sales DataFrames row-wise (inner join on columns),
# then move the order_ID index back into a regular column.
monthly_frames = [jan, feb, mar]
sales = pd.concat(monthly_frames, axis=0, join='inner').reset_index()

# Sanity check: the combined frame should hold exactly as many rows as the three inputs together.
total_count = sum(frame.shape[0] for frame in monthly_frames)
total_joined_count = len(sales)

counts_match = total_count == total_joined_count
if not counts_match:
    # Row counts differ: report the mismatch.
    print(f'Error on joining. Total count: {total_count} != Joined count: {total_joined_count}')
else:
    # Row counts agree: report success.
    print(f"Joined properly. Total count: {total_count} == Joined count: {total_joined_count}")

# Preview the first two rows of the combined data.
sales.head(2)

In [None]:
# Count the missing (NaN/None) values in each column of the combined sales data.
sales.isna().sum()


There are no missing values, so no rows need to be removed or imputed.

In [None]:
# Print the combined DataFrame summary to check the data type of each column.
sales.info()

In [None]:
# Parse the "order_date" column into datetime values.
# The format string expects month/day/two-digit-year followed by hour:minute.
order_date_format = '%m/%d/%y %H:%M'
sales['order_date'] = pd.to_datetime(sales['order_date'], format=order_date_format)




In [None]:
# Show the first rows of "order_date"; the Series output includes its dtype, confirming the conversion.
sales['order_date'].head()

### 1. Which top five zip codes in which cities and states had the greatest number of products ordered?

### Using `groupby`

In [None]:
# Show the average number of items ordered for each state, city, and zip code.
def agg_values(df, cat_col, num_col, stat):
    """
    Group a DataFrame, aggregate a numerical column, and sort the result in descending order.

    Args:
        df (DataFrame): The input DataFrame.
        cat_col (str or list): Categorical column(s) to group by.
        num_col (str): Numerical column to aggregate; the result is sorted on this column.
        stat (str, callable, or dict): Aggregation to apply. A string or callable
                            (e.g., 'mean', 'sum') is applied to ``num_col`` only. A dictionary is
                            used as-is as a column -> aggregation mapping; it must contain
                            ``num_col`` and map each column to a single aggregation.

    Returns:
        DataFrame: Aggregated DataFrame indexed by the grouping column(s), sorted by ``num_col``
                   from largest to smallest. The index is not reset.

    Example:
        df = pd.DataFrame({'Category': ['A', 'B', 'A', 'B'],
                           'Value': [10, 20, 30, 40]})
        result = agg_values(df, 'Category', 'Value', 'sum')
        same = agg_values(df, 'Category', 'Value', {'Value': 'sum'})
    """
    # A dict is already a column -> aggregation spec. Wrapping it in another dict
    # would create a nested mapping, which pandas rejects.
    agg_spec = stat if isinstance(stat, dict) else {num_col: stat}

    # Aggregate per group, then order the groups by the aggregated value, largest first.
    aggs = df.groupby(cat_col).agg(agg_spec)
    return aggs.sort_values(num_col, ascending=False)


# Mean quantity ordered per (state, city, zip code) group, largest first.
# NOTE(review): the section question asks for the greatest *number* of products ordered, and the
# pivot_table cell below aggregates with 'sum' — confirm that 'mean' is the intended statistic.
avg_items_ordered = (
    agg_values(sales, ['state', 'city', 'zip_code'], 'quantity_ordered', 'mean')
    # Rename the mean column to reflect the data in the column.
    .rename(columns={'quantity_ordered': 'avg_items_ordered'})
)

# Show the top 5 results, rounded to two decimal places.
avg_items_ordered.round(2).head()



### Using `pivot_table`

In [None]:
# Show the total number of items ordered for each state, city, and zip code.
def agg_pivot(df, cat_col, num_col, stat):
    """
    Summarise a numerical column with a pivot table and sort the result in descending order.

    Args:
        df (DataFrame): The input DataFrame.
        cat_col (str or list): Categorical column(s) used as the pivot table index.
        num_col (str): Numerical column to aggregate; the result is sorted on this column.
        stat (str or dict): Aggregation passed to the pivot table as ``aggfunc``
                            (e.g., 'mean', 'sum', or a column -> aggregation dictionary).

    Returns:
        DataFrame: Pivot table indexed by the categorical column(s), sorted by ``num_col``
                   from largest to smallest. The index is not reset.

    Example:
        df = pd.DataFrame({'Category': ['A', 'B', 'A', 'B'],
                           'Value': [10, 20, 30, 40]})
        result = agg_pivot(df, 'Category', 'Value', 'sum')
    """
    # Aggregate num_col for every combination of the index column(s).
    summary = df.pivot_table(values=num_col, index=cat_col, aggfunc=stat)

    # Order the rows by the aggregated value, largest first.
    return summary.sort_values(by=num_col, ascending=False)

#---------------------------------------------------------------------------------------------------

# Total quantity ordered per (state, city, zip code) group, with the summed
# "quantity_ordered" column renamed to "total_items_ordered".
items_ordered_pivot = agg_pivot(
    sales, ['state', 'city', 'zip_code'], 'quantity_ordered', 'sum'
).rename(columns={'quantity_ordered': 'total_items_ordered'})

# Show the top 5 results.
items_ordered_pivot.head()


### 2. Which top five zip codes in which cities and states generated the most sales?

### Using `groupby`

In [None]:
# Line total for every order row: unit price multiplied by quantity ordered.
sales['total_price'] = sales['price'].mul(sales['quantity_ordered'])

# Sum the line totals per (state, city, zip code) group, largest first.
total_sales_price = agg_values(sales, ['state', 'city', 'zip_code'], 'total_price', 'sum')

# Render the totals as currency strings, then rename the column to describe its contents.
total_sales_price = (
    total_sales_price
    .assign(total_price=lambda frame: frame['total_price'].map('${:,.2f}'.format))
    .rename(columns={'total_price': 'total_sales_price'})
)

# Show the top 5 results.
total_sales_price.head()


### Using `pivot_table`

In [None]:
# Show the total price for all of the items ordered for each state, city, and zip code.
total_sales_pivot = agg_pivot(sales, ['state', 'city', 'zip_code'], 'total_price', 'sum')

# Format the totals as currency strings (agg_pivot has already sorted the numeric values).
total_sales_pivot['total_price'] = total_sales_pivot['total_price'].map('${:,.2f}'.format)

# Rename the summed column to "total_sales_price" so it matches the groupby version of this table.
total_sales_pivot = total_sales_pivot.rename(columns={'total_price': 'total_sales_price'})

# Show the top 5 results.
total_sales_pivot.head()


### 3. Which top five zip codes in which cities and states had the greatest average price for the products ordered?

### Using `groupby`

In [None]:
# Mean unit price per (state, city, zip code) group, largest first,
# with the mean column renamed to describe its contents.
avg_price = agg_values(sales, ['state', 'city', 'zip_code'], 'price', 'mean').rename(
    columns={'price': 'avg_price_per_product'}
)

# Round to two decimal places, then render as currency strings for display.
avg_price['avg_price_per_product'] = (
    avg_price['avg_price_per_product'].round(2).map('${:,.2f}'.format)
)

# Show the top 5 results.
avg_price.head()

### Using `pivot_table`

In [None]:
# Mean unit price per (state, city, zip code) group via pivot_table, largest first.
avg_price_pivot = agg_pivot(sales, ['state', 'city', 'zip_code'], 'price', 'mean')

# Render the means as two-decimal currency strings, then rename "price" to "avg_price_per_product".
avg_price_pivot = (
    avg_price_pivot
    .assign(price=lambda frame: frame['price'].map('${:,.2f}'.format))
    .rename(columns={'price': 'avg_price_per_product'})
)

# Show the top 5 results.
avg_price_pivot.head()



### 4. Which top five zip codes in which cities and states had the most iPhone sales?

In [None]:
# Keep only the rows whose "product" value contains "iphone" (case-insensitive match).
is_iphone = sales['product'].str.contains('iphone', case=False)
iphone_sales = sales[is_iphone]

# Preview the first five iPhone rows.
iphone_sales.head(5)

### Using `groupby`

In [None]:
# Total iPhone quantity ordered per (state, city, zip code) group, largest first,
# with the summed "quantity_ordered" column renamed to "total_iphone_ordered".
total_iphone_ordered = agg_values(
    iphone_sales, ['state', 'city', 'zip_code'], 'quantity_ordered', 'sum'
).rename(columns={'quantity_ordered': 'total_iphone_ordered'})

# Show the top 5 results.
total_iphone_ordered.head()

### Using `pivot_table`

In [None]:
# Total iPhone quantity ordered per (state, city, zip code) group via pivot_table, largest first,
# with the summed "quantity_ordered" column renamed to "total_iphone_ordered".
total_iphone_ordered_pivot = agg_pivot(
    iphone_sales, ['state', 'city', 'zip_code'], 'quantity_ordered', 'sum'
).rename(columns={'quantity_ordered': 'total_iphone_ordered'})

# Show the top 5 results.
total_iphone_ordered_pivot.head()

### 5. Which day had the most iPhone sales?

In [None]:
# Build a table of iPhone prices indexed by "order_date" (a plain set_index, not pd.pivot_table),
# with the "price" column renamed to "Total_sales".
# NOTE(review): this uses the unit "price" and ignores "quantity_ordered"; if revenue is what is
# meant by "sales", "total_price" may be the right column — confirm.
sales_iphone_pivot = (
    iphone_sales[['order_date', 'price']]
    .set_index('order_date')
    .rename(columns={'price': 'Total_sales'})
)

# Show the table.
sales_iphone_pivot.head()

In [None]:
# Group the order_date-indexed iPhone table into daily bins.
daily_iphone_sales = sales_iphone_pivot.resample('D')

# Aggregate each daily bin to its total.
daily_total_sales = daily_iphone_sales.sum()

# Top 5 days by total.
daily_total_sales.sort_values(by='Total_sales', ascending=False).head()

### 6.  Which week had the most iPhone sales?

In [None]:
# Group the order_date-indexed iPhone table into weekly bins.
weekly_iphone_sales = sales_iphone_pivot.resample('W')

# Aggregate each weekly bin to its total, rounded to two decimal places.
weekly_total_iphone_sales = weekly_iphone_sales.sum().round(2)

# Top 5 weeks by total.
weekly_total_iphone_sales.sort_values('Total_sales', ascending=False).head()