In [2]:
# Create an alias with the as keyword while importing
# Now you can refer to the Pandas package as pd instead of pandas
import pandas as pd

In [4]:
# Run this cell for preparation
# Load the CSV into a dataframe
sales = pd.read_csv('./sales_data.csv')
# Clean the sales data
# Step 1: Remove duplicated rows
sales.drop_duplicates(inplace = True) # We set inplace = True to make sure that the method does NOT return a new dataframe, but it will remove all duplicates from the original dataframe
# Step 2: Correct wrong values
sales.loc[sales['Product_ID'] == 'P001', 'Price'] = 20.99
sales['TransactionAmount'] = sales['Quantity'] * sales['Price']
# Step 3: Clean null values
weighted_price = sales['TransactionAmount'].sum() / sales['Quantity'].sum()
sales['Price'] = sales['Price'].fillna(weighted_price)
# Step 4: Recalculate Transaction Amount
sales['TransactionAmount'] = sales['Quantity'] * sales['Price']

In [3]:
# Load the CSV into a dataframe
customers = pd.read_csv('./customers_data.csv')
print(customers)

  Customer_ID     Name                Email           City
0        C101    Alice    alice@example.com       New York
1        C102      Bob      bob@example.com    Los Angeles
2        C103  Charlie  charlie@example.com        Chicago
3        C104    David    david@example.com        Houston
4        C105      Eve      eve@example.com          Miami
5        C106    Frank    frank@example.com  San Francisco
6        C107    Grace    grace@example.com        Seattle


**Practice 1: Write code to answer the following question: for each product, what is its average price weighted by transaction quantity?**

In [13]:
prod = sales.groupby('Product_ID')[['TransactionAmount','Quantity']].sum()
prod['weighted_price'] = prod['TransactionAmount'] / prod['Quantity']
print(prod)

            TransactionAmount  Quantity  weighted_price
Product_ID                                             
P001               125.940000         6       20.990000
P002                89.970000         3       29.990000
P003                29.970000         3        9.990000
P004                18.276429         1       18.276429
P005                 9.990000         1        9.990000


**Practice 2: Write code to count the number of transactions for each pair of customer and product**

In [11]:
sales.groupby('Customer_ID')['Product_ID'].count() # to get the proportion

Customer_ID
C101    2
C102    3
C103    2
C105    1
C106    1
Name: Product_ID, dtype: int64

**Practice 3: Use .nlargest(), .nsmallest(), .idxmax(), and .idxmin() to write code to get the product with the highest and lowest average price weighted by transaction quantity**

**Hint 1: Use your answer from Practice 2**

**Hint 2: Read resources here: https://tutorialsinhand.com/Articles/pandas-dataframe---nsmallest-and-nlargest.aspx and https://proclusacademy.com/blog/quicktip/pandas-idxmin-idxmax/**

**The second resource also talks about having repeated max and min values**

In [18]:
prod.nlargest(1, 'weighted_price')

Unnamed: 0_level_0,TransactionAmount,Quantity,weighted_price
Product_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
P002,89.97,3,29.99


In [19]:
prod.nsmallest(1, 'weighted_price')

Unnamed: 0_level_0,TransactionAmount,Quantity,weighted_price
Product_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
P003,29.97,3,9.99


In [20]:
prod.idxmax()['weighted_price']

'P002'

In [21]:
prod.idxmin()['weighted_price']

'P003'

**Practice 4: Get the rows in the sales table where the customer's city is Chicago**

In [7]:
sales[sales["Customer_ID"].isin(customers.loc[customers['City'] == 'Chicago', 'Customer_ID'])]

Unnamed: 0,Transaction_ID,Customer_ID,Product_ID,Quantity,Price,Date,TransactionAmount
2,3,C103,P001,1,20.99,1/7/2023,20.99
8,9,C103,P005,1,9.99,1/13/2023,9.99
