# 1. IMPORT AND EXPLORE DATASET

In [None]:
import pandas as pd

In [None]:
# Load the dataset into a Pandas DataFrame
# Link to Dataset: https://www.kaggle.com/carrie1/ecommerce-data
# The data contains transaction details between 01/12/2010 and 09/12/2011 for a UK-based, non-store online retailer.
# The company specializes in selling unique gifts.
# The CSV file is read with the 'unicode_escape' encoding.
sales_df = pd.read_csv('ecommerce_sales.csv', encoding = 'unicode_escape')
sales_df

In [None]:
# Let's view the data type of every column
# Note that InvoiceDate is stored in object format; we will need to convert it to datetime format
sales_df.info()

In [None]:
# Convert InvoiceDate from string to datetime format.
# The timestamps in this file are written month-first without zero padding
# (e.g. "12/1/2010 8:26"), so the format is stated explicitly instead of
# leaving pandas to guess the day/month order for every value.
sales_df['InvoiceDate'] = pd.to_datetime(sales_df['InvoiceDate'], format = '%m/%d/%Y %H:%M')


In [None]:
# Check the data types again to confirm that InvoiceDate was converted
sales_df.info()

In [None]:
# Count the missing (null) entries in every column
sales_df.isna().sum()

**MINI CHALLENGE #1:**
- **How many unique countries are present in the dataset? List all countries**

# 2. GROUPBY

In [None]:
# Display the DataFrame that the groupby examples operate on
sales_df

In [None]:
# A groupby operation involves some combination of splitting the object, applying a function, and combining the results.
# This can be used to group large amounts of data and compute operations on these groups.
# Link: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.groupby.html
# Here: the average UnitPrice for each Country
sales_df.groupby('Country')['UnitPrice'].mean()

In [None]:
# Lowest UnitPrice for each Country
sales_df.groupby('Country')['UnitPrice'].min()

In [None]:
# Highest UnitPrice for each Country
sales_df.groupby('Country')['UnitPrice'].max()

In [None]:
# Average UnitPrice for each distinct InvoiceDate value
sales_df.groupby('InvoiceDate')['UnitPrice'].mean()

In [None]:
# Group by two keys: average UnitPrice for each (Country, InvoiceDate) combination
sales_df.groupby(['Country', 'InvoiceDate'])['UnitPrice'].mean()

**MINI CHALLENGE #2:**
- **What are the maximum and minimum prices at 2011-12-09 12:25:00?**

# 3. CREATE MULTI-INDEX DATAFRAME

In [None]:
# Reload the dataset from the CSV file (this replaces the DataFrame modified in the earlier cells)
sales_df = pd.read_csv('ecommerce_sales.csv', encoding = 'unicode_escape')
sales_df

In [None]:
# Any column can serve as the index of the DataFrame.
# Here a single column, InvoiceDate, becomes the index:
sales_df = sales_df.set_index("InvoiceDate")
sales_df

In [None]:
# Let's list the unique countries present in the dataframe
sales_df['Country'].unique()

In [None]:
# Alternatively, several columns can be combined into a Pandas MultiIndex.
# Put the column with the fewest unique values on the outermost level.

# Reload the dataset and build the (Country, InvoiceDate) index in one chain
sales_df = pd.read_csv('ecommerce_sales.csv', encoding = 'unicode_escape').set_index(keys = ["Country", "InvoiceDate"])
sales_df

In [None]:
# Order the rows by index so the countries appear alphabetically
sales_df = sales_df.sort_index()
sales_df

In [None]:
# With a MultiIndex, you need one value per index level to access an element
sales_df.index

In [None]:
# Names of the index levels, outermost first
sales_df.index.names

In [None]:
# Check the type of the index object (a MultiIndex is expected after set_index with two keys)
type(sales_df.index)

In [None]:
# Each MultiIndex entry is a tuple with one value per level:
# the country first, then the invoice date
sales_df.index[0]

**MINI CHALLENGE #3:**
- **Sort the DataFrame in a descending order (countries and dates)**

# 4. MULTI-INDEXING OPERATIONS - PART #1

In [None]:
# Reload the dataset and index it by (Country, InvoiceDate)
sales_df = pd.read_csv('ecommerce_sales.csv', encoding = 'unicode_escape').set_index(keys = ["Country", "InvoiceDate"])
sales_df

In [None]:
# Values of the outermost index level, selected by position
sales_df.index.get_level_values(0)

In [None]:
# Values of the second index level, selected by position
sales_df.index.get_level_values(1)

In [None]:
# Index levels can also be selected by name
sales_df.index.get_level_values("Country")


In [None]:
# Values of the InvoiceDate level, selected by name
sales_df.index.get_level_values("InvoiceDate")


In [None]:
# You can rename the index levels by invoking the set_names method on the index.
# Names are given in level order: level 0 is Country (the location) and
# level 1 is InvoiceDate (the date), so the location name must come first.
sales_df.index.set_names(names = ['Transaction Location', 'Transaction Date'], inplace = True)
sales_df

**MINI CHALLENGE #4:**
- **Use InvoiceDate and Country in order as the multi-index**

# 5. MULTI-INDEXING OPERATIONS - PART #2

In [None]:
# Reload the dataset, index it by (Country, InvoiceDate),
# then sort the index so the countries appear alphabetically
sales_df = (
    pd.read_csv('ecommerce_sales.csv', encoding = 'unicode_escape')
    .set_index(keys = ["Country", "InvoiceDate"])
    .sort_index()
)
sales_df

In [None]:
# You can pass one value per index level to access specific rows.
# NOTE(review): without a tuple, the second argument could also be read as a column label,
# so this shorthand is ambiguous in general; the tuple form is the unambiguous one.
sales_df.loc[ "Australia", "1/11/2011 9:47"]

In [None]:
# Feed the index values as a tuple (important to avoid confusion):
# the first argument (the tuple) references rows and the second argument references a column
sales_df.loc[("Australia", "1/11/2011 9:47"), "UnitPrice"]

In [None]:
# Display the DataFrame again for reference
sales_df

In [None]:
# Positional indexing is unaffected by the MultiIndex: a single integer selects a row, so no tuple is required
sales_df.iloc[0]


In [None]:
# Positional indexing with a row position and a column position selects a single cell
sales_df.iloc[0, 0]

In [None]:
# Transposing swaps the index and the columns:
# the DataFrame is reflected over its main diagonal, so rows become columns and vice versa.
sales_df = sales_df.T
sales_df.head(10)


In [None]:
# After the transpose, the row label is a former column name and the column label is a (Country, InvoiceDate) tuple
sales_df.loc['UnitPrice', ('Australia', '1/10/2011 9:58')]

In [None]:
# Select the UnitPrice row over a slice of columns, from ('Australia', '1/10/2011 9:58') to ('Belgium', '1/10/2011 9:58')
sales_df.loc['UnitPrice', ('Australia', '1/10/2011 9:58'):('Belgium', '1/10/2011 9:58')]

In [None]:
# The same column slice, with the row label wrapped in a one-element tuple
sales_df.loc[('UnitPrice',), ('Australia', '1/10/2011 9:58'):('Belgium', '1/10/2011 9:58')]

In [None]:
# Reload the dataset and index it by (Country, InvoiceDate)
sales_df = pd.read_csv('ecommerce_sales.csv', encoding = 'unicode_escape').set_index(keys = ["Country", "InvoiceDate"])

In [None]:
# Display the DataFrame before swapping the index levels
sales_df

In [None]:
# swaplevel exchanges the two index levels, so InvoiceDate becomes the outer level:
sales_df = sales_df.swaplevel()
sales_df

In [None]:
# Performing swaplevel again restores the original level order (Country, InvoiceDate):
sales_df = sales_df.swaplevel()
sales_df

In [None]:
# Sort the first index level ascending and the second one descending.
# When both levels share the same direction a single boolean is enough,
# so there is no point in passing [False, False] or [True, True].
sales_df = sales_df.sort_index(ascending = [True, False])

In [None]:
# Display the sorted DataFrame
sales_df

In [None]:
# Sort by the outermost index level only and ignore the others.
# sort_remaining defaults to True, which would also sort by the remaining
# levels after the given one, so it is switched off explicitly here.
sales_df.sort_index(level = 0, sort_remaining = False)

In [None]:
# stack() moves the columns into the rows, which converts the DataFrame into a Series (it's not two-dimensional anymore)
sales_df = sales_df.stack()
sales_df

In [None]:
# Confirm that the result of stack() is a Series
type(sales_df)

In [None]:
# Convert the Series back into a DataFrame (the result is displayed, not assigned)
sales_df.to_frame()

**MINI CHALLENGE #5:**
- **Calculate the average unit price for transactions that occurred in "United Kingdom" at "12/1/2010 8:26"**

# MINI CHALLENGE SOLUTIONS

**MINI CHALLENGE #1 SOLUTION:**
- **How many unique countries are present in the dataset? List all countries**

In [None]:
# List every distinct value of the Country column
sales_df['Country'].unique()

In [None]:
# Obtain the number of unique values in each column (the Country entry gives the number of countries)
sales_df.nunique()

**MINI CHALLENGE #2 SOLUTION:**
- **What are the maximum and minimum prices at 2011-12-09 12:25:00?**

In [None]:
# Minimum UnitPrice for every InvoiceDate; read off the entry for 2011-12-09 12:25:00
# Min = 0.72 and Max = 1.85
sales_df.groupby('InvoiceDate')['UnitPrice'].min()

In [None]:
# Maximum UnitPrice for every InvoiceDate; read off the entry for 2011-12-09 12:25:00
sales_df.groupby('InvoiceDate')['UnitPrice'].max()

**MINI CHALLENGE #3 SOLUTION:**
- **Sort the DataFrame in a descending order (countries and dates)**

In [None]:
# A single boolean applies to every index level,
# so there is no point in passing [False, False] or [True, True].
sales_df = sales_df.sort_index(ascending = False)
sales_df

**MINI CHALLENGE #4 SOLUTION:**
- **Use InvoiceDate and Country in order as the multi-index**

In [None]:
# Reload the dataset and use InvoiceDate (outer level) and Country (inner level) as the multi-index
sales_df = pd.read_csv('ecommerce_sales.csv', encoding = 'unicode_escape').set_index(keys = ["InvoiceDate", "Country"])
sales_df

**MINI CHALLENGE #5 SOLUTION:**
- **Calculate the average unit price for transactions that occurred in "United Kingdom" at "12/1/2010 8:26"**

In [None]:
# Reload the dataset and index it by (Country, InvoiceDate)
sales_df = pd.read_csv('ecommerce_sales.csv', encoding = 'unicode_escape').set_index(keys = ["Country", "InvoiceDate"])

# Pass the index values as a tuple (important to avoid confusion):
# the tuple selects the rows and the second argument selects the column
uk_unit_prices = sales_df.loc[("United Kingdom", "12/1/2010 8:26"), "UnitPrice"]
print(uk_unit_prices)

# Average unit price of the selected transactions
uk_unit_prices.mean()
