# Working examples with DataFrames

## Import the dataset and libraries

Linux

In [1]:
# Run this cell if you are in a linux system
import os
import pandas as pd

# os.getcwd() gives the current working directory of the kernel process
# (not the notebook file itself); the CSV is looked up in a "datasets"
# folder directly below that directory.
os_path = os.getcwd()
# Build the path with os.path.join so the separator is not hard-coded
dataset_path = os.path.join(os_path, 'datasets', 'Online_Retail.csv')
# The file is read as ISO-8859-1 (Latin-1) rather than pandas' default UTF-8
retail_data = pd.read_csv(dataset_path, encoding='ISO-8859-1')
retail_data

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/10 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/10 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/10 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/10 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/10 8:26,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,12/9/11 12:50,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,12/9/11 12:50,2.10,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,12/9/11 12:50,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,12/9/11 12:50,4.15,12680.0,France


Windows

In [15]:
# Run this cell if you are in Windows
import os
import pandas as pd

# os.getcwd() gives the current working directory of the kernel process
# (not the notebook file itself); the CSV is looked up in a "datasets"
# folder directly below that directory.
os_path = os.getcwd()
# Build the path with os.path.join instead of a literal containing backslashes:
# in a normal string "\d" and "\O" are invalid escape sequences, which Python
# warns about and plans to reject in the future.
dataset_path = os.path.join(os_path, 'datasets', 'Online_Retail.csv')
# The file is read as ISO-8859-1 (Latin-1) rather than pandas' default UTF-8
retail_data = pd.read_csv(dataset_path, encoding='ISO-8859-1')
retail_data

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/10 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/10 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/10 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/10 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/10 8:26,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,12/9/11 12:50,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,12/9/11 12:50,2.10,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,12/9/11 12:50,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,12/9/11 12:50,4.15,12680.0,France


## Example 1: Categorize data adding new columns

### 1.1 Load or define the dataset

In [16]:
# Column-oriented toy dataset: each key is a column name and each list holds
# that column's values, with one position per student.
data = dict(
    Name=["Harry", "Draco", "Cedric", "Ginny"],
    Year=[3, 3, 5, 1],
    House=["Gryffindor", "Slytherin", "Hufflepuff", "Gryffindor"],
)
print(data)

{'Name': ['Harry', 'Draco', 'Cedric', 'Ginny'], 'Year': [3, 3, 5, 1], 'House': ['Gryffindor', 'Slytherin', 'Hufflepuff', 'Gryffindor']}


### 1.2 Convert to a pandas DataFrame data structure

In [17]:
# Turn the column dictionary into a DataFrame: keys become column names and
# the list values become the rows, indexed from 0.
df_data = pd.DataFrame(data=data)
df_data

Unnamed: 0,Name,Year,House
0,Harry,3,Gryffindor
1,Draco,3,Slytherin
2,Cedric,5,Hufflepuff
3,Ginny,1,Gryffindor


### 1.3 Add a new category or column called 'GradeLevel'

In [18]:
def category_level(year):
    """Translate a numeric school year into a grade-level label.

    Returns "Freshman" for 1, "Sophomore" for 2, "Junior" for 3 or 4,
    "Senior" for any value greater than 4, and "Graduated" for every
    other value (for example 0 or a negative year).
    """
    # Years that map to a fixed label are looked up directly.
    labels_by_year = {1: "Freshman", 2: "Sophomore", 3: "Junior", 4: "Junior"}
    label = labels_by_year.get(year)
    if label is not None:
        return label
    # Anything not in the table is decided by the "> 4" threshold alone.
    return "Senior" if year > 4 else "Graduated"


# Label every row by passing its "Year" value through category_level, then
# store the labels as a new "GradeLevel" column on the same DataFrame.
grade_labels = df_data["Year"].apply(category_level)
df_data["GradeLevel"] = grade_labels

df_data

Unnamed: 0,Name,Year,House,GradeLevel
0,Harry,3,Gryffindor,Junior
1,Draco,3,Slytherin,Junior
2,Cedric,5,Hufflepuff,Senior
3,Ginny,1,Gryffindor,Freshman


## Example 2: Find data only for countries that start with M using `loc`

In [19]:
m_countries_unit_prices = retail_data.loc[
    # Row selector: keep rows whose Country begins with "M". na=False makes a
    # missing Country count as "no match" instead of producing NaN, which
    # .loc would reject as a boolean mask.
    retail_data["Country"].str.startswith("M", na=False),
    # Column selector: only these four columns are kept in the result.
    ["Description", "Quantity", "UnitPrice", "Country"],
]
m_countries_unit_prices

Unnamed: 0,Description,Quantity,UnitPrice,Country
217684,LANTERN CREAM GAZEBO,3,4.95,Malta
217685,WHITE HANGING HEART T-LIGHT HOLDER,6,2.95,Malta
217686,RED HANGING HEART T-LIGHT HOLDER,6,2.95,Malta
217687,SET OF 6 GIRLS CELEBRATION CANDLES,12,1.25,Malta
217688,BEST DAD CANDLE LETTERS,6,0.85,Malta
...,...,...,...,...
516550,FAMILY PHOTO FRAME CORNICE,-2,9.95,Malta
516551,3 ROSE MORRIS BOXED CANDLES,-1,1.25,Malta
516552,SET/3 VANILLA SCENTED CANDLE IN BOX,-1,4.25,Malta
516553,GOLD MUG BONE CHINA TREE OF LIFE,-1,1.06,Malta


## Example 3: Find the top 3 countries with the highest and the lowest sales

### 3.1 Top countries with most sales

In [6]:
def total_sales(group):
    """Return the total sales of `group`.

    Each row's 'Quantity' is multiplied by its 'UnitPrice' and the
    per-row products are summed into a single number.
    """
    sales_per_row = group['Quantity'] * group['UnitPrice']
    return sales_per_row.sum()

# Total sales per country, sorted from highest to lowest. Only the two columns
# total_sales reads are selected before apply: applying over the whole group
# (grouping column included) is deprecated in recent pandas and emits a warning.
higher_3 = (
    retail_data.groupby('Country')[['Quantity', 'UnitPrice']]
    .apply(total_sales)
    .sort_values(ascending=False)
)
# higher_3 holds every country; head(3) shows the three largest totals.
higher_3.head(3)

  higher_3 = retail_data.groupby('Country').apply(total_sales).sort_values(ascending=False)


Country
United Kingdom    8187806.364
Netherlands        284661.540
EIRE               263276.820
dtype: float64

### 3.2 Top countries with lowest sales

In [7]:
# Total sales per country, sorted from lowest to highest (sort_values defaults
# to ascending order). Only the two columns total_sales reads are selected
# before apply: applying over the whole group (grouping column included) is
# deprecated in recent pandas and emits a warning.
lower_3 = (
    retail_data.groupby('Country')[['Quantity', 'UnitPrice']]
    .apply(total_sales)
    .sort_values()
)
# lower_3 holds every country; head(3) shows the three smallest totals.
lower_3.head(3)

  lower_3 = retail_data.groupby('Country').apply(total_sales).sort_values()


Country
Saudi Arabia      131.17
Bahrain           548.40
Czech Republic    707.72
dtype: float64