##### <b> Accessing DataFrames </b></br> Columns can be accessed using bracket notation or dot notation

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Relative path to the retail sales CSV (resolved against the notebook's working directory)
path_retail = 'Pandas Course Resources/retail/retail_2016_2017.csv'
# Load the CSV into a DataFrame; no index column is specified, so pandas assigns a default integer index
retail_df = pd.read_csv(path_retail)

# Preview the first 10 rows
retail_df.head(10)

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,1945944,2016-01-01,1,AUTOMOTIVE,0.0,0
1,1945945,2016-01-01,1,BABY CARE,0.0,0
2,1945946,2016-01-01,1,BEAUTY,0.0,0
3,1945947,2016-01-01,1,BEVERAGES,0.0,0
4,1945948,2016-01-01,1,BOOKS,0.0,0
5,1945949,2016-01-01,1,BREAD/BAKERY,0.0,0
6,1945950,2016-01-01,1,CELEBRATION,0.0,0
7,1945951,2016-01-01,1,CLEANING,0.0,0
8,1945952,2016-01-01,1,DAIRY,0.0,0
9,1945953,2016-01-01,1,DELI,0.0,0


##### <b> Bracket Notation </b></br>

In [3]:
# Bracket notation: a single column name returns that column as a Series;
# a list of column names (df[['a', 'b']]) returns a DataFrame with just those columns
retail_df['family']

0                          AUTOMOTIVE
1                           BABY CARE
2                              BEAUTY
3                           BEVERAGES
4                               BOOKS
                      ...            
1054939                       POULTRY
1054940                PREPARED FOODS
1054941                       PRODUCE
1054942    SCHOOL AND OFFICE SUPPLIES
1054943                       SEAFOOD
Name: family, Length: 1054944, dtype: object

##### <b> Dot Notation </b></br> - Only works when the column name is a valid Python identifier (no spaces or special characters) and does not clash with an existing DataFrame attribute or method

In [4]:
# Dot notation: the column is accessed as an attribute and returns the same Series as retail_df['family']
retail_df.family

0                          AUTOMOTIVE
1                           BABY CARE
2                              BEAUTY
3                           BEVERAGES
4                               BOOKS
                      ...            
1054939                       POULTRY
1054940                PREPARED FOODS
1054941                       PRODUCE
1054942    SCHOOL AND OFFICE SUPPLIES
1054943                       SEAFOOD
Name: family, Length: 1054944, dtype: object

##### <b> Use Series Operations on DataFrame Columns </b></br>

In [5]:
# Count the distinct values in the 'family' column (returns a single integer)
retail_df['family'].nunique()

33

In [6]:
# .unique() returns the distinct values as an array; wrapping it in pd.Series() displays them as a Series instead
pd.Series(retail_df['family'].unique())

0                     AUTOMOTIVE
1                      BABY CARE
2                         BEAUTY
3                      BEVERAGES
4                          BOOKS
5                   BREAD/BAKERY
6                    CELEBRATION
7                       CLEANING
8                          DAIRY
9                           DELI
10                          EGGS
11                  FROZEN FOODS
12                     GROCERY I
13                    GROCERY II
14                      HARDWARE
15            HOME AND KITCHEN I
16           HOME AND KITCHEN II
17               HOME APPLIANCES
18                     HOME CARE
19                    LADIESWEAR
20               LAWN AND GARDEN
21                      LINGERIE
22              LIQUOR,WINE,BEER
23                     MAGAZINES
24                         MEATS
25                 PERSONAL CARE
26                  PET SUPPLIES
27       PLAYERS AND ELECTRONICS
28                       POULTRY
29                PREPARED FOODS
30        

In [7]:
# value_counts() tallies how many rows each unique value has (most frequent first by default);
# .iloc[:5] then keeps only the first 5 entries of that result
retail_df['family'].value_counts().iloc[:5]

family
AUTOMOTIVE                    31968
HOME APPLIANCES               31968
SCHOOL AND OFFICE SUPPLIES    31968
PRODUCE                       31968
PREPARED FOODS                31968
Name: count, dtype: int64

In [8]:
# Mean of the 'sales' column, rounded to 2 decimal places
retail_df['sales'].mean().round(2)

457.72

In [9]:
# Sum of all the values in the 'sales' column, rounded to 2 decimal places
retail_df['sales'].sum().round(2)

482871591.33

In [10]:
# Select multiple columns by passing a list of names inside the brackets (df[['a', 'b']]);
# handy for non-consecutive columns, and the result is a DataFrame
retail_df[['family', 'store_nbr']]

Unnamed: 0,family,store_nbr
0,AUTOMOTIVE,1
1,BABY CARE,1
2,BEAUTY,1
3,BEVERAGES,1
4,BOOKS,1
...,...,...
1054939,POULTRY,9
1054940,PREPARED FOODS,9
1054941,PRODUCE,9
1054942,SCHOOL AND OFFICE SUPPLIES,9


In [11]:
# The multi-column selection is itself a DataFrame, so it can be sliced by position with .iloc[]
# (here: the first 5 rows)

retail_df[['family', 'store_nbr']].iloc[:5]

Unnamed: 0,family,store_nbr
0,AUTOMOTIVE,1
1,BABY CARE,1
2,BEAUTY,1
3,BEVERAGES,1
4,BOOKS,1


##### <b> Accessing Data with .iloc[] </b></br> Access DataFrames by the row and column indices </br> iloc[`row_start`:`row_stop`, `column_start`:`column_stop`]

In [12]:
# Grab the first 5 rows (positions 0 to 4) and the 2nd through 4th columns (positions 1, 2, 3);
# .iloc[] slices exclude the stop position

retail_df.iloc[:5, 1:4]

Unnamed: 0,date,store_nbr,family
0,2016-01-01,1,AUTOMOTIVE
1,2016-01-01,1,BABY CARE
2,2016-01-01,1,BEAUTY
3,2016-01-01,1,BEVERAGES
4,2016-01-01,1,BOOKS


##### <b> Accessing Data with .loc[] </b></br> Access DataFrames by the row and column labels </br> loc[`row_start`:`row_stop`, `column_start`:`column_stop`] (unlike .iloc[], both the start and stop labels are included)

In [13]:
# .loc[] with all rows (:) and a single column label without [] around it returns a Series
retail_df.loc[:, 'date']

0          2016-01-01
1          2016-01-01
2          2016-01-01
3          2016-01-01
4          2016-01-01
              ...    
1054939    2017-08-15
1054940    2017-08-15
1054941    2017-08-15
1054942    2017-08-15
1054943    2017-08-15
Name: date, Length: 1054944, dtype: object

In [14]:
# .loc[] with all rows (:) and a single column label wrapped in [] (a one-item list) returns a DataFrame
retail_df.loc[:, ['date']]

Unnamed: 0,date
0,2016-01-01
1,2016-01-01
2,2016-01-01
3,2016-01-01
4,2016-01-01
...,...
1054939,2017-08-15
1054940,2017-08-15
1054941,2017-08-15
1054942,2017-08-15


In [15]:
# .loc[] with all rows (:) and a list of column labels returns a DataFrame with just those columns
retail_df.loc[:, ['date', 'sales']]

Unnamed: 0,date,sales
0,2016-01-01,0.000
1,2016-01-01,0.000
2,2016-01-01,0.000
3,2016-01-01,0.000
4,2016-01-01,0.000
...,...,...
1054939,2017-08-15,438.133
1054940,2017-08-15,154.553
1054941,2017-08-15,2419.729
1054942,2017-08-15,121.000


In [16]:
# .loc[] with all rows (:) and a label slice of columns; the slice includes both the
# start ('date') and stop ('sales') columns and everything in between
retail_df.loc[:, 'date':'sales']

Unnamed: 0,date,store_nbr,family,sales
0,2016-01-01,1,AUTOMOTIVE,0.000
1,2016-01-01,1,BABY CARE,0.000
2,2016-01-01,1,BEAUTY,0.000
3,2016-01-01,1,BEVERAGES,0.000
4,2016-01-01,1,BOOKS,0.000
...,...,...,...,...
1054939,2017-08-15,9,POULTRY,438.133
1054940,2017-08-15,9,PREPARED FOODS,154.553
1054941,2017-08-15,9,PRODUCE,2419.729
1054942,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000


In [17]:
# .loc[] slices include the stop label, so :10 selects row labels 0 through 10 (11 rows, not 10),
# combined here with a label slice of columns
retail_df.loc[:10, 'date':'sales']

Unnamed: 0,date,store_nbr,family,sales
0,2016-01-01,1,AUTOMOTIVE,0.0
1,2016-01-01,1,BABY CARE,0.0
2,2016-01-01,1,BEAUTY,0.0
3,2016-01-01,1,BEVERAGES,0.0
4,2016-01-01,1,BOOKS,0.0
5,2016-01-01,1,BREAD/BAKERY,0.0
6,2016-01-01,1,CELEBRATION,0.0
7,2016-01-01,1,CLEANING,0.0
8,2016-01-01,1,DAIRY,0.0
9,2016-01-01,1,DELI,0.0


##### <b> Dropping Rows and Columns </b></br> .drop() method drops rows and columns from a DataFrame </br> - `axis=0` to drop rows: typically drop rows via slicing or filtering instead of .drop() method </br> - `axis=1` to drop columns </br> - `inplace=True` permanently removes rows/columns from DataFrame: Typically better to save into a new DataFrame so the original DataFrame is still available


In [18]:
# Column labels of the DataFrame, returned as a pandas Index
retail_df.columns

Index(['id', 'date', 'store_nbr', 'family', 'sales', 'onpromotion'], dtype='object')

In [19]:
# Returns the first 5 rows of a copy of the DataFrame without the 'id' column
# (it is redundant with the pandas index); retail_df itself is not modified
retail_df.drop('id', axis=1).head()

Unnamed: 0,date,store_nbr,family,sales,onpromotion
0,2016-01-01,1,AUTOMOTIVE,0.0,0
1,2016-01-01,1,BABY CARE,0.0,0
2,2016-01-01,1,BEAUTY,0.0,0
3,2016-01-01,1,BEVERAGES,0.0,0
4,2016-01-01,1,BOOKS,0.0,0


In [20]:
# Permanently remove 'id' and 'onpromotion' from retail_df itself (inplace=True modifies the
# existing DataFrame instead of returning a new one)
# NOTE: this cell is not re-runnable; running it a second time raises a KeyError because the
# columns are already gone (reload retail_df first)
retail_df.drop(['id', 'onpromotion'], axis=1, inplace=True)
# Preview the first 5 rows to confirm the columns were dropped
retail_df.head()

Unnamed: 0,date,store_nbr,family,sales
0,2016-01-01,1,AUTOMOTIVE,0.0
1,2016-01-01,1,BABY CARE,0.0
2,2016-01-01,1,BEAUTY,0.0
3,2016-01-01,1,BEVERAGES,0.0
4,2016-01-01,1,BOOKS,0.0


##### <b> Identifying Duplicate Rows </b></br> .duplicated() method identifies duplicate rows of data - this means every column of that row is repeated exactly in another row </br> - specify `subset=column(s)` to look for duplicates across a subset of columns (so only the values in those column(s) are compared) </br> If the number of unique values (`.nunique()`) in a column is less than the total number of rows, then that column contains duplicate values


In [21]:
# Build a small demo DataFrame from a dictionary ({column name: list of values}).
# Rows 0 and 1 are exact duplicates, and 'Dairy' appears three times in 'product'.
products_df = pd.DataFrame(
    dict(
        product=['Dairy', 'Dairy', 'Dairy', 'Vegetable', 'Fruits'],
        price=[2.56, 2.56, 4.55, 2.74, 5.44],
    )
)
products_df

Unnamed: 0,product,price
0,Dairy,2.56
1,Dairy,2.56
2,Dairy,4.55
3,Vegetable,2.74
4,Fruits,5.44


In [23]:
# Shape of the DataFrame as a (rows, columns) tuple
products_df.shape

(5, 2)

In [24]:
# Number of unique values in each DataFrame column
# The number of unique values (product: 3, price: 4) is less than the number of rows (5),
# so both columns contain duplicate values
products_df.nunique()


product    3
price      4
dtype: int64

In [26]:
# With no arguments, .duplicated() compares entire rows and flags the repeats (the first occurrence stays False);
# row 1 is True because it is an exact duplicate of row 0
products_df.duplicated()

0    False
1     True
2    False
3    False
4    False
dtype: bool

In [27]:
# Use the subset= argument to look for duplicates in specific column(s) only;
# rows 1 and 2 are True because 'Dairy' already appears in row 0
products_df.duplicated(subset='product')

0    False
1     True
2     True
3    False
4    False
dtype: bool

##### <b>Drop Duplicate Rows </b></br> .drop_duplicates() method drops duplicate rows (where there are rows that are exact duplicates of each other) </br> - specify `subset=column(s)` to drop duplicate rows based on specific column(s)</br> Passable arguments: - `keep=`('first', 'last', False): 'first' is default, False drops all duplicates </br> - `inplace=`(True, False): False is default, True modifies the current DataFrame, False returns a new DataFrame with the duplicates removed </br> - `ignore_index=`(True, False): False is default, True re-indexes the resulting DataFrame starting at 0


In [32]:
# Drop rows with a duplicated 'product' value, keeping the last occurrence instead of the first (keep='last');
# ignore_index=True renumbers the result from 0. A new DataFrame is returned and the original
# is left untouched (modifying it would require inplace=True)
products_df.drop_duplicates(subset='product', keep='last', ignore_index=True)

Unnamed: 0,product,price
0,Dairy,4.55
1,Vegetable,2.74
2,Fruits,5.44


In [34]:
# The original DataFrame still contains all rows
products_df

Unnamed: 0,product,price
0,Dairy,2.56
1,Dairy,2.56
2,Dairy,4.55
3,Vegetable,2.74
4,Fruits,5.44
