# Pandas Useful Methods

## Import the dataset

In [1]:
import pandas as pd
import os

# Current working directory of the kernel. This is normally the folder the
# notebook was started from, but it is not guaranteed to be the notebook's
# own location.
os_path = os.getcwd()
# Build the dataset path with os.path.join so the correct separator is used on
# every operating system. A hand-written '\datasets\...' string only works on
# Windows and depends on '\d' and '\O' being invalid escape sequences.
datapath = os.path.join(os_path, 'datasets', 'Online_Retail.csv')
# NOTE(review): the explicit ISO-8859-1 encoding suggests the CSV is not valid
# UTF-8 -- confirm against the data source.
sales_data = pd.read_csv(datapath, encoding='ISO-8859-1')
print(sales_data.head())

  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   

    InvoiceDate  UnitPrice  CustomerID         Country  
0  12/1/10 8:26       2.55     17850.0  United Kingdom  
1  12/1/10 8:26       3.39     17850.0  United Kingdom  
2  12/1/10 8:26       2.75     17850.0  United Kingdom  
3  12/1/10 8:26       3.39     17850.0  United Kingdom  
4  12/1/10 8:26       3.39     17850.0  United Kingdom  


## Useful methods for DataFrames

### Get the column names

In [5]:
# `.columns` holds the column labels of the DataFrame as an Index object
column_names = sales_data.columns
# Bare last expression so the notebook displays the labels
column_names

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')

### Get the number of rows and columns

Use `shape` to get the number of rows and columns

In [8]:
# `shape` is a (rows, columns) tuple, so each count can be read by position
sales_rows = sales_data.shape[0]
sales_cols = sales_data.shape[1]

# Report both dimensions
print(f'Number of rows: {sales_rows}')
print(f'Number of columns: {sales_cols}')

Number of rows: 541909
Number of columns: 8


### Get a specific column

In [13]:
# Selecting a single column label with [] returns that column as a Series
col_quantity = sales_data['Quantity']
# Bare last expression so the notebook displays the Series
col_quantity

0          6
1          6
2          8
3          6
4          6
          ..
541904    12
541905     6
541906     4
541907     4
541908     3
Name: Quantity, Length: 541909, dtype: int64

Get a value in a specific row

In [14]:
# Look up the value by its index label; `.loc` makes the label-based lookup explicit
print(col_quantity.loc[2])

8


### Get a summary of the main statistical values

In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) per column;
# in the output below only the numeric columns are included
summary_sales = sales_data.describe()
# Bare last expression so the notebook renders the summary table
summary_sales

Unnamed: 0,Quantity,UnitPrice,CustomerID
count,541909.0,541909.0,406829.0
mean,9.55225,4.611114,15287.69057
std,218.081158,96.759853,1713.600303
min,-80995.0,-11062.06,12346.0
25%,1.0,1.25,13953.0
50%,3.0,2.08,15152.0
75%,10.0,4.13,16791.0
max,80995.0,38970.0,18287.0


### Head method

The `.head([n])` method returns the first `n` rows of a DataFrame (a data table) or the first `n` values of a Series (a single column of data). If `n` is omitted, the default value is 5.

In [16]:
# No argument given, so the default of 5 rows is used
sales_data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/10 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/10 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/10 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/10 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/10 8:26,3.39,17850.0,United Kingdom


In [17]:
# `head` also works on a Series: here the first 8 values of the Quantity column
col_quantity.head(8)

0    6
1    6
2    8
3    6
4    6
5    2
6    6
7    6
Name: Quantity, dtype: int64

### Count method

Notice that the `count` method only counts non-missing values, i.e. entries that are not null (`None`/`NaN`).

In [26]:
# Number of non-null entries per column; in the output below Description and
# CustomerID come out lower than the other columns because they contain missing values
sales_data.count()

InvoiceNo      541909
StockCode      541909
Description    540455
Quantity       541909
InvoiceDate    541909
UnitPrice      541909
CustomerID     406829
Country        541909
dtype: int64

## Find the indices of rows with null or empty values

In [19]:
# Two small one-column frames with missing values at different positions
df_A = pd.DataFrame(data=dict(Num=[4, None, 9, None, 77]))
df_B = pd.DataFrame(data=dict(Num=[44, None, 6, 22, None]))

# Display both frames, one after the other
print(f'df_A:\n {df_A}', f'df_B:\n {df_B}', sep='\n')

df_A:
     Num
0   4.0
1   NaN
2   9.0
3   NaN
4  77.0
df_B:
     Num
0  44.0
1   NaN
2   6.0
3  22.0
4   NaN


In [24]:
# Index labels of the rows whose 'Num' value is missing.
# `isna()` already returns a boolean mask, so it can filter the frame directly
# without comparing it to True.
null_A = df_A[df_A['Num'].isna()].index
null_B = df_B[df_B['Num'].isna()].index
print(f'null_A:\n {null_A}')
print(f'null_B:\n {null_B}')

print(20*'-')

# Index labels at which the value is missing in both frames
print(null_A.intersection(null_B))

null_A:
 Index([1, 3], dtype='int64')
null_B:
 Index([1, 4], dtype='int64')
--------------------
Index([1], dtype='int64')
