# Working with Pandas for CSV files

In [1]:
import pandas as pd

In [2]:
# Load the presidents dataset hosted by Sololearn; the 'name' column becomes the row index.
us_president = pd.read_csv(
    'https://sololearn.com/uploads/files/president_heights_party.csv',
    index_col='name',
)

# The full frame is too long to print, so preview only the first five rows.
print("DATASET")
print(us_president.iloc[:5], "\n")

# shape is a (rows, columns) tuple; size is the total number of cells (rows * columns).
print("DF Shape =", us_president.shape)
print("DF Size =", us_president.size)
print(
    "DF Rows =", len(us_president),
    "& DF Column =", len(us_president.columns),
)

DATASET
                   order  age  height                  party
name                                                        
George Washington      1   57     189                   none
John Adams             2   61     170             federalist
Thomas Jefferson       3   57     189  democratic-republican
James Madison          4   57     163  democratic-republican
James Monroe           5   58     183  democratic-republican 

DF Shape = (45, 4)
DF Size = 180
DF Rows = 45 & DF Column = 4


In [3]:
# Peek at both ends of the frame to confirm the values landed in the right columns.
print("HEAD & TAIL")
print(us_president.head(), "\n")  # head() returns the first 5 rows by default
print(us_president.tail(n=5))     # tail() also defaults to 5 rows; spelled out here

HEAD & TAIL
                   order  age  height                  party
name                                                        
George Washington      1   57     189                   none
John Adams             2   61     170             federalist
Thomas Jefferson       3   57     189  democratic-republican
James Madison          4   57     163  democratic-republican
James Monroe           5   58     183  democratic-republican 

                   order  age  height       party
name                                             
George H. W. Bush     41   64     188  republican
Bill Clinton          42   46     188  democratic
George W. Bush        43   54     182  republican
Barack Obama          44   47     185  democratic
Donald J. Trump       45   70     191  republican


In [4]:
# .info() prints a summary of the DataFrame: the index range, each column's
# name, non-null count and dtype, and the approximate memory usage.
us_president.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45 entries, George Washington to Donald J. Trump
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   order   45 non-null     int64 
 1   age     45 non-null     int64 
 2   height  45 non-null     int64 
 3   party   45 non-null     object
dtypes: int64(3), object(1)
memory usage: 1.8+ KB


In [5]:
# The .columns attribute holds the column labels of the DataFrame.
# 'name' is not listed because it was set as the index when the CSV was loaded.
us_president.columns

Index(['order', 'age', 'height', 'party'], dtype='object')

In [6]:
# The column labels shown above are the keys used to pull out individual columns.

# A single label inside the brackets returns that one column as a Series
print(us_president['age'].head())

# A list of labels inside the brackets returns a DataFrame with just those columns
print(us_president[['age', 'height']].head())

# A Series is one-dimensional, so its shape tuple holds only the row count
us_president['age'].shape

name
George Washington    57
John Adams           61
Thomas Jefferson     57
James Madison        57
James Monroe         58
Name: age, dtype: int64
                   age  height
name                          
George Washington   57     189
John Adams          61     170
Thomas Jefferson    57     189
James Madison       57     163
James Monroe        58     183


(45,)

# Indexing in Pandas

- We can use `.loc` to select data by its label, or by a conditional (boolean) expression. This is useful because you don't need to remember the numeric position of a row in the DF.

- We can also use `.iloc` to select data by integer position (row and column numbers, starting at 0).

Both of them may be used with a boolean array to subset the data.

In [7]:
# Example .loc: select a single row by its index label (the president's name).
# The result is a Series whose entries are that row's column values.
us_president.loc['George H. W. Bush']

order             41
age               64
height           188
party     republican
Name: George H. W. Bush, dtype: object

In [8]:
# Advanced .loc
# The second argument to .loc narrows the selection to particular columns.

# A label slice keeps every column from 'order' through 'party' (both ends included)
print(
    "Order to Party =\n",
    us_president.loc['George H. W. Bush', 'order':'party'],
    "\n",
)

# A list of labels keeps exactly the named columns
print(
    "Specific Column =\n",
    us_president.loc['George H. W. Bush', ['order', 'party']],
)

Order to Party =
 order             41
age               64
height           188
party     republican
Name: George H. W. Bush, dtype: object 

Specific Column =
 order            41
party    republican
Name: George H. W. Bush, dtype: object


In [9]:
# Example .iloc: select rows by integer position rather than by label.
# The slice :5 returns the first five rows (positions 0 through 4).
us_president.iloc[:5]

Unnamed: 0_level_0,order,age,height,party
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
George Washington,1,57,189,none
John Adams,2,61,170,federalist
Thomas Jefferson,3,57,189,democratic-republican
James Madison,4,57,163,democratic-republican
James Monroe,5,58,183,democratic-republican
