# pandas Tutorial

Pandas is an open source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language.

In this workbook:

- What are DataFrames?
- What are Series?
- Different operations in pandas

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

DataFrame: many data sets used in machine learning come from Excel files, CSV files, etc. pandas loads them into a DataFrame, a table of rows and columns similar to an Excel sheet, so they can be analysed.

In [2]:
# Build a 5x4 DataFrame holding the integers 0..19 with explicit row and column labels.
row_labels = [f'Row{i}' for i in range(1, 6)]
column_labels = [f'Column{j}' for j in range(1, 5)]
values = np.arange(20).reshape(5, 4)
df = pd.DataFrame(data=values, index=row_labels, columns=column_labels)
# for a large frame, df.head() would display only the first 5 rows
df

Unnamed: 0,Column1,Column2,Column3,Column4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19


In [3]:
# convert the DataFrame to a CSV file and read it in excel
# df.to_csv('Test.csv')
# uncomment the line above to create the CSV file

In [4]:
# access the elements in the DataFrame (indexing)
# one way is .loc, which selects by label (here the row label 'Row1');
# the other is .iloc, which selects rows and columns by integer position

df.loc['Row1']

Column1    0
Column2    1
Column3    2
Column4    3
Name: Row1, dtype: int64

In [5]:
print(type(df))
print(type(df.loc['Row1']))
print(type(df.iloc[0:1, 0:1]))
# selecting a single row by label (df.loc['Row1']) returns a Series;
# slicing with .iloc[0:1, 0:1] keeps the result a DataFrame, even though it holds a single cell

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>


In [6]:
# shape is a (number of rows, number of columns) tuple
df.shape

(5, 4)

In [7]:
# .iloc slices by integer position, exactly like numpy:
# the part before the comma selects rows, the part after it selects columns
# here: the first two rows and the first two columns
df.iloc[:2, :2]


Unnamed: 0,Column1,Column2
Row1,0,1
Row2,4,5


In [8]:
# convert the DataFrame to a NumPy array
# this removes the row and column labels
# DataFrame.to_numpy() is the accessor the pandas documentation recommends over .values;
# selecting everything with .iloc[:, :] first is not needed
df.to_numpy()

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19]])

In [9]:
# count the missing values in each column (isna is the same check as isnull)
df.isna().sum()

Column1    0
Column2    0
Column3    0
Column4    0
dtype: int64

In [10]:
# .value_counts() counts how many times each distinct value occurs in a column
df['Column1'].value_counts()

Column1
0     1
4     1
8     1
12    1
16    1
Name: count, dtype: int64

In [11]:
# .unique() returns the distinct values of a column as an array
df['Column1'].unique()

array([ 0,  4,  8, 12, 16])

In [12]:
# single square brackets with one column name select that column as a Series
df['Column1']

Row1     0
Row2     4
Row3     8
Row4    12
Row5    16
Name: Column1, dtype: int64

In [13]:
# confirm that a single selected column is a Series
type(df['Column1'])

pandas.core.series.Series

In [14]:
df[['Column1', 'Column2']]
# to select several columns, pass a list of column names:
# the outer brackets are the indexing operator, the inner brackets are the list
# the result is a DataFrame, not a Series

Unnamed: 0,Column1,Column2
Row1,0,1
Row2,4,5
Row3,8,9
Row4,12,13
Row5,16,17


In [15]:
# show the current working directory; relative file names passed to read_csv are resolved against it
# Path.cwd() replaces the shell escape `!pwd`, which only works where a Unix shell is available
working_dir = Path.cwd()
working_dir

/Users/kavehnamvar/Documents/learn to program/Python learning 2025/LearnPython2025


In [None]:
# load mercedesbenz.csv, which sits in the same directory as the notebook;
# the file is comma-separated, which is read_csv's default delimiter
df = pd.read_csv('mercedesbenz.csv')
df.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# df.info() prints a summary of the DataFrame:
# index range, number of columns, dtype counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 378 entries, ID to X385
dtypes: float64(1), int64(369), object(8)
memory usage: 12.1+ MB


In [20]:
# print the frame's dimensions and its type, one per line
print(df.shape, type(df), sep='\n')

(4209, 378)
<class 'pandas.core.frame.DataFrame'>


In [None]:
# df.describe() returns summary statistics (count, mean, std, min, quartiles, max) per column
df.describe()
# by default only the numerical columns (int and float) are summarised;
# the object (categorical) columns such as X0 are left out

Unnamed: 0,ID,y,X10,X11,X12,X13,X14,X15,X16,X17,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
count,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,...,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0
mean,4205.960798,100.669318,0.013305,0.0,0.075077,0.057971,0.42813,0.000475,0.002613,0.007603,...,0.318841,0.057258,0.314802,0.02067,0.009503,0.008078,0.007603,0.001663,0.000475,0.001426
std,2437.608688,12.679381,0.11459,0.0,0.263547,0.233716,0.494867,0.021796,0.051061,0.086872,...,0.466082,0.232363,0.464492,0.142294,0.097033,0.089524,0.086872,0.040752,0.021796,0.037734
min,0.0,72.11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2095.0,90.82,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4220.0,99.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,6314.0,109.01,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,8417.0,265.32,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [23]:
# count the occurrences of each value in column X0 and show the five most frequent ones
df['X0'].value_counts().head()

X0
z     360
ak    349
y     324
ay    313
t     306
Name: count, dtype: int64

In [None]:
# get all the distinct values of column X0 as an array
df['X0'].unique()

array(['k', 'az', 't', 'al', 'o', 'w', 'j', 'h', 's', 'n', 'ay', 'f', 'x',
       'y', 'aj', 'ak', 'am', 'z', 'q', 'at', 'ap', 'v', 'af', 'a', 'e',
       'ai', 'd', 'aq', 'c', 'aa', 'ba', 'as', 'i', 'r', 'b', 'ax', 'bc',
       'u', 'ad', 'au', 'm', 'l', 'aw', 'ao', 'ac', 'g', 'ab'],
      dtype=object)

In [33]:
# boolean filtering: build a True/False mask and keep only the rows where it is True
high_y_mask = df['y'] > 150
df[high_y_mask]

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
253,505,150.43,t,b,as,c,d,i,l,x,...,0,0,1,0,0,0,0,1,0,0
342,681,169.91,aa,l,ak,f,d,i,c,d,...,0,0,0,0,0,0,0,0,0,0
429,836,154.87,ak,l,ae,f,d,d,g,w,...,0,0,0,0,0,0,0,0,0,0
883,1770,265.32,y,r,ai,f,d,ag,l,t,...,0,0,0,0,0,0,0,0,0,0
889,1784,158.53,aj,l,as,f,d,ag,k,e,...,0,0,0,0,0,0,0,0,0,0
1060,2111,154.43,w,v,r,c,d,ag,d,q,...,1,0,0,0,0,0,0,0,0,0
1203,2396,160.87,j,o,as,f,d,ab,g,p,...,1,0,0,0,0,0,0,0,0,0
1205,2403,150.89,x,b,m,c,d,ab,j,j,...,0,0,1,0,0,0,0,0,0,0
1269,2511,152.32,s,aa,m,c,d,ab,g,g,...,1,0,0,0,0,0,0,0,0,0
1459,2903,167.45,ai,b,ae,a,d,ac,g,m,...,0,0,1,0,0,0,0,0,0,0


In [37]:
# largest value in column y, computed with the Series' own max() method
df['y'].max()

265.32

In [35]:
# Python's built-in max() also accepts a Series and gives the same result here;
# df['y'].max() is the idiomatic pandas form
max(df['y'])

265.32

## CSV

In [40]:
from io import StringIO, BytesIO

In [48]:
# a small CSV document held in memory as a plain string: one header line and three records
csv_lines = (
    'col1,col2,col3\n',
    'x,y,1\n',
    'a,b,2\n',
    'c,d,3\n',
)
data = ''.join(csv_lines)
type(data)

str

In [49]:
# StringIO wraps the string in a file-like object, so read_csv can parse it without a file on disk
pd.read_csv(StringIO(data))

Unnamed: 0,col1,col2,col3
0,x,y,1
1,a,b,2
2,c,d,3


In [51]:
# read only specific columns by passing their names to the usecols parameter
df = pd.read_csv(StringIO(data), usecols=['col1', 'col3'])
# save the reduced frame to disk first ...
df.to_csv('Test.csv')
# ... and keep the frame as the cell's last expression: a notebook only displays the
# value of the final expression, so a bare `df` in the middle of the cell shows nothing
df

In [57]:
# specifying column data types: start from a small all-numeric CSV string
csv_lines = (
    'a,b,c,d\n',
    '1,2,3,4\n',
    '5,6,7,8\n',
    '9,10,11,12',
)
data = ''.join(csv_lines)
print(data)

a,b,c,d
1,2,3,4
5,6,7,8
9,10,11,12


In [58]:
df = pd.read_csv(StringIO(data), dtype=object) # dtype=object keeps every value as a Python string
df
# the numbers are therefore read as strings, not as integers
# dtype=int or dtype=float would parse every column as that numeric type instead

Unnamed: 0,a,b,c,d
0,1,2,3,4
1,5,6,7,8
2,9,10,11,12


In [59]:
# the column's dtype is object, i.e. the values were kept as strings
df['a']

0    1
1    5
2    9
Name: a, dtype: object

In [60]:
# fetch the value at row label 1 of column 'a' in a single .loc lookup
# it is a string because the frame was read with dtype=object
df.loc[1, 'a']

'5'

In [64]:
df = pd.read_csv(StringIO(data), dtype=int) # dtype=int parses every column as integers
print(df['a'])
df['a'][1]
# this time the element is an integer, not a string

0    1
1    5
2    9
Name: a, dtype: int64


5

In [66]:
# specifying data types per column
df = pd.read_csv(StringIO(data), dtype={'b':int, 'c':float, 'a':'Int64'}) 
# the dtype parameter accepts a dictionary mapping column name -> data type
# 'Int64' (capital I) is pandas' nullable integer dtype
# column d is not listed in the dictionary, so its type is inferred by read_csv
df

Unnamed: 0,a,b,c,d
0,1,2,3.0,4
1,5,6,7.0,8
2,9,10,11.0,12


In [68]:
# a single element taken from the Int64 column comes back as a NumPy integer scalar
type(df['a'][1])

numpy.int64

In [69]:
# list the dtype of every column to verify the per-column types requested above
df.dtypes

a      Int64
b      int64
c    float64
d      int64
dtype: object

In [80]:
# a CSV string whose first column is literally named 'index'
csv_lines = (
    'index,a,b,c\n',
    '4,apple,bat,5.7\n',
    '8,orange,cow,10',
)
data = ''.join(csv_lines)
# read without extra options: 'index' is treated as an ordinary data column
# and the DataFrame gets its own default index 0, 1, 2, ...
pd.read_csv(StringIO(data))

Unnamed: 0,index,a,b,c
0,4,apple,bat,5.7
1,8,orange,cow,10.0


In [81]:
# to use a column as the row index instead, pass index_col
# index_col=0 selects the first column (here the one named 'index') as the row labels
pd.read_csv(StringIO(data), index_col=0)

Unnamed: 0_level_0,a,b,c
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,apple,bat,5.7
8,orange,cow,10.0


In [None]:
data = ('a,b,c\n'
        '4,apple,bat,\n'
        '8,orange,cow,')
pd.read_csv(StringIO(data))
# every data row ends with a trailing comma, so each row has one more field than the header;
# read_csv therefore uses the first column (4, 8) as the index and column c ends up empty

Unnamed: 0,a,b,c
4,apple,bat,
8,orange,cow,


In [85]:
# index_col=False tells read_csv not to use the first column as the index;
# the values line up with the header again and the empty trailing field is dropped
pd.read_csv(StringIO(data), index_col=False)

Unnamed: 0,a,b,c
0,4,apple,bat
1,8,orange,cow


In [86]:
# combining usecols and index_col
# the trailing-comma data from the previous example, rebuilt here
csv_lines = (
    'a,b,c\n',
    '4,apple,bat,\n',
    '8,orange,cow,',
)
data = ''.join(csv_lines)
pd.read_csv(StringIO(data), index_col=False, usecols=['b', 'c'])

Unnamed: 0,b,c
0,apple,bat
1,orange,cow


In [87]:
# quoting and escape characters, useful in NLP
# the quoted field contains inner double quotes, each preceded by a backslash
data = 'a,b\n"hello, \\"Bob\\", nice to see you",5'
data

'a,b\n"hello, \\"Bob\\", nice to see you",5'

In [92]:
# without escapechar the backslashes are not treated as escapes,
# so the inner quotes break the field apart and the row is parsed incorrectly
pd.read_csv(StringIO(data))

Unnamed: 0,a,b
"hello, \Bob\""","nice to see you""",5


In [89]:
pd.read_csv(StringIO(data), escapechar='\\')
# escapechar='\\' makes the backslash escape the quote that follows it,
# so the inner quotes stay part of the field and the row is parsed into two columns

Unnamed: 0,a,b
0,"hello, ""Bob"", nice to see you",5


In [99]:
# for tab-separated values, pass sep='\t'
# read_csv also accepts a URL in place of a local file path
# df = pd.read_csv('https://download.bls.gov/pub/time.series/cu/cu.item', sep='\t')
# df.head()
# files served from a URL are not tab-separated in general; check the delimiter of each file