### Load NumPy and Pandas

In [56]:
import numpy as np
import pandas as pd

### Pandas Basics

In [57]:
# Small 4x3 demo frame: rows are labelled w-z, columns are labelled A-C.
df = pd.DataFrame(
    data=[
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
        [10, 11, 12],
    ],
    index=["w", "x", "y", "z"],
    columns=["A", "B", "C"],
)

In [58]:
# A bare expression on the last line of a cell is rendered as the cell's output.
df

Unnamed: 0,A,B,C
w,1,2,3
x,4,5,6
y,7,8,9
z,10,11,12


In [59]:
# First 2 rows of the frame.
df.head(2)

Unnamed: 0,A,B,C
w,1,2,3
x,4,5,6


In [60]:
# Last 3 rows of the frame.
df.tail(3)

Unnamed: 0,A,B,C
x,4,5,6
y,7,8,9
z,10,11,12


In [61]:
# Column labels, returned as a pandas Index.
df.columns

Index(['A', 'B', 'C'], dtype='object')

In [62]:
# Row labels, converted to a plain Python list.
df.index.to_list()

['w', 'x', 'y', 'z']

In [63]:
# Structural summary: index range, per-column non-null counts and dtypes, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, w to z
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A       4 non-null      int64
 1   B       4 non-null      int64
 2   C       4 non-null      int64
dtypes: int64(3)
memory usage: 128.0+ bytes


In [64]:
# Descriptive statistics for each column: count, mean, std, min, quartiles, max.
df.describe()

Unnamed: 0,A,B,C
count,4.0,4.0,4.0
mean,5.5,6.5,7.5
std,3.872983,3.872983,3.872983
min,1.0,2.0,3.0
25%,3.25,4.25,5.25
50%,5.5,6.5,7.5
75%,7.75,8.75,9.75
max,10.0,11.0,12.0


In [65]:
# Dimensions as a (rows, columns) tuple.
df.shape

(4, 3)

In [66]:
# Total number of cells, i.e. rows * columns (4 * 3 = 12 for this frame).
df.size

12

In [67]:
# Number of distinct values in each column.
df.nunique()

A    4
B    4
C    4
dtype: int64

In [68]:
# Distinct values of column A, returned as a NumPy array.
df['A'].unique()

array([ 1,  4,  7, 10])

### Loading data from Files

In [69]:
# Load data from CSV files (paths are relative to the notebook's working directory).
# `coffee` is a small sales table; `bios` holds one row per athlete.
coffee = pd.read_csv("./data/coffee.csv")
bios = pd.read_csv('./data/bios.csv')

In [70]:
# Load data from a Parquet file.
results = pd.read_parquet('./data/results.parquet')

In [71]:
# Load data from an Excel workbook.
# No sheet_name is given, so pandas reads the first sheet (its documented default).
excel = pd.read_excel("./data/olympics-data.xlsx")

### Accessing data

In [72]:
# print() renders the frame as plain, space-aligned text;
# compare with the rich table produced by display() in the next cell.
print(coffee)

          Day Coffee Type  Units Sold
0      Monday    Espresso          25
1      Monday       Latte          15
2     Tuesday    Espresso          30
3     Tuesday       Latte          20
4   Wednesday    Espresso          35
5   Wednesday       Latte          25
6    Thursday    Espresso          40
7    Thursday       Latte          30
8      Friday    Espresso          45
9      Friday       Latte          35
10   Saturday    Espresso          45
11   Saturday       Latte          35
12     Sunday    Espresso          45
13     Sunday       Latte          35


In [73]:
# display() renders the frame with the notebook's rich table output,
# the same rendering a bare `coffee` on the last line of a cell would get.
display(coffee)

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,15
2,Tuesday,Espresso,30
3,Tuesday,Latte,20
4,Wednesday,Espresso,35
5,Wednesday,Latte,25
6,Thursday,Espresso,40
7,Thursday,Latte,30
8,Friday,Espresso,45
9,Friday,Latte,35


In [74]:
# With no argument, head() returns the first 5 rows.
coffee.head()

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,15
2,Tuesday,Espresso,30
3,Tuesday,Latte,20
4,Wednesday,Espresso,35


In [75]:
# Last 7 rows (one week of days has 14 rows here, so this is the second half).
coffee.tail(7)

Unnamed: 0,Day,Coffee Type,Units Sold
7,Thursday,Latte,30
8,Friday,Espresso,45
9,Friday,Latte,35
10,Saturday,Espresso,45
11,Saturday,Latte,35
12,Sunday,Espresso,45
13,Sunday,Latte,35


In [83]:
# Draw 5 rows at random.
coffee.sample(5,random_state=1) # a fixed random_state makes the same rows come back on every run

Unnamed: 0,Day,Coffee Type,Units Sold
3,Tuesday,Latte,20
7,Thursday,Latte,30
6,Thursday,Espresso,40
2,Tuesday,Espresso,30
10,Saturday,Espresso,45


In [86]:
# loc: label-based selection
# coffee.loc[Rows, Columns]
# The row slice 1:3 is by index label and INCLUDES the end label (rows 1, 2 and 3);
# columns are picked by name.
coffee.loc[1:3,["Day","Units Sold"]]

Unnamed: 0,Day,Units Sold
1,Monday,15
2,Tuesday,30
3,Tuesday,20


In [92]:
# iloc: position-based selection
# coffee.iloc[Row positions, Column positions]
# The row slice 2:4 EXCLUDES the end position (rows 2 and 3 only);
# columns 0 and 2 are "Day" and "Units Sold".
coffee.iloc[2:4,[0,2]]

Unnamed: 0,Day,Units Sold
2,Tuesday,30
3,Tuesday,20


In [93]:
# Use the Day column as the row index so rows can be selected by day name.
# This modifies `coffee` in place; the Day column itself is kept, so the day
# appears both as the index and as a regular column in the next output.
coffee.index=coffee.Day

In [94]:
# Label slice on the Day index: every row from the first "Monday"
# through the last "Wednesday", both ends included.
coffee.loc["Monday":"Wednesday"]

Unnamed: 0_level_0,Day,Coffee Type,Units Sold
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Monday,Monday,Espresso,25
Monday,Monday,Latte,15
Tuesday,Tuesday,Espresso,30
Tuesday,Tuesday,Latte,20
Wednesday,Wednesday,Espresso,35
Wednesday,Wednesday,Latte,25


In [96]:
# Reload from disk to discard the Day index assigned above and get back
# the default integer index that the following cells rely on.
coffee = pd.read_csv('./data/coffee.csv')

In [102]:
# Setting values: assign 10 to "Units Sold" for index labels 1 through 3 (inclusive).
# This modifies `coffee` in place, so every later cell sees the overwritten values.
coffee.loc[1:3,["Units Sold"]]=10
coffee.head()

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,10
2,Tuesday,Espresso,10
3,Tuesday,Latte,10
4,Wednesday,Espresso,35


In [106]:
# Get single values (.at & .iat)
# .at addresses one cell by row label and column name.
coffee.at[1,"Units Sold"]

np.int64(10)

In [108]:
# .iat addresses one cell by integer position: row 1, column 1 ("Coffee Type").
coffee.iat[1,1]

'Latte'

In [115]:
# Sort values: order rows by "Units Sold", largest first.
# sort_values returns a new frame; `coffee` itself is left unchanged.
coffee.sort_values(["Units Sold"], ascending=False)

Unnamed: 0,Day,Coffee Type,Units Sold
10,Saturday,Espresso,45
8,Friday,Espresso,45
12,Sunday,Espresso,45
6,Thursday,Espresso,40
4,Wednesday,Espresso,35
11,Saturday,Latte,35
13,Sunday,Latte,35
9,Friday,Latte,35
7,Thursday,Latte,30
0,Monday,Espresso,25


In [120]:
# Sort by several keys: "Units Sold" ascending, ties broken by "Coffee Type" descending.
# `ascending` takes one boolean per sort key, in the same order as the keys.
coffee.sort_values(["Units Sold", "Coffee Type"], ascending=[True, False])

Unnamed: 0,Day,Coffee Type,Units Sold
1,Monday,Latte,10
3,Tuesday,Latte,10
2,Tuesday,Espresso,10
5,Wednesday,Latte,25
0,Monday,Espresso,25
7,Thursday,Latte,30
9,Friday,Latte,35
11,Saturday,Latte,35
13,Sunday,Latte,35
4,Wednesday,Espresso,35


In [121]:
# Iterate over the DataFrame row by row with a for loop.
# iterrows() yields (index label, row) pairs; each row arrives as a Series,
# which is why every printed block ends with "Name: <label>, dtype: object".
for index,row in coffee.iterrows():
    print(index)
    print(row,"\n\n\n")  # trailing newlines visually separate the rows in the output

0
Day              Monday
Coffee Type    Espresso
Units Sold           25
Name: 0, dtype: object 



1
Day            Monday
Coffee Type     Latte
Units Sold         10
Name: 1, dtype: object 



2
Day             Tuesday
Coffee Type    Espresso
Units Sold           10
Name: 2, dtype: object 



3
Day            Tuesday
Coffee Type      Latte
Units Sold          10
Name: 3, dtype: object 



4
Day            Wednesday
Coffee Type     Espresso
Units Sold            35
Name: 4, dtype: object 



5
Day            Wednesday
Coffee Type        Latte
Units Sold            25
Name: 5, dtype: object 



6
Day            Thursday
Coffee Type    Espresso
Units Sold           40
Name: 6, dtype: object 



7
Day            Thursday
Coffee Type       Latte
Units Sold           30
Name: 7, dtype: object 



8
Day              Friday
Coffee Type    Espresso
Units Sold           45
Name: 8, dtype: object 



9
Day            Friday
Coffee Type     Latte
Units Sold         35
Name: 9, dtype: object 




### Filtering data

In [122]:
# Preview the athlete table; note that height_cm, weight_kg and died_date can be missing.
bios.head()

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
0,1,Jean-François Blanchy,1886-12-12,Bordeaux,Gironde,FRA,France,,,1960-10-02
1,2,Arnaud Boetsch,1969-04-01,Meulan,Yvelines,FRA,France,183.0,76.0,
2,3,Jean Borotra,1898-08-13,Biarritz,Pyrénées-Atlantiques,FRA,France,183.0,76.0,1994-07-17
3,4,Jacques Brugnon,1895-05-11,Paris VIIIe,Paris,FRA,France,168.0,64.0,1978-03-20
4,5,Albert Canet,1878-04-17,Wandsworth,England,GBR,France,,,1930-07-25


In [127]:
# Boolean mask inside .loc: keep the rows whose height_cm is greater than 215 (all columns).
bios.loc[bios["height_cm"]>215]

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
5089,5108,Viktor Pankrashkin,1957-06-19,Moskva (Moscow),Moskva,RUS,Soviet Union,220.0,112.0,1993-07-24
5583,5606,Paulinho Villas Boas,1963-01-26,São Paulo,São Paulo,BRA,Brazil,217.0,106.0,
5673,5696,Gunther Behnke,1963-01-19,Leverkusen,Nordrhein-Westfalen,GER,Germany,221.0,114.0,
5716,5739,Uwe Blab,1962-03-26,München (Munich),Bayern,GER,Germany West Germany,218.0,110.0,
5781,5804,Tommy Burleson,1952-02-24,Crossnore,North Carolina,USA,United States,223.0,102.0,
5796,5819,Andy Campbell,1956-07-21,Melbourne,Victoria,AUS,Australia,218.0,93.0,
6223,6250,Lars Hansen,1954-09-27,København (Copenhagen),Hovedstaden,DEN,Canada,216.0,105.0,
6270,6298,Hu Zhangbao,1963-04-05,,,,People's Republic of China,216.0,135.0,
6409,6440,Sergey Kovalenko,1947-08-11,,,,Soviet Union,216.0,111.0,2004-11-18
6420,6451,Jānis Krūmiņš,1930-01-30,Cēsis,Cēsu novads,LAT,Soviet Union,218.0,141.0,1994-11-20


In [131]:
# Row filter plus column selection in one .loc call:
# rows taller than 220 cm, only the name and height_cm columns.
bios.loc[bios["height_cm"]>220,["name","height_cm"]]

Unnamed: 0,name,height_cm
5673,Gunther Behnke,221.0
5781,Tommy Burleson,223.0
6978,Arvydas Sabonis,223.0
89070,Yao Ming,226.0
89075,Roberto Dueñas,221.0
120266,Zhang Zhaoxu,221.0


In [139]:
# Short-hand syntax (without .loc): filter the rows first, then pick columns
# from the filtered result. Fine for reading data; when assigning values use
# the single-call form bios.loc[mask, columns] instead.
bios[bios["height_cm"]>220][["name","height_cm"]]

Unnamed: 0,name,height_cm
5673,Gunther Behnke,221.0
5781,Tommy Burleson,223.0
6978,Arvydas Sabonis,223.0
89070,Yao Ming,226.0
89075,Roberto Dueñas,221.0
120266,Zhang Zhaoxu,221.0


In [149]:
# Several conditions at once: each comparison sits in its own parentheses
# and the element-wise `&` operator requires all of them to hold.
bios[
    (bios["height_cm"] > 150)
    & (bios["born_country"] == "IND")
    & (bios["born_region"] == "Andhra Pradesh")
]

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
19014,19147,Joe Antic,1931-03-13,Secunderabad,Andhra Pradesh,IND,India,168.0,59.0,2016-07-12
19085,19219,Mukesh Kumar Nandanoori,1970-04-16,Hyderabad,Andhra Pradesh,IND,India,175.0,64.0,
25607,25799,Tulasidas Balaram,1936-11-30,Secunderabad,Andhra Pradesh,IND,India,171.0,56.0,2023-02-16
25617,25809,Dharmalingam Kannan,1936-07-08,Secunderabad,Andhra Pradesh,IND,India,163.0,54.0,2006-05-19
25620,25812,Youssef Khan,1937-08-05,Hyderabad,Andhra Pradesh,IND,India,172.0,56.0,2006-07-01
25642,25834,Peter Thangaraj,1935-12-24,Hyderabad,Andhra Pradesh,IND,India,190.0,85.0,2008-11-24
46816,47165,Sheila Watt,1941-01-27,Pitapuram,Andhra Pradesh,IND,Great Britain,164.0,64.0,2023-03-10
60816,61255,Ahmed Abdul Basith,1942-01-09,Hyderabad,Andhra Pradesh,IND,India,178.0,75.0,2021-01-01
106400,107497,Sharath Kamal Achanta,1982-07-12,Machilipatnam,Andhra Pradesh,IND,India,186.0,85.0,
107178,108301,Sathi Geetha,1983-07-05,Martur,Andhra Pradesh,IND,India,156.0,52.0,


In [156]:
# Filter by a string condition: case-insensitive match of "Sindhu" anywhere in the name.
# na=False makes rows with a missing name evaluate to False; without it the mask
# could contain NaN, which boolean indexing refuses to use.
bios[bios["name"].str.contains("Sindhu", case=False, na=False)]

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
19492,19631,Tara Singh Sindhu,1943-07-19,,,,Malaysia,,,
128046,130550,P. V. Sindhu,1995-07-05,Hyderabad,Andhra Pradesh,IND,India,179.0,65.0,


In [158]:
# Regex syntax: the pattern is a regular expression, so "|" means "either alternative".
# Note that the second alternative ends with a space ("Sumeeth ").
# na=False makes rows with a missing name evaluate to False; without it the mask
# could contain NaN, which boolean indexing refuses to use.
bios[bios['name'].str.contains('Sindhu|Sumeeth ', case=False, na=False)]

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
19492,19631,Tara Singh Sindhu,1943-07-19,,,,Malaysia,,,
128046,130550,P. V. Sindhu,1995-07-05,Hyderabad,Andhra Pradesh,IND,India,179.0,65.0,
131620,134434,B. Sumeeth Reddy,1991-09-26,Gungal,Andhra Pradesh,IND,India,182.0,64.0,


In [160]:
# isin + startswith: born in the USA or France, and a name that begins with "Joe".
# startswith is a prefix test, so names such as "Joey ..." are matched as well.
bios[
    bios["born_country"].isin(["USA", "FRA"])
    & bios["name"].str.startswith("Joe")
]

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
2741,2753,Joe Cunningham,1867-02-26,Aberdeen,Mississippi,USA,United States,,,1951-07-27
5791,5814,Joe Caldwell,1941-11-01,Texas City,Texas,USA,United States,196.0,89.0,
6101,6128,Joe Fortenberry,1911-04-01,Slidell,Texas,USA,United States,203.0,84.0,1993-06-03
6379,6410,Joe Kleine,1962-01-04,Colorado Springs,Colorado,USA,United States,211.0,122.0,
8624,8671,Joe Frazier,1944-01-12,Beaufort,South Carolina,USA,United States,183.0,89.0,2011-11-07
...,...,...,...,...,...,...,...,...,...,...
133440,136379,Joe Maloy,1985-12-20,Somers Point,New Jersey,USA,United States,175.0,65.0,
135381,138595,Joey Steggall,2003-07-17,Annecy,Haute-Savoie,FRA,Australia,,,
143679,147333,Joe Klecker,1996-11-16,Minneapolis,Minnesota,USA,United States,183.0,,
143731,147386,Joe Speer Ryan,1996-06-05,San Francisco,California,USA,United States,188.0,,


##### Filtering with the `query` method

In [163]:
# query() takes the filter as a string expression: column names are used bare,
# string values are quoted, and conditions are combined with `and` / `or`.
bios.query('born_country=="IND" and born_city=="Hyderabad"')

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
281,282,S. M. Hadi,1899-08-12,Hyderabad,Andhra Pradesh,IND,India,,,1971-07-14
19085,19219,Mukesh Kumar Nandanoori,1970-04-16,Hyderabad,Andhra Pradesh,IND,India,175.0,64.0,
25620,25812,Youssef Khan,1937-08-05,Hyderabad,Andhra Pradesh,IND,India,172.0,56.0,2006-07-01
25642,25834,Peter Thangaraj,1935-12-24,Hyderabad,Andhra Pradesh,IND,India,190.0,85.0,2008-11-24
60816,61255,Ahmed Abdul Basith,1942-01-09,Hyderabad,Andhra Pradesh,IND,India,178.0,75.0,2021-01-01
116228,117613,Sania Mirza,1986-11-15,Hyderabad,Andhra Pradesh,IND,India,173.0,57.0,
118538,120264,Rushmi Chakravarthi,1977-10-09,Hyderabad,Andhra Pradesh,IND,India,170.0,65.0,
123480,125680,Parupalli Kashyap,1986-09-08,Hyderabad,Andhra Pradesh,IND,India,173.0,,
128046,130550,P. V. Sindhu,1995-07-05,Hyderabad,Andhra Pradesh,IND,India,179.0,65.0,
131636,134452,Kynan Chenai,1991-01-29,Hyderabad,Andhra Pradesh,IND,India,200.0,85.0,
