## Chapter 5 - Introduction

In [1]:
import pandas as pd

In [2]:
# Sample integer data: the whole numbers 1 through 5
numbers = list(range(1, 6))

In [3]:
pd.Series(numbers)

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [4]:
# Sample string data: one single-character string per letter
letters = ["a", "b", "c", "d", "e"]
letters

['a', 'b', 'c', 'd', 'e']

In [5]:
pd.Series(data=letters, index=numbers)

1    a
2    b
3    c
4    d
5    e
dtype: object

In [6]:
pd.Series(data=numbers, index=letters)

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [7]:
# Index labels can be arbitrary strings: letter1 .. letter5
pd.Series(data=letters, index=[f"letter{position}" for position in range(1, 6)])

letter1    a
letter2    b
letter3    c
letter4    d
letter5    e
dtype: object

In [8]:
# Source dict for a Series: the keys become the index labels, the values become the data
data = dict(name="Luigi", age=26, work="Unemployed")

In [9]:
pd.Series(data=data)

name         Luigi
age             26
work    Unemployed
dtype: object

In [10]:
# Alternate index: only the listed labels are kept. "work" is not listed, so it is
# dropped; "job" is not a key of the dict, so its value is NaN
pd.Series(data=data, index=["name", "age", "job"])

name    Luigi
age        26
job       NaN
dtype: object

## Chapter 6 - Attributes

In [11]:
import pandas as pd

In [12]:
# Sample data for this chapter: five integers and five single-character strings
numbers = list(range(1, 6))
letters = ["a", "b", "c", "d", "e"]

In [13]:
number_series = pd.Series(numbers)
number_series

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [14]:
letter_series = pd.Series(letters)
letter_series

0    a
1    b
2    c
3    d
4    e
dtype: object

In [15]:
number_series.dtype

dtype('int64')

In [16]:
letter_series.dtype

dtype('O')

In [17]:
letter_series.values

array(['a', 'b', 'c', 'd', 'e'], dtype=object)

In [18]:
number_series.index

RangeIndex(start=0, stop=5, step=1)

In [19]:
# Missing data: a None entry counts as a missing value (see hasnans below)
incomplete_series = pd.Series(["a", "b", None], index=[1, 2, 3])
incomplete_series

1       a
2       b
3    None
dtype: object

In [20]:
incomplete_series.hasnans

True

In [21]:
# Shape: a tuple with one entry per dimension.
# A Series is one-dimensional, so its shape is a one-element tuple: (length,)
# A DataFrame's shape is also a single tuple, but with two entries: (rows, columns)
number_series.shape

(5,)

In [22]:
# Size: length of the series
number_series.size

5

## Chapter 7 - Methods

In [23]:
import pandas as pd

In [24]:
# Float data with one gap: the None entry is stored as NaN
values = pd.Series(
    [1.5, 2.5, 0.5, None, 4.12345, 1.0],
)

In [25]:
values

0    1.50000
1    2.50000
2    0.50000
3        NaN
4    4.12345
5    1.00000
dtype: float64

In [26]:
values.max()

np.float64(4.12345)

In [27]:
values.min()

np.float64(0.5)

In [28]:
values.sum()

np.float64(9.62345)

In [29]:
values.mean()

np.float64(1.92469)

In [30]:
values.idxmax(), values.idxmin()

(4, 2)

In [31]:
values.isnull()

0    False
1    False
2    False
3     True
4    False
5    False
dtype: bool

In [32]:
# With no arguments, round() rounds to whole numbers. Exact halves go to the
# nearest even value (1.5 -> 2.0, 2.5 -> 2.0, 0.5 -> 0.0), and NaN stays NaN
values.round()

0    2.0
1    2.0
2    0.0
3    NaN
4    4.0
5    1.0
dtype: float64

In [33]:
values.round(decimals=2)

0    1.50
1    2.50
2    0.50
3     NaN
4    4.12
5    1.00
dtype: float64

## Chapter 8 - Handling CSV Files

Data from https://www.kaggle.com/datasets/tarundalal/100-richest-people-in-world

In [34]:
import pandas as pd

In [35]:
# read_csv() always returns a DataFrame, even when only one column is loaded.
# squeeze("columns") collapses that one-column DataFrame into a Series. Naming the
# axis guarantees a Series even if the file held a single row; a bare squeeze()
# would collapse that case all the way down to a scalar.
richest = pd.read_csv("TopRichestInWorld.csv", usecols=["Name"]).squeeze("columns")
richest

0                     Elon Musk
1                    Jeff Bezos
2      Bernard Arnault & family
3                    Bill Gates
4                Warren Buffett
                 ...           
96             Vladimir Potanin
97         Harold Hamm & family
98                 Sun Piaoyang
99           Luo Liguo & family
100                   Peter Woo
Name: Name, Length: 101, dtype: object

In [36]:
type(richest)

pandas.core.series.Series

In [37]:
# Save in CSV format (despite the .txt extension);
# index=False leaves the row labels out of the file
richest.to_csv("names.txt", index=False)

In [38]:
# Load it back
people = pd.read_csv("names.txt")
people

Unnamed: 0,Name
0,Elon Musk
1,Jeff Bezos
2,Bernard Arnault & family
3,Bill Gates
4,Warren Buffett
...,...
96,Vladimir Potanin
97,Harold Hamm & family
98,Sun Piaoyang
99,Luo Liguo & family


In [39]:
# It should be a DataFrame
type(people)

pandas.core.frame.DataFrame

In [40]:
# Read the CSV, then squeeze("columns") to turn the one-column DataFrame back into
# a Series. Naming the axis keeps the result a Series whatever the row count;
# a bare squeeze() would return a scalar for a one-row file.
people = pd.read_csv("names.txt").squeeze("columns")
people

0                     Elon Musk
1                    Jeff Bezos
2      Bernard Arnault & family
3                    Bill Gates
4                Warren Buffett
                 ...           
96             Vladimir Potanin
97         Harold Hamm & family
98                 Sun Piaoyang
99           Luo Liguo & family
100                   Peter Woo
Name: Name, Length: 101, dtype: object

In [41]:
type(people)

pandas.core.series.Series

## Chapter 9 - head() and tail()

In [42]:
import pandas as pd

In [43]:
richest = pd.read_csv("TopRichestInWorld.csv")
richest

Unnamed: 0,Name,NetWorth,Age,Country/Territory,Source,Industry
0,Elon Musk,"$219,000,000,000",50,United States,"Tesla, SpaceX",Automotive
1,Jeff Bezos,"$171,000,000,000",58,United States,Amazon,Technology
2,Bernard Arnault & family,"$158,000,000,000",73,France,LVMH,Fashion & Retail
3,Bill Gates,"$129,000,000,000",66,United States,Microsoft,Technology
4,Warren Buffett,"$118,000,000,000",91,United States,Berkshire Hathaway,Finance & Investments
...,...,...,...,...,...,...
96,Vladimir Potanin,"$17,300,000,000",61,Russia,metals,Metals & Mining
97,Harold Hamm & family,"$17,200,000,000",76,United States,oil & gas,Energy
98,Sun Piaoyang,"$17,100,000,000",63,China,pharmaceuticals,Healthcare
99,Luo Liguo & family,"$17,000,000,000",66,China,chemicals,Manufacturing


In [44]:
# First 10
richest.head(10)

Unnamed: 0,Name,NetWorth,Age,Country/Territory,Source,Industry
0,Elon Musk,"$219,000,000,000",50,United States,"Tesla, SpaceX",Automotive
1,Jeff Bezos,"$171,000,000,000",58,United States,Amazon,Technology
2,Bernard Arnault & family,"$158,000,000,000",73,France,LVMH,Fashion & Retail
3,Bill Gates,"$129,000,000,000",66,United States,Microsoft,Technology
4,Warren Buffett,"$118,000,000,000",91,United States,Berkshire Hathaway,Finance & Investments
5,Larry Page,"$111,000,000,000",49,United States,Google,Technology
6,Sergey Brin,"$107,000,000,000",48,United States,Google,Technology
7,Larry Ellison,"$106,000,000,000",77,United States,software,Technology
8,Steve Ballmer,"$91,400,000,000",66,United States,Microsoft,Technology
9,Mukesh Ambani,"$90,700,000,000",64,India,diversified,Diversified


In [45]:
# A negative n means "every row except the last |n|": here, all but the last 10
richest.head(-10)

Unnamed: 0,Name,NetWorth,Age,Country/Territory,Source,Industry
0,Elon Musk,"$219,000,000,000",50,United States,"Tesla, SpaceX",Automotive
1,Jeff Bezos,"$171,000,000,000",58,United States,Amazon,Technology
2,Bernard Arnault & family,"$158,000,000,000",73,France,LVMH,Fashion & Retail
3,Bill Gates,"$129,000,000,000",66,United States,Microsoft,Technology
4,Warren Buffett,"$118,000,000,000",91,United States,Berkshire Hathaway,Finance & Investments
...,...,...,...,...,...,...
86,Vladimir Lisin,"$18,400,000,000",65,Russia,"steel, transport",Metals & Mining
87,Fan Hongwei & family,"$18,200,000,000",55,China,petrochemicals,Energy
88,Lakshmi Mittal,"$17,900,000,000",71,India,steel,Metals & Mining
89,Andrew Forrest,"$17,800,000,000",60,Australia,mining,Metals & Mining


In [46]:
# Last 10
richest.tail(10)

Unnamed: 0,Name,NetWorth,Age,Country/Territory,Source,Industry
91,Savitri Jindal & family,"$17,700,000,000",72,India,steel,Metals & Mining
92,Wang Wenyin,"$17,700,000,000",54,China,"mining, copper products",Metals & Mining
93,Li Xiting,"$17,600,000,000",71,Singapore,medical devices,Healthcare
94,Stefan Persson,"$17,600,000,000",74,Sweden,H&M,Fashion & Retail
95,Steve Cohen,"$17,400,000,000",65,United States,hedge funds,Finance & Investments
96,Vladimir Potanin,"$17,300,000,000",61,Russia,metals,Metals & Mining
97,Harold Hamm & family,"$17,200,000,000",76,United States,oil & gas,Energy
98,Sun Piaoyang,"$17,100,000,000",63,China,pharmaceuticals,Healthcare
99,Luo Liguo & family,"$17,000,000,000",66,China,chemicals,Manufacturing
100,Peter Woo,"$17,000,000,000",75,Hong Kong,real estate,Real Estate


In [47]:
# A negative n means "every row except the first |n|": here, all but the first 10
richest.tail(-10)

Unnamed: 0,Name,NetWorth,Age,Country/Territory,Source,Industry
10,Gautam Adani & family,"$90,000,000,000",59,India,"infrastructure, commodities",Diversified
11,Michael Bloomberg,"$82,000,000,000",80,United States,Bloomberg LP,Media & Entertainment
12,Carlos Slim Helu & family,"$81,200,000,000",82,Mexico,telecom,Telecom
13,Francoise Bettencourt Meyers & family,"$74,800,000,000",68,France,L'Oréal,Fashion & Retail
14,Mark Zuckerberg,"$67,300,000,000",37,United States,Facebook,Technology
...,...,...,...,...,...,...
96,Vladimir Potanin,"$17,300,000,000",61,Russia,metals,Metals & Mining
97,Harold Hamm & family,"$17,200,000,000",76,United States,oil & gas,Energy
98,Sun Piaoyang,"$17,100,000,000",63,China,pharmaceuticals,Healthcare
99,Luo Liguo & family,"$17,000,000,000",66,China,chemicals,Manufacturing


## Chapter 10 - Sorting values in Series

In [48]:
import pandas as pd

In [49]:
# Load only the Name column. squeeze("columns") turns the one-column DataFrame into
# a Series and, unlike a bare squeeze(), never collapses a one-row file to a scalar.
richest = pd.read_csv("TopRichestInWorld.csv", usecols=["Name"]).squeeze("columns")
richest

0                     Elon Musk
1                    Jeff Bezos
2      Bernard Arnault & family
3                    Bill Gates
4                Warren Buffett
                 ...           
96             Vladimir Potanin
97         Harold Hamm & family
98                 Sun Piaoyang
99           Luo Liguo & family
100                   Peter Woo
Name: Name, Length: 101, dtype: object

In [50]:
type(richest)

pandas.core.series.Series

In [51]:
# Sort the names alphabetically. The arguments are written out for reference;
# they are the documented defaults of sort_values().
# Note the index: each name keeps its original label, so the labels are now out of order
richest.sort_values(
    axis=0,
    ascending=True,
    kind='quicksort',
    na_position='last',
)

74         Abigail Johnson
42        Alain Wertheimer
17            Alice Walton
22          Amancio Ortega
89          Andrew Forrest
              ...         
4           Warren Buffett
54        William Lei Ding
85    Yang Huiyan & family
24            Zhang Yiming
16          Zhong Shanshan
Name: Name, Length: 101, dtype: object

In [52]:
# Fix up the index: ignore_index=True relabels the sorted result 0, 1, 2, ...
# instead of carrying the original labels along
richest.sort_values(
    axis=0,
    ascending=True,
    kind='quicksort',
    na_position='last',
    ignore_index=True,
)

0           Abigail Johnson
1          Alain Wertheimer
2              Alice Walton
3            Amancio Ortega
4            Andrew Forrest
               ...         
96           Warren Buffett
97         William Lei Ding
98     Yang Huiyan & family
99             Zhang Yiming
100          Zhong Shanshan
Name: Name, Length: 101, dtype: object

In [53]:
# Note that the data is not altered
richest

0                     Elon Musk
1                    Jeff Bezos
2      Bernard Arnault & family
3                    Bill Gates
4                Warren Buffett
                 ...           
96             Vladimir Potanin
97         Harold Hamm & family
98                 Sun Piaoyang
99           Luo Liguo & family
100                   Peter Woo
Name: Name, Length: 101, dtype: object

In [54]:
# Try to alter the data in place. pandas refuses with a ValueError because it
# considers this Series a view of other data (it came from squeeze() on the
# DataFrame returned by read_csv()). The error is caught and displayed so that
# "Restart & Run All" does not stop here; `richest` is left unchanged.
# To actually replace the data, reassign the sorted result instead:
#     richest = richest.sort_values(ignore_index=True)
try:
    richest.sort_values(
        axis=0,
        ascending=True,
        inplace=True,
        kind='quicksort',
        na_position='last',
        ignore_index=True,
    )
except ValueError as error:
    print(f"ValueError: {error}")

ValueError: This Series is a view of some other array, to sort in-place you must create a copy

In [55]:
richest

0                     Elon Musk
1                    Jeff Bezos
2      Bernard Arnault & family
3                    Bill Gates
4                Warren Buffett
                 ...           
96             Vladimir Potanin
97         Harold Hamm & family
98                 Sun Piaoyang
99           Luo Liguo & family
100                   Peter Woo
Name: Name, Length: 101, dtype: object

In [56]:
# Sort by index label rather than by value, highest label first
richest.sort_index(ascending=False)

100                   Peter Woo
99           Luo Liguo & family
98                 Sun Piaoyang
97         Harold Hamm & family
96             Vladimir Potanin
                 ...           
4                Warren Buffett
3                    Bill Gates
2      Bernard Arnault & family
1                    Jeff Bezos
0                     Elon Musk
Name: Name, Length: 101, dtype: object

## Chapter 11 - Counting values in a Series

In [57]:
import pandas as pd

In [58]:
# Load only the Industry column. squeeze("columns") turns the one-column DataFrame
# into a Series and, unlike a bare squeeze(), never collapses a one-row file to a scalar.
richest = pd.read_csv("TopRichestInWorld.csv", usecols=["Industry"]).squeeze("columns")
richest

0                 Automotive
1                 Technology
2           Fashion & Retail
3                 Technology
4      Finance & Investments
               ...          
96           Metals & Mining
97                    Energy
98                Healthcare
99             Manufacturing
100              Real Estate
Name: Industry, Length: 101, dtype: object

In [59]:
richest.value_counts()

Industry
Fashion & Retail         18
Technology               15
Finance & Investments    13
Metals & Mining           9
Food & Beverage           8
Automotive                7
Diversified               6
Manufacturing             5
Healthcare                5
Media & Entertainment     4
Real Estate               3
Telecom                   2
Logistics                 2
Energy                    2
Gambling & Casinos        1
Service                   1
Name: count, dtype: int64

In [60]:
# normalize=True returns each value's share of the total (a fraction between 0 and 1)
# instead of a raw count; multiply by 100 to read it as a percentage
richest.value_counts(normalize=True)

Industry
Fashion & Retail         0.178218
Technology               0.148515
Finance & Investments    0.128713
Metals & Mining          0.089109
Food & Beverage          0.079208
Automotive               0.069307
Diversified              0.059406
Manufacturing            0.049505
Healthcare               0.049505
Media & Entertainment    0.039604
Real Estate              0.029703
Telecom                  0.019802
Logistics                0.019802
Energy                   0.019802
Gambling & Casinos       0.009901
Service                  0.009901
Name: proportion, dtype: float64

In [61]:
# By default, value_counts() sorts by frequency (most common first);
# sort=False turns that sorting off
richest.value_counts(sort=False)

Industry
Automotive                7
Technology               15
Fashion & Retail         18
Finance & Investments    13
Diversified               6
Media & Entertainment     4
Telecom                   2
Food & Beverage           8
Logistics                 2
Real Estate               3
Metals & Mining           9
Manufacturing             5
Gambling & Casinos        1
Healthcare                5
Service                   1
Energy                    2
Name: count, dtype: int64

In [62]:
# Load only the Age column. squeeze("columns") turns the one-column DataFrame into
# a Series and, unlike a bare squeeze(), never collapses a one-row file to a scalar.
# Note: from here on `richest` holds ages, not industries.
richest = pd.read_csv("TopRichestInWorld.csv", usecols=["Age"]).squeeze("columns")
richest

0      50
1      58
2      73
3      66
4      91
       ..
96     61
97     76
98     63
99     66
100    75
Name: Age, Length: 101, dtype: int64

In [63]:
# bins=5 splits the ages into 5 equal-width intervals and counts how many fall in each
richest.value_counts(bins=5)

(55.6, 68.4]      33
(68.4, 81.2]      27
(81.2, 94.0]      19
(42.8, 55.6]      17
(29.935, 42.8]     5
Name: count, dtype: int64

## Chapter 12 - Accessing elements via position

In [64]:
import pandas as pd

# Load only the Name column. squeeze("columns") turns the one-column DataFrame into
# a Series and, unlike a bare squeeze(), never collapses a one-row file to a scalar.
richest = pd.read_csv("TopRichestInWorld.csv", usecols=["Name"]).squeeze("columns")
richest

0                     Elon Musk
1                    Jeff Bezos
2      Bernard Arnault & family
3                    Bill Gates
4                Warren Buffett
                 ...           
96             Vladimir Potanin
97         Harold Hamm & family
98                 Sun Piaoyang
99           Luo Liguo & family
100                   Peter Woo
Name: Name, Length: 101, dtype: object

In [65]:
richest.head(15)

0                                 Elon Musk
1                                Jeff Bezos
2                  Bernard Arnault & family
3                                Bill Gates
4                            Warren Buffett
5                                Larry Page
6                               Sergey Brin
7                             Larry Ellison
8                             Steve Ballmer
9                             Mukesh Ambani
10                    Gautam Adani & family
11                        Michael Bloomberg
12                Carlos Slim Helu & family
13    Francoise Bettencourt Meyers & family
14                          Mark Zuckerberg
Name: Name, dtype: object

In [66]:
# First element. With an integer index, [] looks the value up by label;
# label 0 is also the first position here because the index is the default 0..100
richest[0]

'Elon Musk'

In [67]:
# Out of range: there is no label 1_000_000 in the index, so the lookup raises KeyError
try:
    richest[1_000_000]
except KeyError as error:
    print(f"Error: {error}")

Error: 1000000


In [68]:
# Note: unlike a Python list, [-1] does not mean "the last element" here.
# It is treated as the label -1, which is not in the index, so KeyError is raised
try:
    richest[-1]
except KeyError as error:
    print("ERROR")
    print(error)

ERROR
-1


In [69]:
# A slice is positional, so [-1:] reaches the last entry; the result is a
# one-element Series rather than the bare value (richest.iloc[-1] gives the value itself)
richest[-1:]

100    Peter Woo
Name: Name, dtype: object

In [70]:
# Multiple indices
richest[[1, 2, 3]]

1                  Jeff Bezos
2    Bernard Arnault & family
3                  Bill Gates
Name: Name, dtype: object

In [71]:
# Slices: integer slices are positional and exclude the stop (positions 1 to 4 here)
richest[1:5]

1                  Jeff Bezos
2    Bernard Arnault & family
3                  Bill Gates
4              Warren Buffett
Name: Name, dtype: object

In [72]:
richest.index

RangeIndex(start=0, stop=101, step=1)

In [73]:
richest.size

101

## Chapter 13 - Accessing elements via index

In [74]:
import pandas as pd

# Load Name and NetWorth, using Name as the index. That leaves a single data column,
# which squeeze("columns") turns into a Series. Unlike a bare squeeze(), it never
# collapses a one-row file to a scalar.
richest = pd.read_csv(
    "TopRichestInWorld.csv",
    usecols=["Name", "NetWorth"],
    index_col="Name",
).squeeze("columns")
richest

Name
Elon Musk                   $219,000,000,000
Jeff Bezos                  $171,000,000,000
Bernard Arnault & family    $158,000,000,000
Bill Gates                  $129,000,000,000
Warren Buffett              $118,000,000,000
                                  ...       
Vladimir Potanin             $17,300,000,000
Harold Hamm & family         $17,200,000,000
Sun Piaoyang                 $17,100,000,000
Luo Liguo & family           $17,000,000,000
Peter Woo                    $17,000,000,000
Name: NetWorth, Length: 101, dtype: object

In [75]:
# Positional access still works through .iloc, even though the index now holds names
richest.iloc[0]

'$219,000,000,000'

In [76]:
# Access networth by index, which is the name
richest["Bill Gates"]

'$129,000,000,000'

In [77]:
richest[["Bill Gates", "Jeff Bezos", "Warren Buffett"]]

Name
Bill Gates        $129,000,000,000
Jeff Bezos        $171,000,000,000
Warren Buffett    $118,000,000,000
Name: NetWorth, dtype: object

In [78]:
# Slices by label: from the first row up to and including "Steve Ballmer"
richest[:"Steve Ballmer"]

Name
Elon Musk                   $219,000,000,000
Jeff Bezos                  $171,000,000,000
Bernard Arnault & family    $158,000,000,000
Bill Gates                  $129,000,000,000
Warren Buffett              $118,000,000,000
Larry Page                  $111,000,000,000
Sergey Brin                 $107,000,000,000
Larry Ellison               $106,000,000,000
Steve Ballmer                $91,400,000,000
Name: NetWorth, dtype: object

In [79]:
# Slices are inclusive
richest["Bill Gates":"Steve Ballmer"]

Name
Bill Gates        $129,000,000,000
Warren Buffett    $118,000,000,000
Larry Page        $111,000,000,000
Sergey Brin       $107,000,000,000
Larry Ellison     $106,000,000,000
Steve Ballmer      $91,400,000,000
Name: NetWorth, dtype: object

In [80]:
richest.index

Index(['Elon Musk', 'Jeff Bezos', 'Bernard Arnault & family', 'Bill Gates',
       'Warren Buffett', 'Larry Page', 'Sergey Brin', 'Larry Ellison',
       'Steve Ballmer', 'Mukesh Ambani',
       ...
       'Savitri Jindal & family', 'Wang Wenyin', 'Li Xiting', 'Stefan Persson',
       'Steve Cohen', 'Vladimir Potanin', 'Harold Hamm & family',
       'Sun Piaoyang', 'Luo Liguo & family', 'Peter Woo'],
      dtype='object', name='Name', length=101)

## Chapter 14 - Homework

### Import pandas with its alias

In [81]:
import pandas as pd

### Load the TopRichestInWorld.csv file, using Name as the index and Age as the values

In [82]:
# Ages indexed by name. With Name as the index, Age is the only data column, and
# squeeze("columns") turns it into a Series. Unlike a bare squeeze(), it never
# collapses a one-row file to a scalar.
s = pd.read_csv(
    "TopRichestInWorld.csv",
    usecols=["Name", "Age"],
    index_col="Name",
).squeeze("columns")
s

Name
Elon Musk                   50
Jeff Bezos                  58
Bernard Arnault & family    73
Bill Gates                  66
Warren Buffett              91
                            ..
Vladimir Potanin            61
Harold Hamm & family        76
Sun Piaoyang                63
Luo Liguo & family          66
Peter Woo                   75
Name: Age, Length: 101, dtype: int64

### Which person is the oldest, how old is he/she?

In [83]:
print(f"The oldest is {s.idxmax()} at {s.max()}")

The oldest is Lee Shau Kee at 94


### Sort from oldest to youngest to verify

In [84]:
s.sort_values(ascending=False).head(10)

Name
Lee Shau Kee                     94
Li Ka-shing                      93
Warren Buffett                   91
Rupert Murdoch & family          91
Leonard Lauder                   89
Leonardo Del Vecchio & family    86
Charles Koch                     86
John Mars                        86
Reinhold Wuerth & family         86
Amancio Ortega                   86
Name: Age, dtype: int64

### Which person is the youngest?

In [85]:
print(f"The youngest is {s.idxmin()} at {s.min()}")

The youngest is Sam Bankman-Fried at 30


### Sort to verify

In [86]:
s.sort_values()

Name
Sam Bankman-Fried          30
Mark Zuckerberg            37
Zhang Yiming               38
Guillaume Pousaz           40
Yang Huiyan & family       40
                           ..
Leonard Lauder             89
Rupert Murdoch & family    91
Warren Buffett             91
Li Ka-shing                93
Lee Shau Kee               94
Name: Age, Length: 101, dtype: int64

### Average age

In [87]:
# Format the mean with a format spec instead of calling .round() on the scalar:
# it works for any float type and always prints exactly two decimals
print(f"Average age: {s.mean():.2f}")

Average age: 67.12


### Save as CSV

In [88]:
s.to_csv("NamesAndAges.csv")

### Read It Back

In [93]:
# Read the saved file back with Name restored as the index (same index_col form as
# the original load); squeeze("columns") collapses the remaining Age column into a
# Series and never into a scalar.
s2 = pd.read_csv("NamesAndAges.csv", index_col="Name").squeeze("columns")

In [94]:
type(s2)

pandas.core.series.Series

In [95]:
s2

Name
Elon Musk                   50
Jeff Bezos                  58
Bernard Arnault & family    73
Bill Gates                  66
Warren Buffett              91
                            ..
Vladimir Potanin            61
Harold Hamm & family        76
Sun Piaoyang                63
Luo Liguo & family          66
Peter Woo                   75
Name: Age, Length: 101, dtype: int64