In [2]:
import numpy as np
import pandas as pd

## Series

In [2]:
# Ages of five employees: first a plain Python list, then wrapped in a Series.
# No index is passed, so the Series gets the default integer index 0..4.
age_lst = [25, 27, 24, 28, 30]
age_ser = pd.Series(data=age_lst)
age_ser

0    25
1    27
2    24
3    28
4    30
dtype: int64

In [3]:
# Build the Series from a list of values plus an explicit list of labels.
# The last value is np.nan (missing age for em7), which makes the dtype float64.
emp_labels = [f'em{i}' for i in range(1, 8)]
age_ser = pd.Series(data=[25, 27, 24, 28, 25, 27, np.nan], index=emp_labels)
age_ser

em1    25.0
em2    27.0
em3    24.0
em4    28.0
em5    25.0
em6    27.0
em7     NaN
dtype: float64

In [4]:
# Most frequent value(s): 25 and 27 both occur twice, so both are returned
age_ser.mode()

0    25.0
1    27.0
dtype: float64

In [5]:
# Occurrences of each distinct age, most frequent first; the NaN entry is not counted
age_ser.value_counts()

25.0    2
27.0    2
24.0    1
28.0    1
dtype: int64

## Properties

In [6]:
age_ser.size # gives the number of elements in the series

7

In [7]:
age_ser.shape # tuple of dimension sizes; (7,) means a single dimension holding 7 elements

(7,)

In [8]:
age_ser.index # specifies the indexes of the series

Index(['em1', 'em2', 'em3', 'em4', 'em5', 'em6', 'em7'], dtype='object')

In [9]:
age_ser.values # show the values of the series

array([25., 27., 24., 28., 25., 27., nan])

In [10]:
age_ser.dtype # specifies the data type of the elements in the series

dtype('float64')

## Methods

In [11]:
# Summary of the Series: index range, non-null count, dtype and memory usage
age_ser.info()

<class 'pandas.core.series.Series'>
Index: 7 entries, em1 to em7
Series name: None
Non-Null Count  Dtype  
--------------  -----  
6 non-null      float64
dtypes: float64(1)
memory usage: 412.0+ bytes


In [12]:
age_ser.head() # 5 first elements

em1    25.0
em2    27.0
em3    24.0
em4    28.0
em5    25.0
dtype: float64

In [13]:
age_ser.tail() # 5 last elements

em3    24.0
em4    28.0
em5    25.0
em6    27.0
em7     NaN
dtype: float64

In [14]:
age_ser.head(3) # 3 first elements

em1    25.0
em2    27.0
em3    24.0
dtype: float64

In [15]:
age_ser.tail(2) # 2 last elements

em6    27.0
em7     NaN
dtype: float64

In [16]:
age_ser.describe() # General statistics

count     6.000000
mean     26.000000
std       1.549193
min      24.000000
25%      25.000000
50%      26.000000
75%      27.000000
max      28.000000
dtype: float64

In [17]:
age_ser.std()

1.5491933384829668

In [18]:
age_ser.mean()

26.0

## Retrieve
#### Retrieve 1 element

In [None]:
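# label => explicit index (the labels supplied when the Series is created)
# index => implicit positional index, usable in 2 forms: from the front (0, 1, ...) or from the end (-1, -2, ...)
# series_name.loc[label]
# series_name.iloc[index]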

In [19]:
# Re-create age_ser with integer ages. The labels are not in alphabetical
# order, so a label (e.g. 'em1') and its position (2) do not line up.
age_by_emp = {
    'em3': 25, 'em2': 27, 'em1': 24, 'em4': 28,
    'em7': 30, 'em6': 27, 'em5': 24,
}
age_ser = pd.Series(age_by_emp)
age_ser

em3    25
em2    27
em1    24
em4    28
em7    30
em6    27
em5    24
dtype: int64

In [20]:
# age of em3
age_ser.loc['em3']

25

In [21]:
age_ser.iloc[2] # element at position 2 (the third element, label em1)

24

In [22]:
age_ser.iloc[-5] # 5th element counted from the end: the same element as position 2 (em1)

24

In [23]:
# age of last emp
age_ser.iloc[-1]

24

In [26]:
# age of 1st emp 
age_ser.iloc[0]

25

In [25]:
# age of index 3
age_ser.iloc[3]

28

In [None]:
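# Access one element directly with []
# series_name[label]
# series_name[index] => positional access through [] only applies when the labels are strings (not integers); prefer .iloc for positions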

In [27]:
age_ser

em3    25
em2    27
em1    24
em4    28
em7    30
em6    27
em5    24
dtype: int64

In [28]:
# age of employee em1
age_ser['em1']

24

In [29]:
# age of employee em4, fetched by position: -4 is the 4th element from the end.
# .iloc is used because a bare integer inside [] on a string-labelled Series
# relies on a positional fallback that is deprecated since pandas 2.1 and is
# treated as a (missing) label in pandas 3.
age_ser.iloc[-4]

28

In [30]:
# note for ser with label as number (other than zero_based index):
# the labels here are the integers 1..4, so a label no longer equals the
# element's position (label 1 sits at position 0). Use .loc for labels and
# .iloc for positions to keep the two unambiguous.
lst = ['a','b','c','d']
s1 = pd.Series(lst,index=[1,2,3,4])
s1

1    a
2    b
3    c
4    d
dtype: object

#### Retrieve multiple elements

In [32]:
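# series_name.loc[label list]
# series_name.loc[start : stop]  (the stop label is included)
# series_name.iloc[start : stop : step]  => regular pattern; the stop position is excluded (last element taken is stop-1)
# series_name.iloc[index list]  => arbitrary positions, no pattern required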

In [31]:
age_ser

em3    25
em2    27
em1    24
em4    28
em7    30
em6    27
em5    24
dtype: int64

In [33]:
age_ser.loc[['em1','em3']] # age of em1, em3

em1    24
em3    25
dtype: int64

In [34]:
age_ser.loc['em1':'em7'] # ages from em1 to em7 following the index order (not alphabetical); the stop label em7 is included

em1    24
em4    28
em7    30
dtype: int64

In [35]:
age_ser.loc['em3':'em1'] # from em3 through em1 in index order; the stop label em1 is included

em3    25
em2    27
em1    24
dtype: int64

In [36]:
age_ser.loc['em1':'em6'] # from em1 through em6 in index order; the stop label em6 is included

em1    24
em4    28
em7    30
em6    27
dtype: int64

In [37]:
age_ser.iloc[1:3] # ages of the employees at positions 1 and 2 (stop position 3 is excluded)

em2    27
em1    24
dtype: int64

In [38]:
age_ser.iloc[[0,1,4]] # ages of the employees at positions 0, 1 and 4

em3    25
em2    27
em7    30
dtype: int64

In [39]:
# Access multiple elements directly with []
# series_name[label list]
# series_name[start:stop]
# series_name[index list]
# series_name[start:stop:step]

In [40]:
age_ser

em3    25
em2    27
em1    24
em4    28
em7    30
em6    27
em5    24
dtype: int64

In [41]:
age_ser[1:3] # an integer slice inside [] is positional here: positions 1 and 2, same result as .iloc[1:3]

em2    27
em1    24
dtype: int64

In [42]:
# Elements at positions 0, 2 and 3. .iloc is used because a list of integers
# inside [] on a string-labelled Series relies on a positional fallback that
# is deprecated since pandas 2.1 and is treated as labels in pandas 3.
age_ser.iloc[[0,2,3]]

em3    25
em1    24
em4    28
dtype: int64

In [43]:
age_ser[['em3','em4']]

em3    25
em4    28
dtype: int64

In [44]:
age_ser['em4':'em5'] # label slice inside []: from em4 through em5 in index order, stop label included

em4    28
em7    30
em6    27
em5    24
dtype: int64

#### Conditional access

In [45]:
age_ser

em3    25
em2    27
em1    24
em4    28
em7    30
em6    27
em5    24
dtype: int64

In [46]:
age_ser>25

em3    False
em2     True
em1    False
em4     True
em7     True
em6     True
em5    False
dtype: bool

In [47]:
age_ser[age_ser>25]

em2    27
em4    28
em7    30
em6    27
dtype: int64

In [48]:
# Age of employees > 25 and < 30 years old
age_ser[(age_ser>25) & (age_ser<30)] # and

em2    27
em4    28
em6    27
dtype: int64

In [49]:
# Age of employees under 25 or at least 30 years old (>= 30, so age 30 itself is included)
age_ser[(age_ser<25) | (age_ser>=30)] # or

em1    24
em7    30
em5    24
dtype: int64

In [50]:
age_ser[~(age_ser>25)] # not: employees whose age is NOT greater than 25 (i.e. <= 25)

em3    25
em1    24
em5    24
dtype: int64

## Duplicate data handling

In [51]:

# Employees' phone numbers, stored as strings (dtype object), which keeps the
# leading zero. '0914963258' appears four times, so the Series has duplicates.
phone_numbers = [
    '0912846759', '0914963258', '0978254361', '0335469512',
    '0914963258', '0914963258', '0914963258',
]
phone_ser = pd.Series(phone_numbers)

# Show the Series
phone_ser


0    0912846759
1    0914963258
2    0978254361
3    0335469512
4    0914963258
5    0914963258
6    0914963258
dtype: object

In [52]:

phone_ser.duplicated() # default keep=first

0    False
1    False
2    False
3    False
4     True
5     True
6     True
dtype: bool

In [53]:
phone_ser.duplicated(keep='last') # marks every occurrence except the last one as a duplicate

0    False
1     True
2    False
3    False
4     True
5     True
6    False
dtype: bool

In [54]:
phone_ser.duplicated(keep=False) # marks all occurrences of a repeated value as duplicates

0    False
1     True
2    False
3    False
4     True
5     True
6     True
dtype: bool

In [56]:
# Remove duplicates, keeping the first occurrence of each value (default keep).
# NOTE(review): inplace=True overwrites phone_ser itself, so the original
# 7-row Series is gone and the drop_duplicates cells further down operate on
# data that no longer contains any duplicates. Assigning the result to a new
# name instead would keep the original available for those demonstrations.
phone_ser.drop_duplicates(inplace=True)

In [57]:
phone_ser

0    0912846759
1    0914963258
2    0978254361
3    0335469512
dtype: object

In [58]:
# keep='last' retains the last occurrence of each value. phone_ser was already
# de-duplicated in place, so nothing is removed and the result equals phone_ser.
phone_ser.drop_duplicates(keep='last')

0    0912846759
1    0914963258
2    0978254361
3    0335469512
dtype: object

In [59]:
# keep=False deletes every occurrence of a repeated value (no copy is kept).
# phone_ser has no duplicates left at this point, so all 4 rows remain.
phone_ser.drop_duplicates(keep=False) # keep: first, last, False

0    0912846759
1    0914963258
2    0978254361
3    0335469512
dtype: object

In [60]:
phone_ser

0    0912846759
1    0914963258
2    0978254361
3    0335469512
dtype: object

In [61]:
# In-place form of drop_duplicates(keep=False); it changes nothing here
# because phone_ser is already free of duplicates.
phone_ser.drop_duplicates(keep=False,inplace=True)

In [62]:
phone_ser

0    0912846759
1    0914963258
2    0978254361
3    0335469512
dtype: object

In [3]:

# Series built directly from a Python list (default integer index)
age_lst = [25, 27, 24, 28, 30]
age_ser = pd.Series(data=age_lst)

# Show the result
age_ser


0    25
1    27
2    24
3    28
4    30
dtype: int64

In [4]:
# Series built from a NumPy array instead of a list (default integer index)
age_arr = np.array([25, 27, 24, 28, 30])
age_ser = pd.Series(data=age_arr)

# Show the result
age_ser

0    25
1    27
2    24
3    28
4    30
dtype: int64

In [5]:
# Same ages, this time labelled with the employees' names instead of 0..4
age_lst = [25, 27, 24, 28, 30]
name_labels = ['an', 'tuan', 'binh', 'lan', 'cuc']
age_ser = pd.Series(age_lst, index=name_labels)
age_ser

an      25
tuan    27
binh    24
lan     28
cuc     30
dtype: int64

In [6]:
# Build the Series in three explicit stages: Python list -> NumPy array -> Series
age_lst = [25, 27, 24, 28, 30]       # stage 1: plain list
age_arr = np.array(age_lst)          # stage 2: NumPy array made from the list
age_ser = pd.Series(data=age_arr)    # stage 3: Series wrapping the array
age_ser

0    25
1    27
2    24
3    28
4    30
dtype: int64

#### Index is an auto-incrementing number: 0, 1, 2, ...

In [7]:
# age_ser holds the ages of 5 employees; no index is passed, so the labels
# are the automatically assigned integers 0..4
age_ser = pd.Series(data=[25, 27, 24, 28, 30])

In [8]:
# display age_ser
age_ser

0    25
1    27
2    24
3    28
4    30
dtype: int64

#### Index with explicit labels (label index)

In [9]:
# age_ser holds the ages of 5 employees, labelled with the ids 101..105
# instead of the default 0..4 positions
emp_ids = [101, 102, 103, 104, 105]
age_ser = pd.Series(data=[25, 27, 24, 28, 30], index=emp_ids)

In [10]:
age_ser

101    25
102    27
103    24
104    28
105    30
dtype: int64

In [11]:
# score_ser holds the scores of 5 students, keyed by student label
score_by_student = {
    'std1': 7.5,
    'std2': 8.0,
    'std3': 6.0,
    'std4': 7.0,
    'std5': 3.5,
}
score_ser = pd.Series(score_by_student)
score_ser

std1    7.5
std2    8.0
std3    6.0
std4    7.0
std5    3.5
dtype: float64

In [12]:
# Load baseball players' heights (in inches) from a comma-delimited text file
# into a NumPy array; the array is turned into a Series in a later cell
height_arr = np.loadtxt('data/heights_1.txt', delimiter=',')
height_arr

array([74., 74., 72., ..., 75., 75., 73.])

In [13]:
# Same file read with np.genfromtxt instead of np.loadtxt; the array shown
# below matches the loadtxt result
height_arr = np.genfromtxt('data/heights_1.txt', delimiter=',') 
height_arr

array([74., 74., 72., ..., 75., 75., 73.])

In [16]:
# Wrap the loaded height array in a Series (default integer index)
height_ser = pd.Series(height_arr)

In [15]:
height_ser.head() # see first 5 lines

0    74.0
1    74.0
2    72.0
3    72.0
4    73.0
dtype: float64

In [17]:
height_ser.tail() # see last 5 lines

1010    73.0
1011    74.0
1012    75.0
1013    75.0
1014    73.0
dtype: float64

In [18]:
height_ser.shape

(1015,)

In [19]:
height_ser.info()

<class 'pandas.core.series.Series'>
RangeIndex: 1015 entries, 0 to 1014
Series name: None
Non-Null Count  Dtype  
--------------  -----  
1015 non-null   float64
dtypes: float64(1)
memory usage: 8.1 KB
