### Pandas Series

In [1]:
import pandas as pd

In [2]:
# Load the Titanic passenger data into a DataFrame. The path is relative, so
# it is resolved against the current working directory when this cell runs.
titanic = pd.read_csv('../Course_Materials_Part1/Video_Lecture_NBs/titanic.csv')

In [3]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    int64  
 1   pclass    891 non-null    int64  
 2   sex       891 non-null    object 
 3   age       714 non-null    float64
 4   sibsp     891 non-null    int64  
 5   parch     891 non-null    int64  
 6   fare      891 non-null    float64
 7   embarked  889 non-null    object 
 8   deck      203 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


In [4]:
titanic['age']

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: age, Length: 891, dtype: float64

In [5]:
type(titanic['age'])

pandas.core.series.Series

In [6]:
# Bracket access and attribute access select the same column; .equals()
# compares the two resulting Series and reports True, even though the
# column contains NaN entries.
titanic['age'].equals(titanic.age)

True

In [7]:
# Bind the age column (a pandas Series) to a short name; the cells below
# all operate on this Series.
age = titanic['age']

In [8]:
age.head()

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: age, dtype: float64

In [9]:
age.tail()

886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: age, dtype: float64

In [10]:
age.dtype

dtype('float64')

In [11]:
age.shape

(891,)

In [12]:
len(age)

891

In [13]:
age.index

RangeIndex(start=0, stop=891, step=1)

In [14]:
# In the pandas version used for this notebook, .info() exists on DataFrame
# but not on Series, so this call raises AttributeError. The error is the
# point of the demonstration, so catch it and display the message instead of
# letting it abort a Restart & Run All.
# NOTE(review): newer pandas releases (1.4+) are documented to provide
# Series.info(); there the call simply succeeds and prints the summary.
try:
    age.info()
except AttributeError as err:
    print(f"{type(err).__name__}: {err}")

# Workaround: convert the Series to a one-column DataFrame with .to_frame()
# and call .info() on that.

AttributeError: 'Series' object has no attribute 'info'

In [15]:
age.to_frame()

Unnamed: 0,age
0,22.0
1,38.0
2,26.0
3,35.0
4,35.0
...,...
886,27.0
887,19.0
888,
889,26.0


In [16]:
# Workaround for the failing age.info() call: wrap the Series in a one-column
# DataFrame first, then ask the DataFrame for its summary.
age.to_frame().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     714 non-null    float64
dtypes: float64(1)
memory usage: 7.1 KB


### Analysing numerical series

In [17]:
age

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: age, Length: 891, dtype: float64

In [18]:
age.describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: age, dtype: float64

In [19]:
# count() reports only the non-missing values (matches the "714 non-null"
# shown by .info()), so it is smaller than the length of the Series.
age.count()

714

In [20]:
# size is the total number of entries, missing values included; it agrees
# with len(age) and the RangeIndex length (891), not with count().
age.size

891

In [21]:
len(age)

891

In [22]:
age.sum()

21205.17

In [23]:
# The built-in sum() does not skip missing values: it adds the elements one
# by one, so the NaN entries propagate and the result is nan. Compare with
# age.sum(), which returned a number for the same data.
sum(age)

nan

In [24]:
age.mean()

29.69911764705882

In [25]:
age.median()

28.0

In [26]:
age.std()

14.526497332334042

In [27]:
age.min()

0.42

In [28]:
age.max()

80.0

In [29]:
age.unique()

array([22.  , 38.  , 26.  , 35.  ,   nan, 54.  ,  2.  , 27.  , 14.  ,
        4.  , 58.  , 20.  , 39.  , 55.  , 31.  , 34.  , 15.  , 28.  ,
        8.  , 19.  , 40.  , 66.  , 42.  , 21.  , 18.  ,  3.  ,  7.  ,
       49.  , 29.  , 65.  , 28.5 ,  5.  , 11.  , 45.  , 17.  , 32.  ,
       16.  , 25.  ,  0.83, 30.  , 33.  , 23.  , 24.  , 46.  , 59.  ,
       71.  , 37.  , 47.  , 14.5 , 70.5 , 32.5 , 12.  ,  9.  , 36.5 ,
       51.  , 55.5 , 40.5 , 44.  ,  1.  , 61.  , 56.  , 50.  , 36.  ,
       45.5 , 20.5 , 62.  , 41.  , 52.  , 63.  , 23.5 ,  0.92, 43.  ,
       60.  , 10.  , 64.  , 13.  , 48.  ,  0.75, 53.  , 57.  , 80.  ,
       70.  , 24.5 ,  6.  ,  0.67, 30.5 ,  0.42, 34.5 , 74.  ])

In [30]:
# unique() lists nan as one of the distinct values (see the array above),
# so its length is one more than the number of distinct non-missing ages.
len(age.unique())

89

In [31]:
# nunique() as called here does not count NaN as a value, which is why it
# returns one less than len(age.unique()).
age.nunique()

88

In [32]:
age.value_counts()

24.00    30
22.00    27
18.00    26
19.00    25
30.00    25
         ..
55.50     1
70.50     1
66.00     1
23.50     1
0.42      1
Name: age, Length: 88, dtype: int64

In [33]:
# sort=True spelled out explicitly; the output is identical to the bare
# value_counts() call, i.e. sorting by frequency is the default behaviour.
age.value_counts(sort=True)

24.00    30
22.00    27
18.00    26
19.00    25
30.00    25
         ..
55.50     1
70.50     1
66.00     1
23.50     1
0.42      1
Name: age, Length: 88, dtype: int64

In [34]:
# dropna=True spelled out explicitly; same output as the bare call, so NaN is
# left out of the tally by default (Length: 88, no NaN row).
age.value_counts(dropna=True)

24.00    30
22.00    27
18.00    26
19.00    25
30.00    25
         ..
55.50     1
70.50     1
66.00     1
23.50     1
0.42      1
Name: age, Length: 88, dtype: int64

In [35]:
# ascending=False spelled out explicitly; same output as the bare call, so
# the most frequent value comes first by default.
age.value_counts(ascending=False)

24.00    30
22.00    27
18.00    26
19.00    25
30.00    25
         ..
55.50     1
70.50     1
66.00     1
23.50     1
0.42      1
Name: age, Length: 88, dtype: int64

In [36]:
# All four options written out with the values used implicitly so far; the
# result is unchanged (absolute counts, most frequent first, NaN excluded).
age.value_counts(sort=True, dropna=True, ascending=False, normalize=False)

24.00    30
22.00    27
18.00    26
19.00    25
30.00    25
         ..
55.50     1
70.50     1
66.00     1
23.50     1
0.42      1
Name: age, Length: 88, dtype: int64

In [37]:
# normalize=True switches from absolute counts to relative frequencies
# (fractions of the total) for each distinct age.
age.value_counts(sort=True, dropna=True, ascending=False, normalize=True)

24.00    0.042017
22.00    0.037815
18.00    0.036415
19.00    0.035014
30.00    0.035014
           ...   
55.50    0.001401
70.50    0.001401
66.00    0.001401
23.50    0.001401
0.42     0.001401
Name: age, Length: 88, dtype: float64

In [38]:
# Sanity check of the normalized value for the most frequent age (24.0,
# 30 occurrences): dividing by size (all 891 rows, NaN included) does NOT
# reproduce the 0.042017 reported by normalize=True.
30/age.size

0.03367003367003367

In [39]:
# Dividing by count() (the 714 non-missing ages) reproduces 0.042017, so with
# dropna=True the relative frequencies are based on non-missing values only.
30/age.count()

0.04201680672268908

In [40]:
# bins=5 groups the ages into five intervals covering the observed range and
# counts the values per interval instead of per distinct age. The interval
# counts in the output add up to 714, the number of non-missing ages.
age.value_counts(sort=True, dropna=True, ascending=False, normalize=False, bins=5)

(16.336, 32.252]    346
(32.252, 48.168]    188
(0.339, 16.336]     100
(48.168, 64.084]     69
(64.084, 80.0]       11
Name: age, dtype: int64

In [41]:
# Ten intervals, reported as relative frequencies.
# NOTE(review): the fractions in the recorded output add up to about 0.80
# (= 714/891), not 1.0, so in this binned form the shares appear to be taken
# relative to all 891 rows, missing ages included, despite dropna=True.
# Keep that in mind when reading them as percentages of known ages.
age.value_counts(sort=True, dropna=True, ascending=False, normalize=True, bins=10)

(16.336, 24.294]    0.198653
(24.294, 32.252]    0.189675
(32.252, 40.21]     0.132435
(40.21, 48.168]     0.078563
(0.339, 8.378]      0.060606
(8.378, 16.336]     0.051627
(48.168, 56.126]    0.050505
(56.126, 64.084]    0.026936
(64.084, 72.042]    0.010101
(72.042, 80.0]      0.002245
Name: age, dtype: float64

In [42]:
# With dropna=False the missing ages are included in the tally, so the
# relative frequencies cover every row and should add up to 1. The result is
# 0.9999999999999999 rather than exactly 1.0 because of floating-point
# rounding when summing the fractions.
age.value_counts(sort=True, dropna=False, ascending=False, normalize=True).sum()

0.9999999999999999