# Series Introduction

### Loading Libraries

In [43]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# PyArrow
import pyarrow as pa

### A Simple Series Object

In [2]:
series = {
    'index': [0, 1, 2, 3],
    'data': [145, 142, 38, 13],
    'name': 'songs'
}

In [3]:
def get(series, idx):
    value_idx = series['index'].index(idx)
    return series['data'][value_idx]

In [4]:
get(series, 1)

142

### The Index Abstraction

In [5]:
songs = {
    'index': ['Paul', 'John', 'George', 'Ringo'],
    'data': [145, 142, 38, 13],
    'name': 'counts'
}

In [6]:
get(songs, 'John')

142

#### The `pandas` Series

In [8]:
songs2 = pd.Series([145, 142, 38, 13],
                   name='counts')

In [9]:
songs2

0    145
1    142
2     38
3     13
Name: counts, dtype: int64

In [10]:
# On PyArrow
songs3 = pd.Series([145, 142, 38, 13],
                   name='counts', dtype='int64[pyarrow]')

In [11]:
songs3

0    145
1    142
2     38
3     13
Name: counts, dtype: int64[pyarrow]

In [12]:
songs2.index

RangeIndex(start=0, stop=4, step=1)

In [14]:
songs3 = pd.Series([145, 142, 38, 13],
                   name='counts',
                   index=['Paul', 'John', 'George', 'Ringo'],
                   dtype='int64[pyarrow]')

In [15]:
songs3

Paul      145
John      142
George     38
Ringo      13
Name: counts, dtype: int64[pyarrow]

In [16]:
songs3.index

Index(['Paul', 'John', 'George', 'Ringo'], dtype='str')

In [17]:
class Foo:
    pass

In [18]:
ringo = pd.Series(
    ['Richard', 'Starkey', 13, Foo()],
    name='Ringo')

In [19]:
ringo

0                                 Richard
1                                 Starkey
2                                      13
3    <__main__.Foo object at 0x13876e950>
Name: Ringo, dtype: object

### The NA Value

In [21]:
nan_series = pd.Series([2, np.nan],
                        index=['Ono', 'Clapton'])

In [22]:
nan_series

Ono        2.0
Clapton    NaN
dtype: float64

In [23]:
nan_series2 = pd.Series([2, np.nan],
                        index=['Ono', 'Clapton'], dtype='int64[pyarrow]')

In [24]:
nan_series2

Ono           2
Clapton    <NA>
dtype: int64[pyarrow]

In [25]:
nan_series2.count()

np.int64(1)

In [26]:
nan_series2.size

2

### Similar to NumPy

In [27]:
numpy_ser = np.array([145, 142, 38, 13])

In [28]:
songs3.iloc[1]

142

In [29]:
numpy_ser[1]

np.int64(142)

In [31]:
songs3.mean()

84.5

In [32]:
numpy_ser.mean()

np.float64(84.5)

In [34]:
len(set(dir(numpy_ser)) & set(dir(songs3)))

107

In [35]:
mask = songs3 > songs3.median()

In [36]:
mask

Paul       True
John       True
George    False
Ringo     False
Name: counts, dtype: bool[pyarrow]

In [37]:
songs3[mask]

Paul    145
John    142
Name: counts, dtype: int64[pyarrow]

In [38]:
numpy_ser[numpy_ser > np.median(numpy_ser)]

array([145, 142])

### Categorical Data

In [39]:
s = pd.Series(['s', 'm', 'l'], dtype='category')

In [40]:
s

0    s
1    m
2    l
dtype: category
Categories (3, str): ['l', 'm', 's']

In [44]:
dict_type = pd.ArrowDtype(pa.dictionary(pa.int64(), pa.utf8()))

In [52]:
s = pd.Series(['m', 'l', 'xs', 's', 'xl'], dtype='dict_type')

TypeError: data type 'dict_type' not understood

In [53]:
(pd.Series(['sm', 'm', 'l'], dtype='category')
 .rename('size')
 .to_frame()
 .to_feather('/tmp/cat.ft')
)

In [54]:
# (pd.read_feather('/tmp/cat.ft', dtype='pyarrow')
#  .loc[:, 'size']
#  .dtype
# )

# Fixed Version
(df := pd.read_feather("/tmp/cat.ft", dtype_backend="pyarrow")
).loc[:, "size"].dtype

dictionary<values=large_string, indices=int8, ordered=0>[pyarrow]

In [56]:
s.cat.ordered

False

In [57]:
s2 = pd.Series(['m', 'l', 'xs', 's', 'xl'], dtype='string[pyarrow]')

In [58]:
size_type = pd.CategoricalDtype(
    categories=['s', 'm', 'l'], ordered=True)

In [59]:
s3 = s2.astype(size_type)

  s3 = s2.astype(size_type)


In [60]:
s3

0      m
1      l
2    NaN
3      s
4    NaN
dtype: category
Categories (3, str): ['s' < 'm' < 'l']

In [61]:
s3 > 's'

0     True
1     True
2    False
3    False
4    False
dtype: bool

In [62]:
s = pd.Series(['s', 'm', 'l'], dtype='category')

In [63]:
s.cat.reorder_categories(['xs', 's', 'm', 'l', 'xl'], ordered=True)

ValueError: items in new_categories are not the same as in old categories

In [64]:
s.cat.categories

Index(['l', 'm', 's'], dtype='str')

In [66]:
(s
 .cat.add_categories(['xs', 'xl', ])
 .cat_reorder_categories(['xs', 's', 'm', 'l', 'xl'],
                        ordered=True)
)

AttributeError: 'Series' object has no attribute 'cat_reorder_categories'