# Of Python Types: NumPy & PyArrow (Pandas)

### Loading Libraries

In [1]:
# System
import sys

# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# PyArrow
import pyarrow as pa

### Python Types

In [2]:
# A built-in Python int, used as the baseline for the fixed-width types below
x = 127

In [3]:
# Python ints grow as needed, so going past 127 is unremarkable
x + 1

128

In [4]:
# Size of the int object in bytes: 28 for a value that would fit in one byte
sys.getsizeof(x)

28

### NumPy

In [5]:
# Ten consecutive integers (0-9) stored as fixed-width 32-bit values
array = np.arange(10, dtype='int32')

In [6]:
# In-place, element-wise increment; note that re-running this cell adds 1 again
array += 1

### Differences in NumPy & Python

In [7]:
# One-element unsigned 8-bit array (uint8 holds 0-255)
n1 = np.array([1], dtype='uint8')

In [8]:
# 255 is the largest value uint8 can represent
n255 = np.array([255], dtype='uint8')

In [9]:
# Fixed-width arithmetic wraps around: 1 + 255 yields 0 rather than 256
n1 + n255

array([0], dtype=uint8)

In [10]:
# Mixing integers with np.nan (a float) makes NumPy store the array as floats
demo = np.array([1, 2, 3, np.nan])

In [11]:
# The integers are displayed as 1., 2., 3. because the array holds floats
demo

array([ 1.,  2.,  3., nan])

### Integer Types

In [12]:
# Each value fits in a signed 8-bit integer (max 127)
small_values = [1, 99, 127]

# Powers of two that step past the int32, int64 and uint64 ranges in turn
large_values = [2**bits for bits in (31, 63, 100)]

# Integers mixed with a missing entry
missing_values = [None, 1, -45]

In [13]:
# No dtype requested, so pandas picks one itself
small_ser = pd.Series(small_values)

In [14]:
# Inferred dtype is int64, although every value fits in 8 bits
small_ser

0      1
1     99
2    127
dtype: int64

In [15]:
# Downcast after the fact; safe here because all values are within int8 range
small_ser.astype('int8')

0      1
1     99
2    127
dtype: int8

In [16]:
# Alternatively, request int8 when constructing the Series
small_ser = pd.Series(small_values, dtype='int8')
small_ser

0      1
1     99
2    127
dtype: int8

In [17]:
# Same data and width, backed by PyArrow instead of NumPy
small_ser_pa = pd.Series(small_values, dtype='int8[pyarrow]')
small_ser_pa

0      1
1     99
2    127
dtype: int8[pyarrow]

In [18]:
# 2**100 exceeds every 64-bit integer type, so pandas falls back to object dtype
larger_ser = pd.Series(large_values)
larger_ser

0                         2147483648
1                9223372036854775808
2    1267650600228229401496703205376
dtype: object

In [19]:
# 2**63 and 2**100 do not fit in a signed 64-bit integer, so the conversion
# fails. Catch the error and show it, so the failure is demonstrated without
# stopping a "Restart & Run All" and without rebinding `larger_ser` to itself.
try:
    pd.Series(larger_ser, dtype='int64[pyarrow]')
except OverflowError as exc:
    print(f'{type(exc).__name__}: {exc}')

OverflowError: Python int too large to convert to C long

In [None]:
# Default inference with a None present
# NOTE(review): this cell has no recorded output (In [None]); re-run it so the
# result can be compared with the PyArrow version that follows
missing_ser = pd.Series(missing_values)
missing_ser

In [20]:
# PyArrow-backed int8 keeps the values as integers and shows the gap as <NA>
missing_ser_pa = pd.Series(missing_values, dtype='int8[pyarrow]')
missing_ser_pa

0    <NA>
1       1
2     -45
dtype: int8[pyarrow]

In [21]:
medium_values = [
    2**15 + 5,   # just above the int16 maximum
    2**31 - 8,   # just below the int32 maximum
    2**63,       # one past the int64 maximum
]

# No dtype requested: pandas infers one that can hold the largest value
medium_ser = pd.Series(data=medium_values)
medium_ser

0                  32773
1             2147483640
2    9223372036854775808
dtype: uint64

In [22]:
# The NumPy-backed cast silently wraps out-of-range values:
# 32773 -> 5, 2147483640 -> -8, 2**63 -> 0
medium_ser.astype('int8')

0    5
1   -8
2    0
dtype: int8

In [23]:
# The PyArrow-backed cast checks the range and refuses out-of-range values
# instead of wrapping them. Catch the error and show it so the notebook
# still runs from top to bottom.
try:
    medium_ser.astype('int8[pyarrow]')
except pa.ArrowInvalid as exc:
    print(f'{type(exc).__name__}: {exc}')

ArrowInvalid: Integer value 32773 not in range: 0 to 127

### Floating Point Types

In [24]:
# Plain floats
float_vals = [1.5, 2.7, 127.0]

# Floats with a missing entry
float_missing = [None, 1.5, -45.0]

# Floats mixed with a string marker ('T') and an int (0)
float_rain = [1.5, 2.7, 0.0, 'T', 1.5, 0]

In [25]:
# A list of Python floats is stored as float64
pd.Series(float_vals)

0      1.5
1      2.7
2    127.0
dtype: float64

In [26]:
# With the default float64 dtype, the None is stored as NaN
pd.Series(float_missing)

0     NaN
1     1.5
2   -45.0
dtype: float64

In [27]:
# The PyArrow-backed 64-bit float is reported as double[pyarrow]
pd.Series(float_vals, dtype='float64[pyarrow]')

0      1.5
1      2.7
2    127.0
dtype: double[pyarrow]

In [28]:
# The PyArrow-backed Series shows the missing entry as <NA> instead of NaN
pd.Series(float_missing, dtype='float64[pyarrow]')

0    <NA>
1     1.5
2   -45.0
dtype: double[pyarrow]

In [29]:
# The 'T' string prevents a numeric dtype; the Series falls back to object
pd.Series(float_rain)

0    1.5
1    2.7
2    0.0
3      T
4    1.5
5      0
dtype: object

In [30]:
# 'T' is the only non-numeric entry (presumably a "trace" reading); treat it
# as zero. Substituting a numeric 0, as the PyArrow variants of this cleanup
# do, keeps the cast from relying on string parsing.
pd.Series(float_rain).replace('T', 0).astype('float64')

0    1.5
1    2.7
2    0.0
3    0.0
4    1.5
5    0.0
dtype: float64

In [31]:
# Replace the 'T' marker with 0, then cast to the PyArrow-backed float type
pd.Series(float_rain).replace('T', 0).astype('float64[pyarrow]')

0    1.5
1    2.7
2    0.0
3    0.0
4    1.5
5    0.0
dtype: double[pyarrow]

In [32]:
# The same replace-then-cast pipeline written as a parenthesized method chain,
# one step per line
(pd.Series(float_rain)
    .replace('T', 0)
    .astype('float64[pyarrow]')
)

0    1.5
1    2.7
2    0.0
3    0.0
4    1.5
5    0.0
dtype: double[pyarrow]

### String Data

In [33]:
# PyArrow's string type wrapped explicitly as a pandas extension dtype
string_pa = pd.ArrowDtype(pa.string())

In [34]:
# Free-form text with no missing entries
text_freeform = ['My name is Jeff', 'I like pandas',
                 'I like programming']

In [35]:
# Free-form text with a missing entry (None) in the middle
text_with_missing = ['My name is Jeff', None, 'I like programming']

In [36]:
# Default inference for a list of Python strings (dtype shown as str)
pd.Series(text_freeform)

0       My name is Jeff
1         I like pandas
2    I like programming
dtype: str

In [37]:
# With the default string dtype, the None is displayed as NaN
pd.Series(text_with_missing)

0       My name is Jeff
1                   NaN
2    I like programming
dtype: str

In [38]:
# Built with the explicit ArrowDtype: reported as string[pyarrow]
tf1 = pd.Series(text_freeform, dtype=string_pa)
tf1

0       My name is Jeff
1         I like pandas
2    I like programming
dtype: string[pyarrow]

In [39]:
# Built with the 'string[pyarrow]' alias: reported simply as string
tf2 = pd.Series(text_freeform, dtype='string[pyarrow]')
tf2

0       My name is Jeff
1         I like pandas
2    I like programming
dtype: string

In [40]:
# The ArrowDtype and the 'string[pyarrow]' alias are not the same dtype,
# so this comparison is False even though both Series hold the same text
tf1.dtype == tf2.dtype

False

In [41]:
# With the ArrowDtype, the missing entry is displayed as <NA>
pd.Series(text_with_missing, dtype=string_pa)

0       My name is Jeff
1                  <NA>
2    I like programming
dtype: string[pyarrow]

### Categorical Data

In [42]:
# A few state abbreviations
states = ['CA', 'NY', 'TX']

# Month abbreviations listed in calendar order
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug',
          'Sep', 'Oct', 'Nov', 'Dec']

In [43]:
# Store the labels as a categorical; the categories are inferred from the data
pd.Series(states, dtype='category')

0    CA
1    NY
2    TX
dtype: category
Categories (3, str): ['CA', 'NY', 'TX']

In [44]:
# Inferred categories are listed alphabetically ('Apr', 'Aug', ...),
# not in calendar order
pd.Series(months, dtype='category')

0     Jan
1     Feb
2     Mar
3     Apr
4     May
5     Jun
6     Jul
7     Aug
8     Sep
9     Oct
10    Nov
11    Dec
dtype: category
Categories (12, str): ['Apr', 'Aug', 'Dec', 'Feb', ..., 'May', 'Nov', 'Oct', 'Sep']

In [45]:
# Unordered categories sort alphabetically, so the months do not come out
# in calendar order
pd.Series(months, dtype='category').sort_values()

3     Apr
7     Aug
11    Dec
1     Feb
0     Jan
6     Jul
5     Jun
2     Mar
4     May
10    Nov
9     Oct
8     Sep
dtype: category
Categories (12, str): ['Apr', 'Aug', 'Dec', 'Feb', ..., 'May', 'Nov', 'Oct', 'Sep']

In [46]:
# Ordered categorical dtype whose category order is the order of `months`
month_cat = pd.CategoricalDtype(categories=months, ordered=True)

In [47]:
# With the ordered dtype, sorting follows calendar order (Jan < Feb < ... < Dec)
pd.Series(months, dtype=month_cat).sort_values()

0     Jan
1     Feb
2     Mar
3     Apr
4     May
5     Jun
6     Jul
7     Aug
8     Sep
9     Oct
10    Nov
11    Dec
dtype: category
Categories (12, str): ['Jan' < 'Feb' < 'Mar' < 'Apr' ... 'Sep' < 'Oct' < 'Nov' < 'Dec']

In [48]:
# A PyArrow-backed string Series converts to the same ordered categorical
pd.Series(months, dtype=string_pa).astype(month_cat)

0     Jan
1     Feb
2     Mar
3     Apr
4     May
5     Jun
6     Jul
7     Aug
8     Sep
9     Oct
10    Nov
11    Dec
dtype: category
Categories (12, str): ['Jan' < 'Feb' < 'Mar' < 'Apr' ... 'Sep' < 'Oct' < 'Nov' < 'Dec']

In [51]:
# PyArrow's dictionary-encoded type: int64 indices into string values,
# reported as unordered (ordered=0)
pd.Series(months,
          dtype=pd.ArrowDtype(pa.dictionary(pa.int64(), pa.string())))

0     Jan
1     Feb
2     Mar
3     Apr
4     May
5     Jun
6     Jul
7     Aug
8     Sep
9     Oct
10    Nov
11    Dec
dtype: dictionary<values=string, indices=int64, ordered=0>[pyarrow]

In [None]:
### Dates & Times