# Setup

In [1]:
# Display the installed pandas version so it can be compared with 1.5.3,
# the version this notebook expects.
import pandas
pandas.__version__

'1.5.3'

In [2]:
# If the pandas version shown above is not 1.5.3, uncomment and run the next line.
# %pip install pandas==1.5.3

In [3]:
# Display the installed bigframes version so it can be compared with 0.23.0,
# the version this notebook expects.
import bigframes
bigframes.__version__

'0.23.0'

In [4]:
# If the bigframes version shown above is not 0.23.0, uncomment and run the next line.
# %pip install bigframes==0.23.0

In [5]:
# Imports, grouped in one place.
# NOTE: in this notebook `pd` is bigframes.pandas, NOT pandas. Plain pandas
# stays available under its full name `pandas`.
import bigframes.pandas as pd
import numpy as np
import pandas
import pyarrow as pa

# BigQuery project and location that the BigFrames options below point at.
PROJECT_ID = 'bigframes-bugbash-pandas'
REGION = 'us'

# Optional display tweak, left disabled. Presumably turns off the progress
# bar output -- confirm against the bigframes display options documentation.
# pd.options.display.progress_bar = None
pd.options.bigquery.project = PROJECT_ID
pd.options.bigquery.location = REGION

# Cheat sheet

## 1. Object types

There are 3 BigFrames native object types:
1. DataFrame
2. Series
3. Index

In [6]:
# DataFrame - 2 columns ('A', 'B') by 3 rows, with explicit index labels x, y, z
pd.DataFrame(
    {
        'A': [1, 2, 3],
        'B': ['a', 'b', 'c'],
    },
    index=['x', 'y', 'z'],
)

Unnamed: 0,A,B
x,1,a
y,2,b
z,3,c


In [7]:
# Series - three string values carrying the index labels x, y, z
pd.Series(
    ['a', 'a', 'c'],
    index=['x', 'y', 'z'],
)

x    a
y    a
z    c
dtype: string

In [8]:
# Index - a standalone index built from three string labels
pd.Index(['x', 'y', 'z'])

Index(['x', 'y', 'z'], dtype='string')

## 2. Data types

BigFrames currently supports 11 data types (dtype):

|        | BigFrames dtype string | BigQuery data type | corresponding pandas dtype | pyarrow type |
| ------ | ------------ | ----------- | ----------- | ----------- |
|  1     | boolean      | BOOL       | pd.BooleanDtype()       | pa.bool_()       |
|  2     | Float64        | FLOAT64        | pd.Float64Dtype()        | pa.float64()        |
|  3     | Int64        | INT64        | pd.Int64Dtype()        | pa.int64()        |
|  4     | string / string[pyarrow]        | STRING        | pd.StringDtype(storage="pyarrow")        | pa.string()        |
|  5     | timestamp[us, tz=UTC][pyarrow]        | TIMESTAMP        | pd.ArrowDtype(pa.timestamp("us", tz="UTC"))        | pa.timestamp("us", tz="UTC")        |
|  6     | timestamp[us][pyarrow]        | DATETIME        | pd.ArrowDtype(pa.timestamp("us"))        | pa.timestamp("us")        |
|  7     | date32[day][pyarrow]        | DATE        | pd.ArrowDtype(pa.date32())        | pa.date32()        |
|  8     | time64[us][pyarrow]        | TIME        | pd.ArrowDtype(pa.time64("us"))        | pa.time64("us")        |
|  9     | decimal128(38, 9)[pyarrow]        | NUMERIC / DECIMAL        | pd.ArrowDtype(pa.decimal128(38, 9))        | pa.decimal128(38, 9)        |
|  10    | decimal256(76, 38)[pyarrow]        | BIGNUMERIC / BIGDECIMAL        | pd.ArrowDtype(pa.decimal256(76, 38))        | pa.decimal256(76, 38)        |
|  11    | binary[pyarrow]        | BYTES        | pd.ArrowDtype(pa.binary())        | pa.binary()        |

In [9]:
# BOOL - boolean values with the dtype requested explicitly via pd.BooleanDtype()
pd.Series(
    [True, False, True],
    dtype=pd.BooleanDtype(),
)

0     True
1    False
2     True
dtype: boolean

In [10]:
# FLOAT64 - Python float literals; no dtype is passed, so it is inferred
pd.Series(
    [
        3.1415926535,
        2.343512341324,
        1321341234.413241234,
    ]
)

0             3.141593
1             2.343512
2    1321341234.413241
dtype: Float64

In [11]:
# INT64 - Python int literals of varying magnitude; no dtype is passed, so it is inferred
pd.Series(
    [
        14342314,
        3412341,
        754,
        626546437654754,
        8,
    ]
)

0           14342314
1            3412341
2                754
3    626546437654754
4                  8
dtype: Int64

In [12]:
# STRING - Python str literals; no dtype is passed, so it is inferred
pd.Series(
    [
        'apple',
        'banana',
        'cherry',
    ]
)

0     apple
1    banana
2    cherry
dtype: string

In [13]:
# TIMESTAMP - integers parsed with unit='ms', then cast to a microsecond
# Arrow timestamp that carries the UTC time zone
(
    pd.to_datetime([6967025221020, 969611531202, 1356916312000], unit='ms')
    .astype(pd.ArrowDtype(pa.timestamp("us", tz='UTC')))
)

0    2190-10-10 20:47:01.020000+00:00
1    2000-09-22 08:32:11.202000+00:00
2           2012-12-31 01:11:52+00:00
Name: 0, dtype: timestamp[us, tz=UTC][pyarrow]

In [14]:
# DATETIME - integers parsed with unit='ms', then cast to a microsecond
# Arrow timestamp with no time zone
(
    pd.to_datetime([6967025221020, 969611531202, 1356916312000], unit='ms')
    .astype(pd.ArrowDtype(pa.timestamp("us")))
)

0    2190-10-10 20:47:01.020000
1    2000-09-22 08:32:11.202000
2           2012-12-31 01:11:52
Name: 0, dtype: timestamp[us][pyarrow]

In [15]:
# DATE - same parse and timestamp cast as the DATETIME example, followed by a
# date32 cast.
# NOTE(review): the date32 cast is applied to the object returned by
# to_pandas(), not to the BigFrames object -- presumably a workaround; confirm.
(
    pd.to_datetime([6967025221020, 969611531202, 1356916312000], unit='ms')
    .astype(pd.ArrowDtype(pa.timestamp("us")))
    .to_pandas()
    .astype(pd.ArrowDtype(pa.date32()))
)

0    2190-10-10
1    2000-09-22
2    2012-12-31
Name: 0, dtype: date32[day][pyarrow]

In [16]:
# TIME - same parse and timestamp cast as the DATETIME example, followed by a
# time64("us") cast.
# NOTE(review): the time64 cast is applied to the object returned by
# to_pandas(), not to the BigFrames object -- presumably a workaround; confirm.
(
    pd.to_datetime([6967025221020, 969611531202, 1356916312000], unit='ms')
    .astype(pd.ArrowDtype(pa.timestamp("us")))
    .to_pandas()
    .astype(pd.ArrowDtype(pa.time64("us")))
)

0    20:47:01.020000
1    08:32:11.202000
2           01:11:52
Name: 0, dtype: time64[us][pyarrow]

In [17]:
# NUMERIC / DECIMAL - a Series of float literals cast to decimal128(38, 9).
# The inputs are floats, so the displayed decimals can differ from the
# literals in the trailing digits (see the last row of the output).
(
    pd.Series([3.1415926535, 2.343512341324, 1321341234.413241234])
    .astype(pd.ArrowDtype(pa.decimal128(38, 9)))
)

0             3.141592654
1             2.343512341
2    1321341234.413241148
dtype: decimal128(38, 9)[pyarrow]

In [18]:
# BIGNUMERIC / BIGDECIMAL - a Series of float literals cast to decimal256(76, 38).
# The inputs are floats, so the displayed decimals expose the float
# representation in their trailing digits (see the output).
(
    pd.Series([3.1415926535, 2.343512341324, 1321341234.413241234])
    .astype(pd.ArrowDtype(pa.decimal256(76, 38)))
)

0             3.14159265350000005412312020780518651009
1             2.34351234132400021437092618725728243589
2    1321341234.41324114799499511718750000000000000000
dtype: decimal256(76, 38)[pyarrow]

In [19]:
# BYTES - Python bytes literals; no dtype is passed, so it is inferred
pd.Series(
    [
        b'apple',
        b'banana',
        b'cherry',
    ]
)

0     b'apple'
1    b'banana'
2    b'cherry'
dtype: binary[pyarrow]

## 3. Corner cases

When testing the BigFrames pandas API functions, consider the following corner cases:
1. Empty inputs
2. Duplicate inputs (e.g. duplicate index value / column name)
3. Mixed dtype inputs
4. Mixed data structure inputs (e.g. array + dict)
5. Multiple (dependent/related) function parameters
6. Missing/NaN values
7. Extreme/boundary values
8. Error cases and exceptions


In [20]:
# Empty DataFrame - constructed with no arguments (the "empty inputs" corner case)
pd.DataFrame()

In [21]:
# DataFrame with a duplicated column name: each input frame has a single
# column called 'level1', and concatenating along axis=1 places the two
# same-named columns side by side.
data1 = {
    'level1': {
        'A': [1, 2],
        'B': [3, 4],
    },
}
data2 = {
    'level1': {
        'C': [5],
        'D': [6, 7],
    },
}
df1, df2 = pd.DataFrame(data1), pd.DataFrame(data2)
pd.concat([df1, df2], axis=1)

Unnamed: 0,level1,level1.1
A,"[1, 2]",[]
B,"[3, 4]",[]
C,[],[5]
D,[],"[6, 7]"


In [22]:
# DataFrame with a missing value - np.nan is the last entry of column 'A' (row 'z')
pd.DataFrame(
    {
        'A': [1, 2, np.nan],
        'B': ['a', 'b', 'c'],
    },
    index=['x', 'y', 'z'],
)

Unnamed: 0,A,B
x,1.0,a
y,2.0,b
z,,c


# Your test cases