In [None]:
!pip install polars

In [3]:
import polars as pl

### 1. Polars Data Types

Polars supports a variety of data types for storing and processing different kinds of data efficiently.

**Basic Data Types:**
- Integer Types: `Int8`, `Int16`, `Int32`, `Int64`
- Unsigned Integer Types: `UInt8`, `UInt16`, `UInt32`, `UInt64`
- Floating Point Types: `Float32`, `Float64`
- Boolean Type: `Boolean`
- String Type: `String` (Unicode string; `Utf8` is the older alias for the same type)
- Datetime Types: `Datetime`, `Date`, `Time`
- Categorical Type: `Categorical` (to save memory in case of repeated string values)
- List Type: `List` (used to store arrays inside a single column)
- Object Type: `Object` (used for any Python objects)

**Example: Creating a DataFrame with Various Types**

In [5]:
import datetime as dt

# One column per basic kind of value; Polars infers the dtype of each column
# from the Python objects it is given (see the header row of the printed table).
df = pl.DataFrame(
    {
        "integers": [1, 2, 3],
        "floats": [1.1, 2.2, 3.3],
        "booleans": [True, False, True],
        "strings": ["apple", "banana", "cherry"],
        # Three consecutive midnights: 2023-01-01, 2023-01-02, 2023-01-03
        "dates": [dt.datetime(2023, 1, day) for day in (1, 2, 3)],
    }
)

print(df)

shape: (3, 5)
┌──────────┬────────┬──────────┬─────────┬─────────────────────┐
│ integers ┆ floats ┆ booleans ┆ strings ┆ dates               │
│ ---      ┆ ---    ┆ ---      ┆ ---     ┆ ---                 │
│ i64      ┆ f64    ┆ bool     ┆ str     ┆ datetime[μs]        │
╞══════════╪════════╪══════════╪═════════╪═════════════════════╡
│ 1        ┆ 1.1    ┆ true     ┆ apple   ┆ 2023-01-01 00:00:00 │
│ 2        ┆ 2.2    ┆ false    ┆ banana  ┆ 2023-01-02 00:00:00 │
│ 3        ┆ 3.3    ┆ true     ┆ cherry  ┆ 2023-01-03 00:00:00 │
└──────────┴────────┴──────────┴─────────┴─────────────────────┘


### 2. Type Detection and Checking

You can check column types with `.dtype` (on a single column, e.g. `df['col'].dtype`) or `.dtypes` (on the whole DataFrame, which returns one type per column).

**Example: Checking Column Data Types**

In [6]:
# Small frame whose three columns each hold a different kind of value
df = pl.DataFrame(
    {
        "ints": [1, 2, 3],
        "floats": [1.1, 2.2, 3.3],
        "strings": ["a", "b", "c"],
    }
)

# `dtypes` returns the column types as a list, in column order
print(df.dtypes)

[Int64, Float64, String]


### 3. Type Conversion in Polars

You may need to convert data types for various reasons, such as:

- To ensure compatibility when performing operations.
- To optimize memory usage.
- To manipulate date, time, or categorical data more efficiently.

**Casting Columns to Different Types**

Polars allows you to cast or convert a column from one type to another using the `.cast()` method.

**Example: Converting Data Types**

In [9]:
# Numbers that arrived as text
df = pl.DataFrame({"numbers_as_strings": ["1", "2", "3"]})

# Show the dtype of the source column before converting it
print(f"Column Type: {df['numbers_as_strings'].dtype!s}")

# Add an Int64 copy of the string column next to the original
df = df.with_columns(
    pl.col("numbers_as_strings")
    .cast(pl.Int64)
    .alias("numbers_as_integers")
)

print(df)

Column Type: String
shape: (3, 2)
┌────────────────────┬─────────────────────┐
│ numbers_as_strings ┆ numbers_as_integers │
│ ---                ┆ ---                 │
│ str                ┆ i64                 │
╞════════════════════╪═════════════════════╡
│ 1                  ┆ 1                   │
│ 2                  ┆ 2                   │
│ 3                  ┆ 3                   │
└────────────────────┴─────────────────────┘


In [10]:
import datetime as dt


def add_date_time_parts(frame: pl.DataFrame) -> pl.DataFrame:
    """Return ``frame`` with ``day``, ``month``, ``year``, ``hour`` and ``minute`` columns added.

    The date parts are read from the ``event_date`` column and the time parts
    from the ``event_time`` column through the ``.dt`` accessors, so both
    columns must already hold date / time values (not strings).
    """
    return frame.with_columns(
        pl.col('event_date').dt.day().alias('day'),
        pl.col('event_date').dt.month().alias('month'),
        pl.col('event_date').dt.year().alias('year'),
        pl.col('event_time').dt.hour().alias('hour'),
        pl.col('event_time').dt.minute().alias('minute')
    )


# --- Case 1: a dataframe built from Python date / time / datetime objects ---
df_using_datetime = pl.DataFrame({
    'event_date': [dt.date(2023, 1, 1), dt.date(2023, 2, 1), dt.date(2023, 3, 1)],  # Date type
    'event_time': [dt.time(14, 30), dt.time(16, 45), dt.time(9, 0)],  # Time type
    'event_datetime': [
        dt.datetime(2023, 1, 1, 14, 30),
        dt.datetime(2023, 2, 1, 16, 45),
        dt.datetime(2023, 3, 1, 9, 0)
    ]  # Datetime type
})

# The extraction works directly because the columns already have temporal types
df_using_datetime = add_date_time_parts(df_using_datetime)
print(df_using_datetime.dtypes)
print(df_using_datetime)

# --- Case 2: the same data, but stored as strings ---
df_using_strings = pl.DataFrame({
    'event_date': ['2023-01-01', '2023-02-01', '2023-03-01'],
    'event_time': ['14:30:00', '16:45:00', '09:00:00'],
    'event_datetime': [
        '2023-01-01 14:30:00',
        '2023-02-01 16:45:00',
        '2023-03-01 09:00:00',
    ]  # still plain strings at this point; parsed to Datetime below
})

# The strings must first be parsed into Date / Time / Datetime columns
df_using_strings = df_using_strings.with_columns(
    pl.col('event_date').str.to_date('%Y-%m-%d'),
    pl.col('event_time').str.to_time('%H:%M:%S'),
    pl.col('event_datetime').str.to_datetime('%Y-%m-%d %H:%M:%S'),
)

# After the conversion the same extraction applies unchanged
df_using_strings = add_date_time_parts(df_using_strings)
print(df_using_strings.dtypes)
print(df_using_strings)

[Date, Time, Datetime(time_unit='us', time_zone=None), Int8, Int8, Int32, Int8, Int8]
shape: (3, 8)
┌────────────┬────────────┬─────────────────────┬─────┬───────┬──────┬──────┬────────┐
│ event_date ┆ event_time ┆ event_datetime      ┆ day ┆ month ┆ year ┆ hour ┆ minute │
│ ---        ┆ ---        ┆ ---                 ┆ --- ┆ ---   ┆ ---  ┆ ---  ┆ ---    │
│ date       ┆ time       ┆ datetime[μs]        ┆ i8  ┆ i8    ┆ i32  ┆ i8   ┆ i8     │
╞════════════╪════════════╪═════════════════════╪═════╪═══════╪══════╪══════╪════════╡
│ 2023-01-01 ┆ 14:30:00   ┆ 2023-01-01 14:30:00 ┆ 1   ┆ 1     ┆ 2023 ┆ 14   ┆ 30     │
│ 2023-02-01 ┆ 16:45:00   ┆ 2023-02-01 16:45:00 ┆ 1   ┆ 2     ┆ 2023 ┆ 16   ┆ 45     │
│ 2023-03-01 ┆ 09:00:00   ┆ 2023-03-01 09:00:00 ┆ 1   ┆ 3     ┆ 2023 ┆ 9    ┆ 0      │
└────────────┴────────────┴─────────────────────┴─────┴───────┴──────┴──────┴────────┘
[Date, Time, Datetime(time_unit='us', time_zone=None), Int8, Int8, Int32, Int8, Int8]
shape: (3, 8)
┌────────────┬───