# String Manipulation

### Loading Libraries

In [1]:
# ZipFiles
import zipfile

# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd
import polars as pl
import polars.selectors as cs

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# XGBoost
import xgboost as xgb

### Loading Data: `Twitter Data`

In [2]:
# Location of the Twitter CSV on GitHub; adjacent string literals are
# concatenated by Python, so the value is a single URL.
url = (
    'https://github.com/mattharrison/datasets/raw/'
    'master/data/__mharrison__2020-2021.csv'
)

In [3]:
raw = pl.read_csv(url)

In [4]:
print(raw)

shape: (5_791, 40)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ Tweet id  ┆ Tweet     ┆ Tweet     ┆ time      ┆ … ┆ promoted  ┆ promoted  ┆ promoted  ┆ promoted │
│ ---       ┆ permalink ┆ text      ┆ ---       ┆   ┆ email     ┆ dial      ┆ media     ┆ media    │
│ i64       ┆ ---       ┆ ---       ┆ str       ┆   ┆ tweet     ┆ phone     ┆ views     ┆ engageme │
│           ┆ str       ┆ str       ┆           ┆   ┆ ---       ┆ ---       ┆ ---       ┆ nts      │
│           ┆           ┆           ┆           ┆   ┆ str       ┆ str       ┆ str       ┆ ---      │
│           ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ str      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 121258051 ┆ https://t ┆ Sounds    ┆ 2020-01-0 ┆ … ┆ null      ┆ null      ┆ null      ┆ null     │
│ 790578073 ┆ witter.co ┆ like a    ┆ 2 03:44:0 ┆   ┆           ┆       

#### Tweaking Tweet Function:

In [5]:
def tweak_twit(df):
    """Return ``df`` restricted to eleven tweet columns.

    The identifier, permalink, text and timestamp columns are kept along
    with the impression/engagement metrics; every other column is dropped.

    Parameters
    ----------
    df
        Object exposing ``.select(list_of_column_names)``.

    Returns
    -------
    Whatever ``df.select`` returns for the column list below.
    """
    keep = [
        'Tweet id', 'Tweet permalink', 'Tweet text', 'time',
        'impressions', 'engagements', 'engagement rate',
        'retweets', 'replies', 'likes', 'user profile clicks',
    ]
    return df.select(keep)

In [6]:
twit = tweak_twit(raw)

In [7]:
print(twit)

shape: (5_791, 11)
┌─────────────┬─────────────┬─────────────┬─────────────┬───┬──────────┬─────────┬───────┬─────────┐
│ Tweet id    ┆ Tweet       ┆ Tweet text  ┆ time        ┆ … ┆ retweets ┆ replies ┆ likes ┆ user    │
│ ---         ┆ permalink   ┆ ---         ┆ ---         ┆   ┆ ---      ┆ ---     ┆ ---   ┆ profile │
│ i64         ┆ ---         ┆ str         ┆ str         ┆   ┆ f64      ┆ f64     ┆ f64   ┆ clicks  │
│             ┆ str         ┆             ┆             ┆   ┆          ┆         ┆       ┆ ---     │
│             ┆             ┆             ┆             ┆   ┆          ┆         ┆       ┆ f64     │
╞═════════════╪═════════════╪═════════════╪═════════════╪═══╪══════════╪═════════╪═══════╪═════════╡
│ 12125805179 ┆ https://twi ┆ Sounds like ┆ 2020-01-02  ┆ … ┆ 0.0      ┆ 0.0     ┆ 3.0   ┆ 3.0     │
│ 05780737    ┆ tter.com/__ ┆ a great     ┆ 03:44:00+00 ┆   ┆          ┆         ┆       ┆         │
│             ┆ mharriso…   ┆ topic! htt… ┆ :00         ┆   ┆          ┆

### Data Validation

Some accessors are already deprecated.

In [8]:
col = pl.col('Tweet permalink')

In [9]:
print([m for m in dir(col.str)
      if not m.startswith('_')])

['concat', 'contains', 'contains_any', 'count_matches', 'decode', 'encode', 'ends_with', 'escape_regex', 'explode', 'extract', 'extract_all', 'extract_groups', 'extract_many', 'find', 'find_many', 'head', 'join', 'json_decode', 'json_path_match', 'len_bytes', 'len_chars', 'pad_end', 'pad_start', 'replace', 'replace_all', 'replace_many', 'reverse', 'slice', 'split', 'split_exact', 'splitn', 'starts_with', 'strip_chars', 'strip_chars_end', 'strip_chars_start', 'strip_prefix', 'strip_suffix', 'strptime', 'tail', 'to_date', 'to_datetime', 'to_decimal', 'to_integer', 'to_lowercase', 'to_time', 'to_titlecase', 'to_uppercase', 'zfill']


In [10]:
# Checking overlaps
print(sorted([m for m in
             set(dir(col.str)) & set(dir(''))
             if not m.startswith('_')]))

['encode', 'find', 'join', 'replace', 'split', 'zfill']


In [11]:
print(sorted([m for m in
             set(dir(col.str)) - set(dir(''))
             if not m.startswith('_')]))

['concat', 'contains', 'contains_any', 'count_matches', 'decode', 'ends_with', 'escape_regex', 'explode', 'extract', 'extract_all', 'extract_groups', 'extract_many', 'find_many', 'head', 'json_decode', 'json_path_match', 'len_bytes', 'len_chars', 'pad_end', 'pad_start', 'replace_all', 'replace_many', 'reverse', 'slice', 'split_exact', 'splitn', 'starts_with', 'strip_chars', 'strip_chars_end', 'strip_chars_start', 'strip_prefix', 'strip_suffix', 'strptime', 'tail', 'to_date', 'to_datetime', 'to_decimal', 'to_integer', 'to_lowercase', 'to_time', 'to_titlecase', 'to_uppercase']


### URL `Tweet permalink` Validation

In [12]:
'https://metasnake.com'.startswith('https://twitter.com')

False

In [13]:
print(twit
      .filter(~col.str.starts_with('https://twitter.com/'))
     )

shape: (0, 11)
┌──────────┬───────────┬────────────┬──────┬───┬──────────┬─────────┬───────┬──────────────────────┐
│ Tweet id ┆ Tweet     ┆ Tweet text ┆ time ┆ … ┆ retweets ┆ replies ┆ likes ┆ user profile clicks  │
│ ---      ┆ permalink ┆ ---        ┆ ---  ┆   ┆ ---      ┆ ---     ┆ ---   ┆ ---                  │
│ i64      ┆ ---       ┆ str        ┆ str  ┆   ┆ f64      ┆ f64     ┆ f64   ┆ f64                  │
│          ┆ str       ┆            ┆      ┆   ┆          ┆         ┆       ┆                      │
╞══════════╪═══════════╪════════════╪══════╪═══╪══════════╪═════════╪═══════╪══════════════════════╡
└──────────┴───────────┴────────────┴──────┴───┴──────────┴─────────┴───────┴──────────────────────┘


### Extracting The Username

In [14]:
print('https://metasnake.com/effective-polars'.split('/'))

['https:', '', 'metasnake.com', 'effective-polars']


In [15]:
print(twit
      .select(col.str.split('/'))
     )

shape: (5_791, 1)
┌─────────────────────────────────┐
│ Tweet permalink                 │
│ ---                             │
│ list[str]                       │
╞═════════════════════════════════╡
│ ["https:", "", … "121258051790… │
│ ["https:", "", … "121258249482… │
│ ["https:", "", … "121261373569… │
│ ["https:", "", … "121291174961… │
│ ["https:", "", … "121292055602… │
│ …                               │
│ ["https:", "", … "147530066185… │
│ ["https:", "", … "147551814369… │
│ ["https:", "", … "147589144124… │
│ ["https:", "", … "147645381975… │
│ ["https:", "", … "147702472205… │
└─────────────────────────────────┘


In [16]:
print([m for m in dir(col.list)
       if not m.startswith('_')])

['all', 'any', 'arg_max', 'arg_min', 'concat', 'contains', 'count_matches', 'diff', 'drop_nulls', 'eval', 'explode', 'first', 'gather', 'gather_every', 'get', 'head', 'join', 'last', 'len', 'max', 'mean', 'median', 'min', 'n_unique', 'reverse', 'sample', 'set_difference', 'set_intersection', 'set_symmetric_difference', 'set_union', 'shift', 'slice', 'sort', 'std', 'sum', 'tail', 'to_array', 'to_struct', 'unique', 'var']


In [17]:
print(twit
      .select(col.str.split('/')
              .list.len())
     )

shape: (5_791, 1)
┌─────────────────┐
│ Tweet permalink │
│ ---             │
│ u32             │
╞═════════════════╡
│ 6               │
│ 6               │
│ 6               │
│ 6               │
│ 6               │
│ …               │
│ 6               │
│ 6               │
│ 6               │
│ 6               │
│ 6               │
└─────────────────┘


In [18]:
print(twit
      .select(col.str.split('/')
              .list.to_struct())
     )

shape: (5_791, 1)
┌─────────────────────────────────┐
│ Tweet permalink                 │
│ ---                             │
│ struct[6]                       │
╞═════════════════════════════════╡
│ {"https:","","twitter.com","__… │
│ {"https:","","twitter.com","__… │
│ {"https:","","twitter.com","__… │
│ {"https:","","twitter.com","__… │
│ {"https:","","twitter.com","__… │
│ …                               │
│ {"https:","","twitter.com","__… │
│ {"https:","","twitter.com","__… │
│ {"https:","","twitter.com","__… │
│ {"https:","","twitter.com","__… │
│ {"https:","","twitter.com","__… │
└─────────────────────────────────┘


In [19]:
# Converting `struct` into columns with `.unnest`
print(twit
      .select(col.str.split('/')
             .list.to_struct())
      .unnest('Tweet permalink')
     )

shape: (5_791, 6)
┌─────────┬─────────┬─────────────┬───────────────┬─────────┬─────────────────────┐
│ field_0 ┆ field_1 ┆ field_2     ┆ field_3       ┆ field_4 ┆ field_5             │
│ ---     ┆ ---     ┆ ---         ┆ ---           ┆ ---     ┆ ---                 │
│ str     ┆ str     ┆ str         ┆ str           ┆ str     ┆ str                 │
╞═════════╪═════════╪═════════════╪═══════════════╪═════════╪═════════════════════╡
│ https:  ┆         ┆ twitter.com ┆ __mharrison__ ┆ status  ┆ 1212580517905780737 │
│ https:  ┆         ┆ twitter.com ┆ __mharrison__ ┆ status  ┆ 1212582494828036097 │
│ https:  ┆         ┆ twitter.com ┆ __mharrison__ ┆ status  ┆ 1212613735698690049 │
│ https:  ┆         ┆ twitter.com ┆ __mharrison__ ┆ status  ┆ 1212911749617242113 │
│ https:  ┆         ┆ twitter.com ┆ __mharrison__ ┆ status  ┆ 1212920556028252160 │
│ …       ┆ …       ┆ …           ┆ …             ┆ …       ┆ …                   │
│ https:  ┆         ┆ twitter.com ┆ __mharrison__ ┆ status

In [20]:
print(twit
      .select(col.str.split('/')
              .list.to_struct())
      .unnest('Tweet permalink')
      .to_struct()
     )

shape: (5_791,)
Series: '' [struct[6]]
[
	{"https:","","twitter.com","__mharrison__","status","1212580517905780737"}
	{"https:","","twitter.com","__mharrison__","status","1212582494828036097"}
	{"https:","","twitter.com","__mharrison__","status","1212613735698690049"}
	{"https:","","twitter.com","__mharrison__","status","1212911749617242113"}
	{"https:","","twitter.com","__mharrison__","status","1212920556028252160"}
	…
	{"https:","","twitter.com","__mharrison__","status","1475300661851934721"}
	{"https:","","twitter.com","__mharrison__","status","1475518143690801156"}
	{"https:","","twitter.com","__mharrison__","status","1475891441243025408"}
	{"https:","","twitter.com","__mharrison__","status","1476453819751878656"}
	{"https:","","twitter.com","__mharrison__","status","1477024722051158018"}
]


In [21]:
print(twit
      .select(col.str.split('/')
              .list.join('/')
             )
     )

shape: (5_791, 1)
┌─────────────────────────────────┐
│ Tweet permalink                 │
│ ---                             │
│ str                             │
╞═════════════════════════════════╡
│ https://twitter.com/__mharriso… │
│ https://twitter.com/__mharriso… │
│ https://twitter.com/__mharriso… │
│ https://twitter.com/__mharriso… │
│ https://twitter.com/__mharriso… │
│ …                               │
│ https://twitter.com/__mharriso… │
│ https://twitter.com/__mharriso… │
│ https://twitter.com/__mharriso… │
│ https://twitter.com/__mharriso… │
│ https://twitter.com/__mharriso… │
└─────────────────────────────────┘


In [22]:
# Split the permalink on '/', convert the list to a struct, then run a
# Python-level lambda over every struct value.
# NOTE(review): the recorded output holds the field *names*
# ("field_0", "field_1", …) rather than the URL pieces. That is consistent
# with each struct reaching the lambda as a dict, where list(dict) yields the
# keys. Presumably list(elem.values()) was intended — TODO confirm.
print(twit
      .select(col.str.split('/')
              .list.to_struct())
      .select(pl.all().map_elements(lambda elem: list(elem)))          
     )

shape: (5_791, 1)
┌─────────────────────────────────┐
│ Tweet permalink                 │
│ ---                             │
│ list[str]                       │
╞═════════════════════════════════╡
│ ["field_0", "field_1", … "fiel… │
│ ["field_0", "field_1", … "fiel… │
│ ["field_0", "field_1", … "fiel… │
│ ["field_0", "field_1", … "fiel… │
│ ["field_0", "field_1", … "fiel… │
│ …                               │
│ ["field_0", "field_1", … "fiel… │
│ ["field_0", "field_1", … "fiel… │
│ ["field_0", "field_1", … "fiel… │
│ ["field_0", "field_1", … "fiel… │
│ ["field_0", "field_1", … "fiel… │
└─────────────────────────────────┘


  .select(pl.all().map_elements(lambda elem: list(elem)))


In [23]:
print(twit
      .with_columns(username=col.str.split('/')
                    .list.get(3))
     )

shape: (5_791, 12)
┌────────────┬────────────┬────────────┬────────────┬───┬─────────┬───────┬────────────┬───────────┐
│ Tweet id   ┆ Tweet      ┆ Tweet text ┆ time       ┆ … ┆ replies ┆ likes ┆ user       ┆ username  │
│ ---        ┆ permalink  ┆ ---        ┆ ---        ┆   ┆ ---     ┆ ---   ┆ profile    ┆ ---       │
│ i64        ┆ ---        ┆ str        ┆ str        ┆   ┆ f64     ┆ f64   ┆ clicks     ┆ str       │
│            ┆ str        ┆            ┆            ┆   ┆         ┆       ┆ ---        ┆           │
│            ┆            ┆            ┆            ┆   ┆         ┆       ┆ f64        ┆           │
╞════════════╪════════════╪════════════╪════════════╪═══╪═════════╪═══════╪════════════╪═══════════╡
│ 1212580517 ┆ https://tw ┆ Sounds     ┆ 2020-01-02 ┆ … ┆ 0.0     ┆ 3.0   ┆ 3.0        ┆ __mharris │
│ 905780737  ┆ itter.com/ ┆ like a     ┆ 03:44:00+0 ┆   ┆         ┆       ┆            ┆ on__      │
│            ┆ __mharriso ┆ great      ┆ 0:00       ┆   ┆         ┆     

In [24]:
print(twit
      .with_columns(username=col.str.split('/')
                    .list[3])
     )

shape: (5_791, 12)
┌────────────┬────────────┬────────────┬────────────┬───┬─────────┬───────┬────────────┬───────────┐
│ Tweet id   ┆ Tweet      ┆ Tweet text ┆ time       ┆ … ┆ replies ┆ likes ┆ user       ┆ username  │
│ ---        ┆ permalink  ┆ ---        ┆ ---        ┆   ┆ ---     ┆ ---   ┆ profile    ┆ ---       │
│ i64        ┆ ---        ┆ str        ┆ str        ┆   ┆ f64     ┆ f64   ┆ clicks     ┆ str       │
│            ┆ str        ┆            ┆            ┆   ┆         ┆       ┆ ---        ┆           │
│            ┆            ┆            ┆            ┆   ┆         ┆       ┆ f64        ┆           │
╞════════════╪════════════╪════════════╪════════════╪═══╪═════════╪═══════╪════════════╪═══════════╡
│ 1212580517 ┆ https://tw ┆ Sounds     ┆ 2020-01-02 ┆ … ┆ 0.0     ┆ 3.0   ┆ 3.0        ┆ __mharris │
│ 905780737  ┆ itter.com/ ┆ like a     ┆ 03:44:00+0 ┆   ┆         ┆       ┆            ┆ on__      │
│            ┆ __mharriso ┆ great      ┆ 0:00       ┆   ┆         ┆     

### Extract The Username with a Regular Expression

In [25]:
regex = r'^https:\/\/twitter\.com\/([a-zA-Z0-9_]+)\/status\/(\d+)$'

In [26]:
print(twit
      .select(user=col.str.extract(regex, group_index=1))
     )

shape: (5_791, 1)
┌───────────────┐
│ user          │
│ ---           │
│ str           │
╞═══════════════╡
│ __mharrison__ │
│ __mharrison__ │
│ __mharrison__ │
│ __mharrison__ │
│ __mharrison__ │
│ …             │
│ __mharrison__ │
│ __mharrison__ │
│ __mharrison__ │
│ __mharrison__ │
│ __mharrison__ │
└───────────────┘


### Counting Words & Mentions

In [27]:
tweet_col = pl.col('Tweet text')

In [28]:
print(twit
      .select(tweet_col.str.split(' '))
     )

shape: (5_791, 1)
┌─────────────────────────────────┐
│ Tweet text                      │
│ ---                             │
│ list[str]                       │
╞═════════════════════════════════╡
│ ["Sounds", "like", … "https://… │
│ ["@FogleBird", "Looks", … "🏠"] │
│ ["@afilina", "That's", … "🤔"]  │
│ ["@randal_olson", "I", … "🐍"]  │
│ ["@AlSweigart", "Sometimes", …… │
│ …                               │
│ ["@allison_horst", "That's", "… │
│ ["@willmcgugan", "You", … "scr… │
│ ["@posco", "Visiting", …        │
│ "🌴😉"]                         │
│ ["@johndsaunders", "My", … "th… │
│ ["@tunguz", "Xgboost"]          │
└─────────────────────────────────┘


In [29]:
print(twit
      .select(tweet_col.str.split(' ').list.len())
     )

shape: (5_791, 1)
┌────────────┐
│ Tweet text │
│ ---        │
│ u32        │
╞════════════╡
│ 6          │
│ 9          │
│ 33         │
│ 28         │
│ 18         │
│ …          │
│ 3          │
│ 24         │
│ 35         │
│ 6          │
│ 2          │
└────────────┘


In [30]:
print(twit
      .with_columns(word_count=tweet_col.str.split(' ').list.len())
     )

shape: (5_791, 12)
┌────────────┬────────────┬────────────┬────────────┬───┬─────────┬───────┬────────────┬───────────┐
│ Tweet id   ┆ Tweet      ┆ Tweet text ┆ time       ┆ … ┆ replies ┆ likes ┆ user       ┆ word_coun │
│ ---        ┆ permalink  ┆ ---        ┆ ---        ┆   ┆ ---     ┆ ---   ┆ profile    ┆ t         │
│ i64        ┆ ---        ┆ str        ┆ str        ┆   ┆ f64     ┆ f64   ┆ clicks     ┆ ---       │
│            ┆ str        ┆            ┆            ┆   ┆         ┆       ┆ ---        ┆ u32       │
│            ┆            ┆            ┆            ┆   ┆         ┆       ┆ f64        ┆           │
╞════════════╪════════════╪════════════╪════════════╪═══╪═════════╪═══════╪════════════╪═══════════╡
│ 1212580517 ┆ https://tw ┆ Sounds     ┆ 2020-01-02 ┆ … ┆ 0.0     ┆ 3.0   ┆ 3.0        ┆ 6         │
│ 905780737  ┆ itter.com/ ┆ like a     ┆ 03:44:00+0 ┆   ┆         ┆       ┆            ┆           │
│            ┆ __mharriso ┆ great      ┆ 0:00       ┆   ┆         ┆     

In [31]:
print(twit
      .select(tweet_col.str.split(' ').list.eval(
          pl.element().str.starts_with('@')))
     )

shape: (5_791, 1)
┌─────────────────────────┐
│ Tweet text              │
│ ---                     │
│ list[bool]              │
╞═════════════════════════╡
│ [false, false, … false] │
│ [true, false, … false]  │
│ [true, false, … false]  │
│ [false, false, … false] │
│ [true, false, … false]  │
│ …                       │
│ [false, false, false]   │
│ [true, false, … false]  │
│ [true, false, … false]  │
│ [false, false, … false] │
│ [true, false]           │
└─────────────────────────┘


In [32]:
print(twit
      .select(tweet_col.str.split(' ')
              .list.eval(pl.element().str.starts_with('@'))
              .list.sum())
     )

shape: (5_791, 1)
┌────────────┐
│ Tweet text │
│ ---        │
│ u32        │
╞════════════╡
│ 0          │
│ 1          │
│ 1          │
│ 0          │
│ 1          │
│ …          │
│ 0          │
│ 1          │
│ 1          │
│ 0          │
│ 1          │
└────────────┘


In [33]:
# Getting DataFrame with New Columns
print(twit
      .with_columns(word_count=tweet_col.str.split(' ').list.len(),
                    num_mentions=tweet_col.str.split(' ')
                    .list.eval(pl.element().str.starts_with('@'))
                    .list.sum())
     )

shape: (5_791, 13)
┌────────────┬────────────┬────────────┬───────────┬───┬───────┬───────────┬───────────┬───────────┐
│ Tweet id   ┆ Tweet      ┆ Tweet text ┆ time      ┆ … ┆ likes ┆ user      ┆ word_coun ┆ num_menti │
│ ---        ┆ permalink  ┆ ---        ┆ ---       ┆   ┆ ---   ┆ profile   ┆ t         ┆ ons       │
│ i64        ┆ ---        ┆ str        ┆ str       ┆   ┆ f64   ┆ clicks    ┆ ---       ┆ ---       │
│            ┆ str        ┆            ┆           ┆   ┆       ┆ ---       ┆ u32       ┆ u32       │
│            ┆            ┆            ┆           ┆   ┆       ┆ f64       ┆           ┆           │
╞════════════╪════════════╪════════════╪═══════════╪═══╪═══════╪═══════════╪═══════════╪═══════════╡
│ 1212580517 ┆ https://tw ┆ Sounds     ┆ 2020-01-0 ┆ … ┆ 3.0   ┆ 3.0       ┆ 6         ┆ 0         │
│ 905780737  ┆ itter.com/ ┆ like a     ┆ 2 03:44:0 ┆   ┆       ┆           ┆           ┆           │
│            ┆ __mharriso ┆ great      ┆ 0+00:00   ┆   ┆       ┆        

### Checking for Emojis

In [35]:
# Character class matching any single character outside the 7-bit ASCII
# range (\x00-\x7F). It is used below as a proxy for "contains an emoji";
# note that it matches any other non-ASCII character just as well
# (accented letters, curly quotes, …).
non_ascii = r'[^\x00-\x7F]'

# Expression for the tweet body column (same definition as in the
# "Counting Words & Mentions" section).
tweet_col = pl.col('Tweet text')

In [36]:
print(twit
      .select(tweet_col, has_emoji=tweet_col.str.contains(non_ascii))
     )

shape: (5_791, 2)
┌─────────────────────────────────┬───────────┐
│ Tweet text                      ┆ has_emoji │
│ ---                             ┆ ---       │
│ str                             ┆ bool      │
╞═════════════════════════════════╪═══════════╡
│ Sounds like a great topic! htt… ┆ false     │
│ @FogleBird Looks like SLC. I c… ┆ true      │
│ @afilina That's really amount … ┆ true      │
│ @randal_olson I use anaconda w… ┆ true      │
│ @AlSweigart Sometimes the stud… ┆ false     │
│ …                               ┆ …         │
│ @allison_horst That's awesome!  ┆ false     │
│ @willmcgugan You need to find … ┆ false     │
│ @posco Visiting Hawaii for the… ┆ true      │
│ @johndsaunders My son just bui… ┆ false     │
│ @tunguz Xgboost                 ┆ false     │
└─────────────────────────────────┴───────────┘


In [38]:
# Looking for Correlations between Emojis and Engagement Number
print(twit
      .select('engagements',
              tweet_len=tweet_col.str.split(' ').list.len(),
              has_emoji=tweet_col.str.contains(non_ascii))
      .corr()
     )

shape: (3, 3)
┌─────────────┬───────────┬───────────┐
│ engagements ┆ tweet_len ┆ has_emoji │
│ ---         ┆ ---       ┆ ---       │
│ f64         ┆ f64       ┆ f64       │
╞═════════════╪═══════════╪═══════════╡
│ 1.0         ┆ 0.057359  ┆ 0.037055  │
│ 0.057359    ┆ 1.0       ┆ 0.045215  │
│ 0.037055    ┆ 0.045215  ┆ 1.0       │
└─────────────┴───────────┴───────────┘


In [40]:
# Jitter helper
def jitter(col, amount=.5, rng=None):
    """Add uniform random noise to every element of ``col``.

    Useful for spreading out points that share the same discrete value
    before scatter-plotting them.

    Parameters
    ----------
    col
        Sized numeric sequence that supports ``+`` with a NumPy array
        (e.g. a NumPy array or a Series).
    amount : float, default 0.5
        Half-width of the noise interval; noise is drawn from
        ``[-amount, amount)``.
    rng : numpy.random.Generator, optional
        Source of randomness. Pass a seeded generator
        (``np.random.default_rng(seed)``) for reproducible output. When
        omitted, NumPy's global ``np.random`` state is used, exactly as
        before this parameter existed.

    Returns
    -------
    ``col`` plus one independent noise value per element.
    """
    # Both the ``np.random`` module and ``Generator`` expose ``uniform`` with
    # the same (low, high, size) signature, so one call serves both cases.
    sampler = np.random if rng is None else rng
    return col + sampler.uniform(-amount, amount, len(col))

In [49]:
# Scatterplotting Inference
# (twit
#  .select('engagements', 
#          has_emoji=tweet_col.str.contains(non_ascii).cast(pl.Int8))
#  .pipe(lambda df: df.with_columns(jitter(df['has_emoji'], amount=.4)))
#  .plot.scatter(x='engagements', y='has_emoji', alpha=.1)
# )

### Plotting Trends

In [50]:
# Preview the columns needed for the trend plot; `reply` flags tweets whose
# text begins with '@'.
# NOTE(review): the recorded output shows reply=false for rows whose text
# starts with "@…", which does not match this expression. The output looks
# stale (execution counts in this part of the notebook are non-sequential) —
# re-run on a fresh kernel to confirm.
print(twit
      .select('time', 'engagements', tweet_col,
              reply=tweet_col.str.starts_with('@'))
     )

shape: (5_791, 4)
┌───────────────────────────┬─────────────┬─────────────────────────────────┬───────┐
│ time                      ┆ engagements ┆ Tweet text                      ┆ reply │
│ ---                       ┆ ---         ┆ ---                             ┆ ---   │
│ str                       ┆ f64         ┆ str                             ┆ bool  │
╞═══════════════════════════╪═════════════╪═════════════════════════════════╪═══════╡
│ 2020-01-02 03:44:00+00:00 ┆ 7.0         ┆ Sounds like a great topic! htt… ┆ false │
│ 2020-01-02 03:52:00+00:00 ┆ 3.0         ┆ @FogleBird Looks like SLC. I c… ┆ false │
│ 2020-01-02 05:56:00+00:00 ┆ 6.0         ┆ @afilina That's really amount … ┆ false │
│ 2020-01-03 01:41:00+00:00 ┆ 14.0        ┆ @randal_olson I use anaconda w… ┆ false │
│ 2020-01-03 02:16:00+00:00 ┆ 1.0         ┆ @AlSweigart Sometimes the stud… ┆ false │
│ …                         ┆ …           ┆ …                               ┆ …     │
│ 2021-12-27 03:01:00+00:00 ┆ 1.0   

In [52]:
# Parse `time` from string to datetime, flag replies, then pivot so that each
# timestamp has one summed-engagement column per reply flag.
print(
    twit
    .select(
        pl.col('time').str.to_datetime('%Y-%m-%d %H:%M:%S%z'),
        'engagements',
        reply=tweet_col.str.starts_with('@'),
    )
    .pivot(
        index='time',
        on='reply',
        values='engagements',
        aggregate_function='sum',
    )
    .rename({'false': 'original', 'true': 'reply'})
)

shape: (5_413, 3)
┌─────────────────────────┬──────────┬───────┐
│ time                    ┆ original ┆ reply │
│ ---                     ┆ ---      ┆ ---   │
│ datetime[μs, UTC]       ┆ f64      ┆ f64   │
╞═════════════════════════╪══════════╪═══════╡
│ 2020-01-02 03:44:00 UTC ┆ 7.0      ┆ null  │
│ 2020-01-02 03:52:00 UTC ┆ 3.0      ┆ null  │
│ 2020-01-02 05:56:00 UTC ┆ 6.0      ┆ null  │
│ 2020-01-03 01:41:00 UTC ┆ 14.0     ┆ null  │
│ 2020-01-03 02:16:00 UTC ┆ 1.0      ┆ null  │
│ …                       ┆ …        ┆ …     │
│ 2021-12-27 03:01:00 UTC ┆ 1.0      ┆ null  │
│ 2021-12-27 17:25:00 UTC ┆ 7.0      ┆ null  │
│ 2021-12-28 18:08:00 UTC ┆ 12.0     ┆ null  │
│ 2021-12-30 07:23:00 UTC ┆ 8.0      ┆ null  │
│ 2021-12-31 21:11:00 UTC ┆ 63.0     ┆ null  │
└─────────────────────────┴──────────┴───────┘


In [57]:
# (twit
#  .select(pl.col('time').str.to_datetime('%Y-%m-%d %H:%M:%S%z'), 'engagements', reply=tweet_col.str.starts_with('@'))
#  .pivot(index='time', on='reply', values='engagements', aggregate_function='sum')
#  .rename({'false': 'original', 'true': 'reply'})
#  .plot.line(x='time', y=['original', 'reply'])
# )

In [67]:
# Weekly mean engagements for original tweets vs. replies, drawn as a line chart.
(twit
 # Parse the timestamp string and flag tweets whose text begins with '@'.
 .select(pl.col('time').str.to_datetime('%Y-%m-%d %H:%M:%S%z'), 'engagements', reply=tweet_col.str.starts_with('@'))
 # One row per timestamp; the boolean flag becomes the 'true' / 'false'
 # columns holding summed engagements.
 .pivot(index='time', on='reply', values='engagements', aggregate_function='sum')
 # Mark `time` as sorted (no sort is performed here), then bucket into 1-week windows.
 .set_sorted('time').group_by_dynamic('time', every='1w')
 # Average each flag column within every window.
 .agg(pl.col('true', 'false').mean())
 .rename({'false': 'original', 'true': 'reply'})
 # Back to long form: one (time, type, engagements) row per series point.
 .unpivot(index='time', on=['original', 'reply'], variable_name='type', value_name='engagements')
 .plot.line(x='time', y='engagements', color='type')
 .properties(title='Engagements by Reply Type')
)