# String Manipulation

### Loading Libraries

In [1]:
# ZipFiles
import zipfile

# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd
import polars as pl
import polars.selectors as cs

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# XGBoost
import xgboost as xgb

### Loading Data: `Twitter Data`

In [2]:
# CSV of Twitter data hosted in the mattharrison/datasets GitHub repository.
# Adjacent string literals inside the parentheses are concatenated by Python.
url = (
    'https://github.com/mattharrison/datasets/raw/'
    'master/data/__mharrison__2020-2021.csv'
)

In [3]:
# Read the CSV at `url` with Polars; `raw` is kept as the untouched source frame.
raw = pl.read_csv(url)

In [4]:
# Plain-text preview of the raw frame (shape, column names and dtypes).
print(raw)

shape: (5_791, 40)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ Tweet id  ┆ Tweet     ┆ Tweet     ┆ time      ┆ … ┆ promoted  ┆ promoted  ┆ promoted  ┆ promoted │
│ ---       ┆ permalink ┆ text      ┆ ---       ┆   ┆ email     ┆ dial      ┆ media     ┆ media    │
│ i64       ┆ ---       ┆ ---       ┆ str       ┆   ┆ tweet     ┆ phone     ┆ views     ┆ engageme │
│           ┆ str       ┆ str       ┆           ┆   ┆ ---       ┆ ---       ┆ ---       ┆ nts      │
│           ┆           ┆           ┆           ┆   ┆ str       ┆ str       ┆ str       ┆ ---      │
│           ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ str      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 121258051 ┆ https://t ┆ Sounds    ┆ 2020-01-0 ┆ … ┆ null      ┆ null      ┆ null      ┆ null     │
│ 790578073 ┆ witter.co ┆ like a    ┆ 2 03:44:0 ┆   ┆           ┆       

#### Tweaking Tweet Function:

In [5]:
def tweak_twit(df):
    """Trim the raw Twitter export down to the columns used in this notebook.

    Parameters
    ----------
    df
        Frame-like object exposing ``.select`` (here, the Polars DataFrame
        loaded from the CSV).

    Returns
    -------
    Whatever ``df.select`` returns for the eleven retained columns: the
    identifier/permalink/text/time columns plus the engagement metrics.
    """
    keep = [
        'Tweet id', 'Tweet permalink', 'Tweet text', 'time',
        'impressions', 'engagements', 'engagement rate',
        'retweets', 'replies', 'likes', 'user profile clicks',
    ]
    return df.select(keep)

In [6]:
# Apply the column selection to the raw frame; `raw` itself is left unchanged.
twit = tweak_twit(raw)

In [7]:
# Plain-text preview of the trimmed frame.
print(twit)

shape: (5_791, 11)
┌─────────────┬─────────────┬─────────────┬─────────────┬───┬──────────┬─────────┬───────┬─────────┐
│ Tweet id    ┆ Tweet       ┆ Tweet text  ┆ time        ┆ … ┆ retweets ┆ replies ┆ likes ┆ user    │
│ ---         ┆ permalink   ┆ ---         ┆ ---         ┆   ┆ ---      ┆ ---     ┆ ---   ┆ profile │
│ i64         ┆ ---         ┆ str         ┆ str         ┆   ┆ f64      ┆ f64     ┆ f64   ┆ clicks  │
│             ┆ str         ┆             ┆             ┆   ┆          ┆         ┆       ┆ ---     │
│             ┆             ┆             ┆             ┆   ┆          ┆         ┆       ┆ f64     │
╞═════════════╪═════════════╪═════════════╪═════════════╪═══╪══════════╪═════════╪═══════╪═════════╡
│ 12125805179 ┆ https://twi ┆ Sounds like ┆ 2020-01-02  ┆ … ┆ 0.0      ┆ 0.0     ┆ 3.0   ┆ 3.0     │
│ 05780737    ┆ tter.com/__ ┆ a great     ┆ 03:44:00+00 ┆   ┆          ┆         ┆       ┆         │
│             ┆ mharriso…   ┆ topic! htt… ┆ :00         ┆   ┆          ┆

### Data Validation

Some accessor methods are already deprecated.

In [8]:
# Reusable expression referring to the 'Tweet permalink' column; the cells below build on it.
col = pl.col('Tweet permalink')

In [9]:
# Public (non-underscore) attributes exposed by the expression's `.str` namespace.
print([m for m in dir(col.str)
      if not m.startswith('_')])

['concat', 'contains', 'contains_any', 'count_matches', 'decode', 'encode', 'ends_with', 'escape_regex', 'explode', 'extract', 'extract_all', 'extract_groups', 'extract_many', 'find', 'find_many', 'head', 'join', 'json_decode', 'json_path_match', 'len_bytes', 'len_chars', 'pad_end', 'pad_start', 'replace', 'replace_all', 'replace_many', 'reverse', 'slice', 'split', 'split_exact', 'splitn', 'starts_with', 'strip_chars', 'strip_chars_end', 'strip_chars_start', 'strip_prefix', 'strip_suffix', 'strptime', 'tail', 'to_date', 'to_datetime', 'to_decimal', 'to_integer', 'to_lowercase', 'to_time', 'to_titlecase', 'to_uppercase', 'zfill']


In [11]:
# Checking overlaps: public names found on BOTH the Polars `.str` namespace
# and Python's built-in `str` (dir('') lists the attributes of a plain string).
print(sorted([m for m in
             set(dir(col.str)) & set(dir(''))
             if not m.startswith('_')]))

['encode', 'find', 'join', 'replace', 'split', 'zfill']


In [12]:
# Public names found on the Polars `.str` namespace but NOT on Python's built-in `str`.
print(sorted([m for m in
             set(dir(col.str)) - set(dir(''))
             if not m.startswith('_')]))

['concat', 'contains', 'contains_any', 'count_matches', 'decode', 'ends_with', 'escape_regex', 'explode', 'extract', 'extract_all', 'extract_groups', 'extract_many', 'find_many', 'head', 'json_decode', 'json_path_match', 'len_bytes', 'len_chars', 'pad_end', 'pad_start', 'replace_all', 'replace_many', 'reverse', 'slice', 'split_exact', 'splitn', 'starts_with', 'strip_chars', 'strip_chars_end', 'strip_chars_start', 'strip_prefix', 'strip_suffix', 'strptime', 'tail', 'to_date', 'to_datetime', 'to_decimal', 'to_integer', 'to_lowercase', 'to_time', 'to_titlecase', 'to_uppercase']


### URL `Tweet permalink` Validation

In [13]:
# Plain-Python prefix check on a non-Twitter URL; evaluates to False.
'https://metasnake.com'.startswith('https://twitter.com')

False

In [14]:
# Validation: keep only rows whose permalink does NOT start with the Twitter
# prefix (`~` negates the boolean mask). An empty result means every permalink
# has the expected prefix.
print(twit
      .filter(~col.str.starts_with('https://twitter.com/'))
     )

shape: (0, 11)
┌──────────┬───────────┬────────────┬──────┬───┬──────────┬─────────┬───────┬──────────────────────┐
│ Tweet id ┆ Tweet     ┆ Tweet text ┆ time ┆ … ┆ retweets ┆ replies ┆ likes ┆ user profile clicks  │
│ ---      ┆ permalink ┆ ---        ┆ ---  ┆   ┆ ---      ┆ ---     ┆ ---   ┆ ---                  │
│ i64      ┆ ---       ┆ str        ┆ str  ┆   ┆ f64      ┆ f64     ┆ f64   ┆ f64                  │
│          ┆ str       ┆            ┆      ┆   ┆          ┆         ┆       ┆                      │
╞══════════╪═══════════╪════════════╪══════╪═══╪══════════╪═════════╪═══════╪══════════════════════╡
└──────────┴───────────┴────────────┴──────┴───┴──────────┴─────────┴───────┴──────────────────────┘


### Extracting The Username

In [15]:
# Python's str.split on '/': the empty string in the result comes from the
# two adjacent slashes after 'https:'.
print('https://metasnake.com/effective-polars'.split('/'))

['https:', '', 'metasnake.com', 'effective-polars']


In [17]:
# Polars counterpart: split every permalink on '/', producing a list[str] column.
print(twit
      .select(col.str.split('/'))
     )

shape: (5_791, 1)
┌─────────────────────────────────┐
│ Tweet permalink                 │
│ ---                             │
│ list[str]                       │
╞═════════════════════════════════╡
│ ["https:", "", … "121258051790… │
│ ["https:", "", … "121258249482… │
│ ["https:", "", … "121261373569… │
│ ["https:", "", … "121291174961… │
│ ["https:", "", … "121292055602… │
│ …                               │
│ ["https:", "", … "147530066185… │
│ ["https:", "", … "147551814369… │
│ ["https:", "", … "147589144124… │
│ ["https:", "", … "147645381975… │
│ ["https:", "", … "147702472205… │
└─────────────────────────────────┘


In [18]:
# Public (non-underscore) attributes exposed by the expression's `.list` namespace.
print([m for m in dir(col.list)
       if not m.startswith('_')])

['all', 'any', 'arg_max', 'arg_min', 'concat', 'contains', 'count_matches', 'diff', 'drop_nulls', 'eval', 'explode', 'first', 'gather', 'gather_every', 'get', 'head', 'join', 'last', 'len', 'max', 'mean', 'median', 'min', 'n_unique', 'reverse', 'sample', 'set_difference', 'set_intersection', 'set_symmetric_difference', 'set_union', 'shift', 'slice', 'sort', 'std', 'sum', 'tail', 'to_array', 'to_struct', 'unique', 'var']


In [19]:
# Number of pieces each permalink yields when split on '/'.
print(twit
      .select(col.str.split('/')
              .list.len())
     )

shape: (5_791, 1)
┌─────────────────┐
│ Tweet permalink │
│ ---             │
│ u32             │
╞═════════════════╡
│ 6               │
│ 6               │
│ 6               │
│ 6               │
│ 6               │
│ …               │
│ 6               │
│ 6               │
│ 6               │
│ 6               │
│ 6               │
└─────────────────┘


In [20]:
# Convert each list of pieces into a struct, one field per piece.
print(twit
      .select(col.str.split('/')
              .list.to_struct())
     )

shape: (5_791, 1)
┌─────────────────────────────────┐
│ Tweet permalink                 │
│ ---                             │
│ struct[6]                       │
╞═════════════════════════════════╡
│ {"https:","","twitter.com","__… │
│ {"https:","","twitter.com","__… │
│ {"https:","","twitter.com","__… │
│ {"https:","","twitter.com","__… │
│ {"https:","","twitter.com","__… │
│ …                               │
│ {"https:","","twitter.com","__… │
│ {"https:","","twitter.com","__… │
│ {"https:","","twitter.com","__… │
│ {"https:","","twitter.com","__… │
│ {"https:","","twitter.com","__… │
└─────────────────────────────────┘


In [21]:
# Converting `struct` into columns with `.unnest`: each struct field becomes
# its own column (field_0, field_1, ...).
print(twit
      .select(col.str.split('/')
             .list.to_struct())
      .unnest('Tweet permalink')
     )

shape: (5_791, 6)
┌─────────┬─────────┬─────────────┬───────────────┬─────────┬─────────────────────┐
│ field_0 ┆ field_1 ┆ field_2     ┆ field_3       ┆ field_4 ┆ field_5             │
│ ---     ┆ ---     ┆ ---         ┆ ---           ┆ ---     ┆ ---                 │
│ str     ┆ str     ┆ str         ┆ str           ┆ str     ┆ str                 │
╞═════════╪═════════╪═════════════╪═══════════════╪═════════╪═════════════════════╡
│ https:  ┆         ┆ twitter.com ┆ __mharrison__ ┆ status  ┆ 1212580517905780737 │
│ https:  ┆         ┆ twitter.com ┆ __mharrison__ ┆ status  ┆ 1212582494828036097 │
│ https:  ┆         ┆ twitter.com ┆ __mharrison__ ┆ status  ┆ 1212613735698690049 │
│ https:  ┆         ┆ twitter.com ┆ __mharrison__ ┆ status  ┆ 1212911749617242113 │
│ https:  ┆         ┆ twitter.com ┆ __mharrison__ ┆ status  ┆ 1212920556028252160 │
│ …       ┆ …       ┆ …           ┆ …             ┆ …       ┆ …                   │
│ https:  ┆         ┆ twitter.com ┆ __mharrison__ ┆ status

In [22]:
# Reverse of the previous step: after unnesting into columns, the frame-level
# `.to_struct()` packs those columns back into a single struct Series.
print(twit
      .select(col.str.split('/')
              .list.to_struct())
      .unnest('Tweet permalink')
      .to_struct()
     )

shape: (5_791,)
Series: '' [struct[6]]
[
	{"https:","","twitter.com","__mharrison__","status","1212580517905780737"}
	{"https:","","twitter.com","__mharrison__","status","1212582494828036097"}
	{"https:","","twitter.com","__mharrison__","status","1212613735698690049"}
	{"https:","","twitter.com","__mharrison__","status","1212911749617242113"}
	{"https:","","twitter.com","__mharrison__","status","1212920556028252160"}
	…
	{"https:","","twitter.com","__mharrison__","status","1475300661851934721"}
	{"https:","","twitter.com","__mharrison__","status","1475518143690801156"}
	{"https:","","twitter.com","__mharrison__","status","1475891441243025408"}
	{"https:","","twitter.com","__mharrison__","status","1476453819751878656"}
	{"https:","","twitter.com","__mharrison__","status","1477024722051158018"}
]


In [23]:
# Split on '/' and then join the pieces with '/' again, giving back a single
# string column.
print(twit
      .select(col.str.split('/')
              .list.join('/')
             )
     )

shape: (5_791, 1)
┌─────────────────────────────────┐
│ Tweet permalink                 │
│ ---                             │
│ str                             │
╞═════════════════════════════════╡
│ https://twitter.com/__mharriso… │
│ https://twitter.com/__mharriso… │
│ https://twitter.com/__mharriso… │
│ https://twitter.com/__mharriso… │
│ https://twitter.com/__mharriso… │
│ …                               │
│ https://twitter.com/__mharriso… │
│ https://twitter.com/__mharriso… │
│ https://twitter.com/__mharriso… │
│ https://twitter.com/__mharriso… │
│ https://twitter.com/__mharriso… │
└─────────────────────────────────┘


In [24]:
# Struct -> Python list through `map_elements`.
# NOTE: each struct element reaches the lambda as a dict, so `list(elem)`
# returns the field *names* ("field_0", ...), not the URL pieces; use
# `list(elem.values())` when the values are wanted.
# `return_dtype` is stated explicitly so Polars does not have to infer the
# output type from the Python results (omitting it emits a warning).
print(twit
      .select(col.str.split('/')
              .list.to_struct())
      .select(pl.all().map_elements(lambda elem: list(elem),
                                    return_dtype=pl.List(pl.String)))
     )

shape: (5_791, 1)
┌─────────────────────────────────┐
│ Tweet permalink                 │
│ ---                             │
│ list[str]                       │
╞═════════════════════════════════╡
│ ["field_0", "field_1", … "fiel… │
│ ["field_0", "field_1", … "fiel… │
│ ["field_0", "field_1", … "fiel… │
│ ["field_0", "field_1", … "fiel… │
│ ["field_0", "field_1", … "fiel… │
│ …                               │
│ ["field_0", "field_1", … "fiel… │
│ ["field_0", "field_1", … "fiel… │
│ ["field_0", "field_1", … "fiel… │
│ ["field_0", "field_1", … "fiel… │
│ ["field_0", "field_1", … "fiel… │
└─────────────────────────────────┘


  .select(pl.all().map_elements(lambda elem: list(elem)))


In [25]:
# The username is the piece at index 3 of the split permalink
# (0: 'https:', 1: '', 2: 'twitter.com', 3: username).
print(twit
      .with_columns(username=col.str.split('/')
                    .list.get(3))
     )

shape: (5_791, 12)
┌────────────┬────────────┬────────────┬────────────┬───┬─────────┬───────┬────────────┬───────────┐
│ Tweet id   ┆ Tweet      ┆ Tweet text ┆ time       ┆ … ┆ replies ┆ likes ┆ user       ┆ username  │
│ ---        ┆ permalink  ┆ ---        ┆ ---        ┆   ┆ ---     ┆ ---   ┆ profile    ┆ ---       │
│ i64        ┆ ---        ┆ str        ┆ str        ┆   ┆ f64     ┆ f64   ┆ clicks     ┆ str       │
│            ┆ str        ┆            ┆            ┆   ┆         ┆       ┆ ---        ┆           │
│            ┆            ┆            ┆            ┆   ┆         ┆       ┆ f64        ┆           │
╞════════════╪════════════╪════════════╪════════════╪═══╪═════════╪═══════╪════════════╪═══════════╡
│ 1212580517 ┆ https://tw ┆ Sounds     ┆ 2020-01-02 ┆ … ┆ 0.0     ┆ 3.0   ┆ 3.0        ┆ __mharris │
│ 905780737  ┆ itter.com/ ┆ like a     ┆ 03:44:00+0 ┆   ┆         ┆       ┆            ┆ on__      │
│            ┆ __mharriso ┆ great      ┆ 0:00       ┆   ┆         ┆     

In [26]:
# Same extraction written with index syntax on the `.list` namespace
# instead of `.list.get(3)`.
print(twit
      .with_columns(username=col.str.split('/')
                    .list[3])
     )

shape: (5_791, 12)
┌────────────┬────────────┬────────────┬────────────┬───┬─────────┬───────┬────────────┬───────────┐
│ Tweet id   ┆ Tweet      ┆ Tweet text ┆ time       ┆ … ┆ replies ┆ likes ┆ user       ┆ username  │
│ ---        ┆ permalink  ┆ ---        ┆ ---        ┆   ┆ ---     ┆ ---   ┆ profile    ┆ ---       │
│ i64        ┆ ---        ┆ str        ┆ str        ┆   ┆ f64     ┆ f64   ┆ clicks     ┆ str       │
│            ┆ str        ┆            ┆            ┆   ┆         ┆       ┆ ---        ┆           │
│            ┆            ┆            ┆            ┆   ┆         ┆       ┆ f64        ┆           │
╞════════════╪════════════╪════════════╪════════════╪═══╪═════════╪═══════╪════════════╪═══════════╡
│ 1212580517 ┆ https://tw ┆ Sounds     ┆ 2020-01-02 ┆ … ┆ 0.0     ┆ 3.0   ┆ 3.0        ┆ __mharris │
│ 905780737  ┆ itter.com/ ┆ like a     ┆ 03:44:00+0 ┆   ┆         ┆       ┆            ┆ on__      │
│            ┆ __mharriso ┆ great      ┆ 0:00       ┆   ┆         ┆     

### Extract The Username with a Regular Expression

In [27]:
# Anchored pattern for a tweet permalink, assembled from adjacent raw-string
# pieces (Python concatenates them into one pattern).
regex = (
    r'^https:\/\/twitter\.com\/'  # fixed scheme and host prefix
    r'([a-zA-Z0-9_]+)'            # group 1: username
    r'\/status\/'                 # literal path segment
    r'(\d+)$'                     # group 2: numeric status id, up to end of string
)

In [28]:
# Pull capture group 1 of `regex` (the username) out of each permalink.
print(twit
      .select(user=col.str.extract(regex, group_index=1))
     )

shape: (5_791, 1)
┌───────────────┐
│ user          │
│ ---           │
│ str           │
╞═══════════════╡
│ __mharrison__ │
│ __mharrison__ │
│ __mharrison__ │
│ __mharrison__ │
│ __mharrison__ │
│ …             │
│ __mharrison__ │
│ __mharrison__ │
│ __mharrison__ │
│ __mharrison__ │
│ __mharrison__ │
└───────────────┘
