# String Manipulation

### Loading Libraries

In [1]:
# ZipFiles
import zipfile

# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd
import polars as pl
import polars.selectors as cs

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# XGBoost
import xgboost as xgb

### Loading Data: `Twitter Data`

In [2]:
# Remote location of the Twitter data CSV (hosted on GitHub).
url = (
    'https://github.com/mattharrison/datasets/raw/'
    'master/data/__mharrison__2020-2021.csv'
)

In [3]:
# Read the CSV at `url` with polars; `raw` is kept as the untouched source frame.
raw = pl.read_csv(url)

In [4]:
# Show the plain-text repr of the raw frame (shape plus a truncated preview).
print(raw)

shape: (5_791, 40)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ Tweet id  ┆ Tweet     ┆ Tweet     ┆ time      ┆ … ┆ promoted  ┆ promoted  ┆ promoted  ┆ promoted │
│ ---       ┆ permalink ┆ text      ┆ ---       ┆   ┆ email     ┆ dial      ┆ media     ┆ media    │
│ i64       ┆ ---       ┆ ---       ┆ str       ┆   ┆ tweet     ┆ phone     ┆ views     ┆ engageme │
│           ┆ str       ┆ str       ┆           ┆   ┆ ---       ┆ ---       ┆ ---       ┆ nts      │
│           ┆           ┆           ┆           ┆   ┆ str       ┆ str       ┆ str       ┆ ---      │
│           ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ str      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 121258051 ┆ https://t ┆ Sounds    ┆ 2020-01-0 ┆ … ┆ null      ┆ null      ┆ null      ┆ null     │
│ 790578073 ┆ witter.co ┆ like a    ┆ 2 03:44:0 ┆   ┆           ┆       

#### Tweaking Tweet Function:

In [5]:
def tweak_twit(df):
    """Return `df` narrowed to the tweet and engagement columns of interest.

    Parameters
    ----------
    df
        Frame-like object exposing a ``select`` method that accepts a list
        of column names (here: the raw polars frame).

    Returns
    -------
    Whatever ``df.select`` returns for the eleven columns listed below,
    in the order given.
    """
    # Columns describing the tweet itself.
    tweet_columns = ['Tweet id', 'Tweet permalink', 'Tweet text', 'time']
    # Columns measuring how the tweet performed.
    metric_columns = ['impressions', 'engagements', 'engagement rate',
                      'retweets', 'replies', 'likes', 'user profile clicks']
    return df.select(tweet_columns + metric_columns)

In [6]:
# Build the trimmed frame from `raw` under a new name, leaving `raw` unchanged.
twit = tweak_twit(raw)

In [7]:
# Show the plain-text repr of the trimmed frame to confirm the selected columns.
print(twit)

shape: (5_791, 11)
┌─────────────┬─────────────┬─────────────┬─────────────┬───┬──────────┬─────────┬───────┬─────────┐
│ Tweet id    ┆ Tweet       ┆ Tweet text  ┆ time        ┆ … ┆ retweets ┆ replies ┆ likes ┆ user    │
│ ---         ┆ permalink   ┆ ---         ┆ ---         ┆   ┆ ---      ┆ ---     ┆ ---   ┆ profile │
│ i64         ┆ ---         ┆ str         ┆ str         ┆   ┆ f64      ┆ f64     ┆ f64   ┆ clicks  │
│             ┆ str         ┆             ┆             ┆   ┆          ┆         ┆       ┆ ---     │
│             ┆             ┆             ┆             ┆   ┆          ┆         ┆       ┆ f64     │
╞═════════════╪═════════════╪═════════════╪═════════════╪═══╪══════════╪═════════╪═══════╪═════════╡
│ 12125805179 ┆ https://twi ┆ Sounds like ┆ 2020-01-02  ┆ … ┆ 0.0      ┆ 0.0     ┆ 3.0   ┆ 3.0     │
│ 05780737    ┆ tter.com/__ ┆ a great     ┆ 03:44:00+00 ┆   ┆          ┆         ┆       ┆         │
│             ┆ mharriso…   ┆ topic! htt… ┆ :00         ┆   ┆          ┆

### Data Validation

Some accessor methods are already deprecated.

In [8]:
# Column expression for 'Tweet permalink'; its `.str` namespace is inspected below.
col = pl.col('Tweet permalink')

In [9]:
# List every public (non-underscore) attribute of the expression's `.str` namespace.
print([
    name
    for name in dir(col.str)
    if not name.startswith('_')
])

['concat', 'contains', 'contains_any', 'count_matches', 'decode', 'encode', 'ends_with', 'escape_regex', 'explode', 'extract', 'extract_all', 'extract_groups', 'extract_many', 'find', 'find_many', 'head', 'join', 'json_decode', 'json_path_match', 'len_bytes', 'len_chars', 'pad_end', 'pad_start', 'replace', 'replace_all', 'replace_many', 'reverse', 'slice', 'split', 'split_exact', 'splitn', 'starts_with', 'strip_chars', 'strip_chars_end', 'strip_chars_start', 'strip_prefix', 'strip_suffix', 'strptime', 'tail', 'to_date', 'to_datetime', 'to_decimal', 'to_integer', 'to_lowercase', 'to_time', 'to_titlecase', 'to_uppercase', 'zfill']


In [11]:
# Checking overlaps
print(sorted([m for m in
             set(dir(col.str)) & set(dir(''))
             if not m.startswith('_')]))

['encode', 'find', 'join', 'replace', 'split', 'zfill']
