In [None]:
import re

# 1. String variables ----
# A string literal can be delimited by
# double quotes ""
# single quotes ''

text = "String"
text_nested = 'Single quotation mark with "nested" double quotation'

# How to include quotation marks inside a string

# a) nesting: wrap the quote in the other kind of quotation mark
one_quote_single = "'"
one_quote_double = '"'

# b) using backslash: escape the quote so it does not end the literal.
# Both forms produce exactly the same one-character strings as in a).
one_quote_single = '\''
one_quote_double = "\""

# print() shows the actual characters of a string (escape sequences are
# already resolved); repr() shows the escaped, source-code form instead.
print(one_quote_single)
print(one_quote_double)

# When a backslash is part of a text, either write a second backslash
# in front of it or use a raw string (r"..."), which turns off escape
# processing. Both literals below hold the same text.
text_backslash1 = r"Short \ long"  # raw string
text_backslash2 = "Short \\ long"   # escaped backslash

print(text_backslash1)
print(text_backslash2)

# 2. List of strings (equivalent to R's character vector) ----

# Printing a list shows the repr() of each element, so the quote and the
# backslash appear in their escaped, source-code form.
character_list = ["\" ", " \\ "]
print(character_list)

# Most common special characters:
# "\n"   newline
# "\t"   tab
# "\uXXXX"   unicode character given by four hex digits

# Because the string is printed as a list element, the tab and newline are
# displayed as the escapes \t and \n rather than as actual whitespace.
character_list2 = ["separation\ttabulation and jump\ninto a newline"]
print(character_list2)

# Unicode characters examples:
# U+00B6 PILCROW SIGN, U+00B5 MICRO SIGN,
# U+00BB RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
character_list3 = ["\u00b6", "\u00b5", "\u00bb"]
print(character_list3)

# Pictorial symbol examples:
# U+266C BEAMED SIXTEENTH NOTES, U+2605 BLACK STAR, U+260F WHITE TELEPHONE
character_list4 = ["\u266c", "\u2605", "\u260f"]
print(character_list4)

# 3. Regular expressions - basic examples ----

# Sample words that the pattern demonstrations in this section search.
expression = ["international", "associations", "intra-organisational",
              "foundations", "technical", "institutions"]

# Rough stand-in for R's str_view: lists the strings that match a pattern
def str_view(strings, pattern):
    """Print *pattern*, then each string in *strings* that contains a match.

    A blank line and ``Pattern: <pattern>`` are printed first. Every string
    for which ``re.search`` finds the pattern anywhere is then reported on
    its own ``Match in: <string>`` line, in input order. Strings without a
    match produce no output. Nothing is returned.
    """
    print(f"\nPattern: {pattern}")
    # Lazy filter: each string is searched right before it would be printed.
    matching = (candidate for candidate in strings
                if re.search(pattern, candidate))
    for candidate in matching:
        print(f"Match in: {candidate}")

# Simple pattern matching: a plain pattern matches anywhere inside a string
for literal_pattern in ("national", "al", "int", "tions"):
    str_view(expression, literal_pattern)

# Dot matches any character except newline
for dot_pattern in (".a.", "t.", ".o"):
    str_view(expression, dot_pattern)

# Escaped metacharacters: a backslash in front of a special character
# makes the pattern match that character literally.
expression_short = ["int.", "assoc.", "intra-org.", "found.", "tech.", "instit."]
text_backslash = "Short \\ long"
for escaped_input, escaped_pattern in (
    (expression_short, r"\."),   # literal dot
    ([text_backslash], r"\\"),   # literal backslash
):
    str_view(escaped_input, escaped_pattern)

# Anchors:
# ^ matches start of string
# $ matches end of string
for anchored_pattern in ("^i", "^f", "al$", "tions$"):
    str_view(expression, anchored_pattern)

# Unanchored "donut" matches wherever it occurs; "^donut$" only matches
# when the whole string is exactly "donut".
cake = ["donut", "custard donut", "donut with plum", "pudding donut"]
for cake_pattern in ("donut", "^donut$"):
    str_view(cake, cake_pattern)

samples = ["abc", "a.c", "a*c", "a c", "a8c"]

# Character classes and alternation
for class_pattern in (
    r".[*]c",       # [*]   a literal asterisk
    r".[^*]c",      # [^*]  any single character except an asterisk
    r"a[.]c",       # [.]   inside brackets a dot is literal
    r".(\.|\d)c",   # (x|y) either a literal dot or a digit
):
    str_view(samples, class_pattern)

str_view(["grey", "gray"], "gr(e|a)y")

# \d matches any digit
# \s matches any whitespace
for shorthand_pattern in (r"\d", r"\s", r".(\.|8)c", r".(\d|\s)c"):
    str_view(samples, shorthand_pattern)

# Exercises

# Exercise 1
print("\nExercise 1:")
vector = ["emoticon", ":)", "symbol", "$^$"]
for exercise1_pattern in (
    "^.o.$",      # a) string of 3 characters with letter o in middle
    "emoticon",   # b) expression "emoticon"
    r":\)",       # c) expression ":)" - the parenthesis must be escaped
    r"\$\^\$",    # d) expression "$^$" - both anchors must be escaped
):
    str_view(vector, exercise1_pattern)

# Exercise 2
# For this exercise, we'll create a small word corpus
words = ["yes", "yesterday", "wholesale", "woman", "fox", "box", "index"]
print("\nExercise 2:")
# a) words containing "yes"
str_view(words, "yes")
# b) words starting with "w"
str_view(words, "^w")
# c) words ending with "x"
str_view(words, "x$")


# Exercise 3
# Reuses the small corpus defined for Exercise 2.
print("\nExercise 3:")
# a) words starting with a vowel
str_view(words, "^[aeiou]")
# b) words starting with a consonant
#    (strictly: anything that is not a lowercase vowel, which is
#    equivalent for this all-lowercase corpus)
str_view(words, "^[^aeiou]")
# c) words ending with "ing" or "ise"
#    (no word in this small corpus qualifies, so only the pattern prints)
str_view(words, "(ing|ise)$")
# d) words ending with "ed" but not "eed"
#    "(^|[^e])" requires that "ed" is preceded by the start of the word or
#    by any character other than "e"; again nothing in this corpus matches
str_view(words, "(^|[^e])ed$")
