In [1]:
import pandas as pd
import re

# Load the 'babynames.csv' data (assuming it's structured similarly to the R babynames dataset)
babynames = pd.read_csv('data/babynames.csv')

# Various string operations
string1 = "This is a string"
string2 = 'If I want to include a "quote" inside a string, I use single quotes'
double_quote = "\""
single_quote = "'"
backslash = "\\"

x = [single_quote, double_quote, backslash]
print(x)

# The str_view function doesn't have a direct Python counterpart; this is a visualization function in R.
# Similarly, many of the next str_view calls are specific to R's string visualization.

tricky = r'''double_quote = "\"" # or '"'
single_quote = '\'' # or "'" '''
# Visualization skipped

tricky = r'''(double_quote = "\"" # or '"'
single_quote = '\'' # or "'")'''
# Visualization skipped

x = ["one\ntwo", "one\ttwo", "µ", "😄"]
print(x)
# Visualization skipped

x = "This\u00a0is\u00a0tricky"
print(x)

# String concatenation
print("x" + "y")
print("".join(["x", "y", "z"]))
print(["Hello " + name for name in ["John", "Susan"]])

df = pd.DataFrame({'name': ["Flora", "David", "Terra", None]})
df['greeting'] = "Hi " + df['name'].fillna('you') + "!"
df['greeting1'] = "Hi " + df['name'].fillna('you') + "!"
df['greeting2'] = df['name'].apply(lambda x: f"Hi {x}!" if pd.notnull(x) else "Hi!")

# More string operations
# In Python, we don't have str_glue like in R; format or f-strings are common ways.
df['greeting'] = df['name'].apply(lambda x: f"Hi {x}!")
df['greeting'] = df['name'].apply(lambda x: f"{{Hi {x}!}}")

print("".join(["x", "y", "z"]))
print(", ".join(["x", "y", "z"]))
print(", ".join(["x", "y"]) + ", and " + "z")

df = pd.DataFrame({
    'name': ['Carmen', 'Carmen', 'Marvin', 'Terence', 'Terence', 'Terence'],
    'fruit': ['banana', 'apple', 'nectarine', 'cantaloupe', 'papaya', 'mandarin']
})
grouped = df.groupby('name').agg(lambda x: ", ".join(x))

print("hi " + str(None)) # This will raise a TypeError in Python
food = 'apple'
price = '$2.99'
print("The price of " + food + " is " + price)
print(f"The price of {food} is {price}")

# Data manipulation
df1 = pd.DataFrame({'x': ["a,b,c", "d,e", "f"]})
df1 = df1['x'].str.split(',', expand=True)

df2 = pd.DataFrame({'x': ["1211", "131", "21"]})
df2 = df2['x'].apply(list).apply(pd.Series)

df3 = pd.DataFrame({'x': ["a10.1.2022", "b10.2.2011", "e15.1.2015"]})
df3[['code', 'edition', 'year']] = df3['x'].str.split('.', expand=True)

["'", '"', '\\']
['one\ntwo', 'one\ttwo', 'µ', '😄']
This is tricky
xy
xyz
['Hello John', 'Hello Susan']
xyz
x, y, z
x, y, and z
hi None
The price of apple is $2.99
The price of apple is $2.99


In [3]:
import pandas as pd

# 1. Separate a column into multiple columns using given positions
df4 = pd.DataFrame({'x': ["202215TX", "202122LA", "202325CA"]})
df4[['year', 'age', 'state']] = df4['x'].apply(lambda s: pd.Series([s[:4], s[4:6], s[6:]]))

# 2. Separate a column into multiple columns using a delimiter
df = pd.DataFrame({'x': ["1-1-1", "1-1-2", "1-3", "1-3-2", "1"]})
df[['x', 'y', 'z']] = df['x'].str.split('-', expand=True)

# Checking the debug view and filter non-ok values (In Python, we don't have `too_few` option directly, but can mimic it)
debug = df.copy()
debug['x_ok'] = ~debug['x'].isna()
print(debug[~debug['x_ok']])

# Handle missing values by aligning at the start
df.fillna('', inplace=True)

# Separate and handle too many delimiters
df = pd.DataFrame({'x': ["1-1-1", "1-1-2", "1-3-5-6", "1-3-2", "1-3-5-7-9"]})
df[['x', 'y', 'z']] = df['x'].str.split('-', n=2, expand=True)

# Skipping rlang::last_trace() as it's R-specific

# Check too_many debug view in Python
debug = df.copy()
debug['x_ok'] = debug['x'].apply(lambda x: len(x.split('-')) <= 3)
print(debug[~debug['x_ok']])

# Handle cases with too many delimiters by dropping extra values
# This is already achieved by the earlier split with `n=2` parameter

# Handle cases with too many delimiters by merging extra values
# This is specific and might require custom logic based on exact needs

# String operations
print(len("a"))
print(len("R for data science"))
print(pd.isnull(None))

print(babynames.groupby(babynames['name'].str.len())['n'].sum())
print(babynames[babynames['name'].str.len() == 15].groupby('name')['n'].sum().sort_values(ascending=False))

x = ["Apple", "Banana", "Pear"]
print([s[:3] for s in x])
print([s[-3:] for s in x])
print("a"[:5])

babynames['first'] = babynames['name'].str[0]
babynames['last'] = babynames['name'].str[-1]

print(list("Hadley".encode()))
x1 = "text\nEl Niño was particularly bad this year"
# Skipping read_csv(x1)$text as the content is a string and not a file in Python

x2 = "text\n\x82\xb1\x82\xf1\x82\xc9\x82\xbf\x82\xcd"
# Skipping read_csv(x2)$text similarly

# For handling different encodings, we would need more context about the encoding of x1 and x2

u = ["ü", "ü"]
print(u)
print([len(s) for s in u])
print([s[0] for s in u])
print(u[0] == u[1])

# Skipping str_equal as Python strings are inherently unicode and equal comparison works out of the box

print("i".upper())
print("ı".upper())  # Python doesn't have direct locale support for string upper, might need external libraries


Empty DataFrame
Columns: [x, y, z, x_ok]
Index: []
Empty DataFrame
Columns: [x, y, z, x_ok]
Index: []
1
18
True
name
2       338150
3      8589596
4     48506739
5     87011607
6     90749404
7     72120767
8     25404066
9     11926551
10     1306159
11     2135827
12       16295
13       10845
14        3681
15         830
Name: n, dtype: int64
name
Franciscojavier    123
Christopherjohn    118
Johnchristopher    118
Christopherjame    108
Christophermich     52
Ryanchristopher     45
Mariadelosangel     28
Jonathanmichael     25
Christianjoseph     22
Christopherjose     22
Mariadelrosario     16
Christiananthon     12
Matthewalexande     11
Christianmichae     11
Christopherryan     11
Seanchristopher     10
Ashleyelizabeth      8
Christiandaniel      7
Gabrielalexande      6
Markchristopher      6
Christopheranth      6
Laurenelizabeth      5
Michaelchristop      5
Christianjoshua      5
Christopherdavi      5
Christianalexan      5
Hannahelizabeth      5
Joshuaalexander      5
Jo