# Strings

https://docs.python.org/3/library/string.html#formatspec


In [None]:
'Here is a string'

'Here is a string'

In [None]:
"Here is another string"


'Here is another string'

In [None]:
"This is Joe's"

"This is Joe's"

In [None]:
"The 'best' part of waking up is coffee."

"The 'best' part of waking up is coffee."

In [None]:
'The \'best\' part of waking up is coffee.'

"The 'best' part of waking up is coffee."

In [None]:
"""The "best" part of waking up is coffee."""

'The "best" part of waking up is coffee.'

In [None]:
"The \"best\" part of waking up is coffee."

'The "best" part of waking up is coffee.'

## Remove whitespace
* leading or trailing  

In [None]:
S = ' Joe Smith  M '
S

' Joe Smith  M '

In [None]:
S.strip()

'Joe Smith  M'

In [None]:
S.lstrip()

'Joe Smith  M '

In [None]:
S.rstrip()

' Joe Smith  M'

In [None]:
S = '\t Joe \n Smith'
S

'\t Joe \n Smith'

In [None]:
S.strip()

'Joe \n Smith'

In [None]:
T = S.split()
T

['Joe', 'Smith']

In [None]:
''.join(T)

'JoeSmith'

In [None]:
T[0]+" "+T[1]

'Joe Smith'

In [None]:
S = 'The big red dog bites.'

In [None]:
U=S.upper()
U

'THE BIG RED DOG BITES.'

In [None]:
T = S.lower()
T

'the big red dog bites.'

In [None]:
T.capitalize()

'The big red dog bites.'

In [None]:
U.title()

'The Big Red Dog Bites.'

In [None]:
S = 'I like to eat apples and bananas.'

In [None]:
S.replace('a','e')

'I like to eet epples end benenes.'

In [None]:
T = S.split()
T.reverse()
T

['bananas.', 'and', 'apples', 'eat', 'to', 'like', 'I']

In [None]:
S = 'I like to eat apples and bananas.  I like to share them with my grandma.  She likes to share them with her dog.'

In [None]:
S.index('ea')

10

In [None]:
S[10]+S[11]

'ea'

In [None]:
S[10:12]

'ea'

In [None]:
ord('a')

97

In [None]:
chr(97)

'a'

In [None]:
import random  # NOTE(review): no longer used in this cell; kept in case other cells rely on it
import secrets

# Build a 20-character password from the printable, non-space ASCII range
# (code points 33 through 126, i.e. '!' through '~').
PASSWORD_LENGTH = 20
PASSWORD_ALPHABET = [chr(code) for code in range(33, 127)]

# `secrets` is the standard-library module intended for security-sensitive
# randomness; the `random` module is documented as unsuitable for that use.
# `password` stays a list of single characters; the joined string is displayed.
password = [secrets.choice(PASSWORD_ALPHABET) for _ in range(PASSWORD_LENGTH)]

''.join(password)



'$X{JZ>{fA_V#ZPnlKK0e'

In [None]:
S.count('to',20, 53)

1

In [None]:
S.startswith('I like')

True

In [None]:
S.endswith('me')

False

### String Comparison

In [None]:
# Strings are compared character by character using their code points, so the
# first characters decide the result here: ord('a') is 97 and ord('A') is 65.
'apple'<'Apple'  
# 97 < 65 ?  -> False: lowercase 'a' sorts after uppercase 'A'.

False

## Formatting Strings

In [None]:
pi = 3.141592

In [None]:
print(f'{pi:.2f}')

3.14


In [None]:
print(f'{pi:<15f}')

3.141592       


In [None]:
print(f'{pi:>15f}')

       3.141592


In [None]:
print(f'{"hello":>15}')

          hello


In [None]:
print(f'{pi:>10}\n{1234.5:>10}')

  3.141592
    1234.5


In [None]:
print(f'{12345678:,d}')

12,345,678


In [None]:
print(f'{12345678:_d}')

12_345_678


In [None]:
x = f'{12345678:_d}'
x


'12_345_678'

In [None]:
x.replace('_','')

'12345678'

## `isa` functions
* Determine if X "IS A" Y.
* `isa` functions return `True` or `False`

In [None]:
'23'.isdigit()

True

In [None]:
'123MainStreet'.isalnum()

True

In [None]:
'asdfdlkdlkdlkjdkjd'.isalpha()

True

In [None]:
'12.23'.isdecimal()

False

In [None]:
'asdf'.islower()

True

In [None]:
'UIOUsdf'.isupper()

False

In [None]:
'The Red Door Is Res'.istitle()

True

In [None]:
'x'.isidentifier()

True

In [None]:
x.isidentifier()

False

In [None]:
'x__456'.isidentifier()

True

## Regular Expressions
* Match __patterns__ in string, substring, etc.
* Very powerful, but can be difficult
* Python has library `re` for '**r**egular **e**xpressions'

> A __regular expression__ is a string that describes a search pattern for matching characters in other strings.

In [None]:
# Import regular expression library
import re

#### Validating Data
_regex_ can be used to validate data.  

Examples:
* Checking ZIP code data contains proper number of characters.
* A string contains only letters, spaces, apostrophes, and hyphens (e.g., as last names might)
* An email address
* Social Security number is of the form: xxx-xx-xxxx.

Regex **can also be used to**:
* Extract data
* Clean data
* Transform data

#### `fullmatch()` function in `re` module

`fullmatch` is used to determine if the *entire* string in its second argument matches the pattern of its first.  

In [None]:
# A pattern made up only of ordinary characters matches exactly that text.
pattern = '04005'  # regex literal
'String Matches the pattern' if re.fullmatch(pattern,'04005') else 'No match'

'String Matches the pattern'

In [None]:
words = 'Here are some words.'
toke = words.split(' ')
' '.join(toke)

'Here are some words.'

In [None]:
# The 'r' prefix makes a raw string literal, so the backslash in \d is passed
# to the regex engine unchanged. See the URL below for details.

# https://docs.python.org/3/library/re.html
# \d{5} matches exactly five digits.
'Good' if re.fullmatch(r'\d{5}',pattern) else 'Bad'

'Good'

In [None]:
# Same as \d{5}: writing \d five times also means exactly five digits.
'Good' if re.fullmatch(r'\d\d\d\d\d',pattern) else 'Bad'

'Good'

#### Metacharacters and Character Classes

Regex typically contain **metacharacters**.  

`[]` `{}` `()` `\` `*` `+` `^` `$` `?` `.` `|`

**Character Classes**
* \d is any digit 0-9
* \D is any non digit character
* \s is whitespace (space, tab, etc.)
* \S is non-whitespace characters
* \w is alpha-numeric characters (including _)
* \W non alpha-numeric character.

Predefined set of characters (e.g., \d = digits 0-9).

In [None]:
# [A-Z] requires one uppercase letter first, then [a-z]* allows any number of
# lowercase letters; 'here' begins with a lowercase 'h'.
'True' if re.fullmatch('[A-Z][a-z]*','here') else 'False'

'False'

In [None]:
# [^A-Z]* matches any run of characters that are NOT uppercase letters A-Z.
'True' if re.fullmatch('[^A-Z]*','here') else 'False'

'True'

In [None]:
#### Wildcards / Quantifiers

# `*` matches zero or more occurrences of the preceding element,
# so [a-z]* accepts any run of lowercase letters, including none.
'Yes' if re.fullmatch('[A-Z][a-z]*','Hello') else 'No'

'Yes'

In [None]:
# A leading `^` inside [...] negates the class: it matches any one character NOT listed.
# [^A-Z] matches any single character that is NOT an uppercase letter A-Z,
# so this pattern requires the first character not to be uppercase.
'Yes' if re.fullmatch('[^A-Z][a-z]*','Hello') else 'No'

'No'

In [None]:
# `+` matches at least one occurrence
'Yes' if re.fullmatch('[A-Z][a-z]+','Hello') else 'No'

'Yes'

In [None]:
# `?` matches zero or one occurrence of the preceding element
# 'Hell?o' therefore accepts 'Helo' or 'Hello'; 'Helllo' has one 'l' too many.
'Yes' if re.fullmatch('Hell?o','Helllo') else 'No'

'No'

In [None]:
# {n,} = at least n occurrences
'Yes' if re.fullmatch(r'\D{3,}','Hello') else 'No'

'Yes'

In [52]:
# {m,n} = between m and n occurrences inclusive
# \D is any non-digit; 'Hellos' has six such characters, one over the maximum of five.
'Yes' if re.fullmatch(r'\D{2,5}','Hellos') else 'No'

'No'

In [57]:
# Street-address pattern: one or more digits, a space, then two words separated
# by a space, each an uppercase letter followed by zero or more lowercase letters.
street =r'\d+ [A-Z][a-z]* [A-Z][a-z]*'
'Match' if re.fullmatch(street,'134 Main St') else 'No'

'Match'

In [66]:
'Yes' if re.fullmatch('[A-Za-z]*','HeLLo') else 'No'

'Yes'

In [69]:
# Five non-digit characters, one space, then three digits.
'Yes' if re.fullmatch(r'\D{5} \d{3}','HeLLo 123') else 'No'

'Yes'

### Data Wrangling
* deleting missing values
* substituting missing values
* deleting outliers
* removing duplicates
* combining features
* subsampling data
* normalizing data
* grouping data

### Pandas

* `match`
* `contains`


In [70]:
import pandas as pd

In [74]:
cities = pd.Series(['Boston, MA 02215','Miami, FL 33101'])
cities

0    Boston, MA 02215
1     Miami, FL 33101
dtype: object

In [75]:
# Pattern: a space, exactly two uppercase letters, then a space (e.g. ' MA ').
# The result is one boolean per element of `cities`.
cities.str.contains(' [A-Z]{2} ')

0    True
1    True
dtype: bool

In [None]:
import random
import numpy as np

# Ten integers drawn uniformly from 0, 1, 2, 3 (the upper bound is exclusive).
x = np.random.randint(low=0, high=4, size=10)
x

array([3, 2, 2, 0, 0, 0, 0, 0, 0, 3])

In [112]:
import numpy as np

# Word pools for the random sentence generator.
article = ['a', 'the', 'an']
noun = ['dog', 'cat', 'camera', 'car','oil']
verb = ['go', 'runs', 'jumps', 'moves', 'swam', 'drank', 'smile','smoke']
prep = ['in','to','of']


def _pick(words):
    """Return one element of `words`, chosen with np.random.randint."""
    return words[np.random.randint(len(words))]


def _random_sentence():
    """Build one sentence: Article noun verb preposition article noun.

    The first article is title-cased and the final noun gets a trailing
    period. Words are drawn independently, so the result is not guaranteed
    to be grammatical.
    """
    # The list items are evaluated in order, so the random draws happen in
    # sentence order: article, noun, verb, preposition, article, noun.
    parts = [
        _pick(article).title(),
        _pick(noun),
        _pick(verb),
        _pick(prep),
        _pick(article),
        _pick(noun) + '.',
    ]
    return ' '.join(parts)


# Print three sentences; `Sentence` keeps the last one after the loop.
for _ in range(3):
    Sentence = _random_sentence()
    print(Sentence)


The camera moves to a cat.
The cat smile of the cat.
An car drank to the car.


In [111]:
Sentence

'A camera swam to an dog.'

In [79]:
y = []
y.append(noun[2])

In [84]:
 x = np.random.randint(len(article))
 x

2

In [107]:
noun[2]+'.'

'camera.'