# The Zen of Python

In [None]:
import this

# Grammar for Strings

```
stringliteral   ::=  [stringprefix](shortstring | longstring)
stringprefix    ::=  "r" | "u" | "ur" | "R" | "U" | "UR" | "Ur" | "uR"
                     | "b" | "B" | "br" | "Br" | "bR" | "BR"
shortstring     ::=  "'" shortstringitem* "'" | '"' shortstringitem* '"'
longstring      ::=  "'''" longstringitem* "'''" | '"""' longstringitem* '"""'
shortstringitem ::=  shortstringchar | escapeseq
longstringitem  ::=  longstringchar | escapeseq
shortstringchar ::=  <any source character except "\" or newline or the quote>
longstringchar  ::=  <any source character except "\">
escapeseq       ::=  "\" <any ASCII character>
```

# String Types

In [None]:
# Bind a string object to a name, then inspect its three basic properties.
new_string = "This is a String"

for label, detail in (
    ('ID:', id(new_string)),       # the object identifier (address)
    ('Type:', type(new_string)),   # the object type
    ('Value:', new_string),        # the object value
):
    print(label, detail)

### Simple String

In [None]:
# Two literals written with different quote styles, joined into one string.
greeting = 'Hello!'
introduction = " I'm a simple string"
simple_string = greeting + introduction
print(simple_string)

### Multi-line String

In [None]:
# A triple-quoted literal may span several source lines; every line break
# inside the quotes is stored as a \n (newline) character in the string.
multi_line_string = """Hello I'm
a multi-line
string!"""

# Bare expression on the last line: the notebook shows the repr, so the
# embedded \n characters are visible.
multi_line_string

In [None]:
print(multi_line_string)

### Escape sequences

In [None]:
# Normal string with escape sequences leading to a wrong file path!
# \t, \n and \f are interpreted as tab, newline and form feed instead of
# a backslash followed by a letter.
escaped_string = "C:\the_folder\new_dir\file.txt"
print(escaped_string)  # will cause errors if we try to open a file here

In [None]:
# raw string keeping the backslashes in its normal form
raw_string = r'C:\the_folder\new_dir\file.txt'
print(raw_string)

### Unicode literals

In [None]:
# unicode string literals
string_with_unicode = 'H\u00e8llo!'
print(string_with_unicode)

In [None]:
more_unicode = 'I love Pizza 🍕!  Shall we book a cab 🚕 to get pizza?'
print(more_unicode)

## Your Turn: How can we reverse the above string?

In [None]:
more_unicode[______]  # reverses the string

# String Operations


### String Concatenation

In [None]:
'Hello 😊' + ' and welcome ' + 'to Python 🐍!'

In [None]:
'Hello 😊' ' and welcome ' 'to Python 🐍!'

In [None]:
s3 = ('This '
      'is another way '
      'to concatenate '
      'several strings!')
s3

### Substring check

In [None]:
'way' in s3

### Your Turn: Validate if the string 'python' is not present in string s3

In [None]:
_______ s3

### String Length

In [None]:
len(s3)

# String Indexing and Slicing

In [None]:
# creating a string
s = 'PYTHON'
s, type(s)

## String Indexing

In [None]:
# Walk through the valid positions of s and show the character at each one.
for position in range(len(s)):
    print('Character ->', s[position], 'has index->', position)

In [None]:
s[0], s[1], s[2], s[3], s[4], s[5]

In [None]:
s[-1], s[-2], s[-3], s[-4], s[-5], s[-6]

## String Slicing

In [None]:
s[:] 

In [None]:
s[1:4]

In [None]:
s[:3], s[3:]

In [None]:
s[-3:]

In [None]:
s[:3] + s[3:]

In [None]:
s[:3] + s[-3:]

## String slicing with a step (stride)

In [None]:
s[::1]  # no offset

In [None]:
s[::2]  # print every 2nd character in string

# String Immutability

In [None]:
# Strings are immutable, so item assignment raises a TypeError.
# The error is caught and printed so the demonstration stays visible while
# the notebook can still be executed from top to bottom without stopping.
try:
    s[0] = 'X'
except TypeError as error:
    print('TypeError:', error)

In [None]:
# "Modifying" a string really means building a new object and rebinding
# the name to it; the two ids are recorded before and after the rebinding.
original_id = id(s)
s = ''.join(('X', s[1:]))  # creates a new string
new_id = id(s)

print('Original String id:', original_id)
print(s)
print('New String id:', new_id)

# Useful String methods

## Case Conversions

In [None]:
s = 'python is great'

In [None]:
s.capitalize()

In [None]:
s.upper()

In [None]:
s.title()

## String Replace

In [None]:
s.replace('python', 'NLP')

## Numeric Checks

In [None]:
'12345'.isdecimal()

In [None]:
'apollo11'.isdecimal()

## Alphabet Checks

In [None]:
'python'.isalpha()

In [None]:
'number1'.isalpha()

## Alphanumeric Checks

In [None]:
'total'.isalnum()

In [None]:
'abc123'.isalnum()

In [None]:
'1+1'.isalnum()

## String splitting and joining

In [None]:
s = 'I,am,a,comma,separated,string'
s

In [None]:
s.split(',')

In [None]:
' '.join(s.split(','))

In [None]:
# stripping whitespace characters
s = '   I am surrounded by spaces    '
s

In [None]:
s.strip()

In [None]:
sentences = 'Python is great. NLP is also good.'
sentences.split('.')

In [None]:
print('\n'.join(sentences.split('.')))

In [None]:
# Drop the empty fragment that split('.') leaves behind, trim the whitespace
# around the remaining pieces and print one sentence per line.
non_empty_parts = filter(None, sentences.split('.'))
print('\n'.join(map(str.strip, non_empty_parts)))

# String formatting

## Formatting expressions with different data types - old style

In [None]:
'We have %d %s containing %.2f gallons of %s' %(2, 'bottles', 2.5, 'milk')

In [None]:
'We have %d %s containing %.2f gallons of %s' %(5.21, 'jugs', 10.86763, 'juice')

## Formatting strings using the format method - new style

In [None]:
'Hello {} {}, it is a great {} to meet you at {}'.format('Mr.', 'Jones', 'pleasure', 5)

In [None]:
'Hello {} {}, it is a great {} to meet you at {} o\' clock'.format('Sir', 'Arthur', 'honor', 9)

## Alternative ways of using string format

In [None]:
'I have a {food_item} and a {drink_item} with me'.format(drink_item='soda', food_item='sandwich')

In [None]:
'The {animal} has the following attributes: {attributes}'.format(animal='dog', attributes=['lazy', 'loyal'])

# Regular Expressions

In [None]:
s1 = 'Python is an excellent language'
s2 = 'I love the Python language. I also use Python to build applications at work!'

In [None]:
import re

pattern = 'python'
# match only returns a match if regex match is found at the beginning of the string
re.match(pattern, s1)

In [None]:
# pattern is in lower case hence ignore case flag helps
# in matching same pattern with different cases
re.match(pattern, s1, flags=re.IGNORECASE)

In [None]:
# Keep the match object so both the matched text and its position in the
# original string can be reported.
m = re.match(pattern, s1, flags=re.IGNORECASE)
matched_text = m.group(0)
start_index, end_index = m.span()  # same values as m.start() and m.end()
print('Found match {} ranging from index {} - {} in the string "{}"'
      .format(matched_text, start_index, end_index, s1))

In [None]:
# match does not work when pattern is not there in the beginning of string s2
re.match(pattern, s2, re.IGNORECASE)

In [None]:
# illustrating find and search methods using the re module
re.search(pattern, s2, re.IGNORECASE)

In [None]:
re.findall(pattern, s2, re.IGNORECASE)

In [None]:
match_objs = re.finditer(pattern, s2, re.IGNORECASE)
match_objs

In [None]:
# match_objs is an iterator: it can be walked only once, so running this
# cell a second time prints no matches unless the iterator is re-created.
print("String:", s2)
for m in match_objs:
    start_index, end_index = m.span()
    print('Found match "{}" ranging from index {} - {}'
          .format(m.group(0), start_index, end_index))

In [None]:
# illustrating pattern substitution using sub and subn methods
re.sub(pattern, 'Java', s2, flags=re.IGNORECASE)

In [None]:
re.subn(pattern, 'Java', s2, flags=re.IGNORECASE)

In [None]:
# dealing with unicode matching using regexes
s = u'H\u00e8llo! this is Python 🐍'
s

In [None]:
re.findall(r'\w+', s)

In [None]:
re.findall(r"[A-Z]\w+", s)

In [None]:
# A single character class listing the emoji / symbol code-point ranges.
# Inside [...] the characters ' and | are ordinary members of the class, not
# quoting or alternation, so they are left out here; otherwise the pattern
# would also match apostrophes and pipe characters in the text.
emoji_pattern = (
    r"["
    r"\U0001F300-\U0001F5FF"  # miscellaneous symbols and pictographs
    r"\U0001F600-\U0001F64F"  # emoticons
    r"\U0001F680-\U0001F6FF"  # transport and map symbols
    r"\u2600-\u26FF"          # miscellaneous symbols
    r"\u2700-\u27BF"          # dingbats
    r"]"
)
re.findall(emoji_pattern, s, re.UNICODE)

# Putting it all together - Basic Text Processing and Analysis

In [None]:
from nltk.corpus import gutenberg
import seaborn as sns

%matplotlib inline

# Read the King James Bible from the NLTK Gutenberg corpus as a list of lines.
# The stream is opened in a with-block so it is closed once the lines have
# been read, and the name `bible` only ever refers to the list of lines.
# NOTE: the corpus data must be available locally (nltk.download('gutenberg')).
with gutenberg.open('bible-kjv.txt') as bible_file:
    bible = bible_file.readlines()

bible[:5]

In [None]:
# Total lines
len(bible)

In [None]:
[item.strip('\n') for item in bible[:5]]

In [None]:
# Remove the newline characters at both ends of every line, then discard
# the lines that end up empty.
stripped_lines = (item.strip('\n') for item in bible)
bible = [line for line in stripped_lines if line]
bible[:5]

In [None]:
len(bible)

### Finding out the distribution of line lengths (in characters) in the Bible

In [None]:
# Number of characters in each entry of `bible`.
line_lengths = [len(sentence) for sentence in bible]
# `fill=True` shades the area under the density curve; it replaces the
# deprecated `shade` keyword of sns.kdeplot.
p = sns.kdeplot(line_lengths, fill=True, color='red')

In [None]:
# Split every line on whitespace; the result is one list of words per line.
tokens = list(map(str.split, bible))
print(tokens[:5])

### Your Turn: Plot the distribution of word counts per line across the Bible

In [None]:
# Exercise: fill in the blanks to count the tokens on each line, then plot
# the distribution of those counts.
total_tokens_per_line = [_______ for _____ in ______]
# `fill=True` replaces the deprecated `shade` keyword of sns.kdeplot.
p = sns.kdeplot(________, fill=True, color='orange')

### Flattening a list of lists

Note the two `for` clauses in the list comprehension below, which we use to convert our list of tokenized sentences into one big flat list of tokens.

In [None]:
words = [word for sentence in tokens for word in sentence]
print(words[:20])

In [None]:
# Strip every character that is not an ASCII letter from each token and
# drop the tokens that become empty as a result.
non_letters = re.compile(r'[^A-Za-z]')
letters_only = (non_letters.sub('', word) for word in words)
words = [word for word in letters_only if word]
print(words[:20])

### Finding the top ten most common words

In [None]:
from collections import Counter

# Lower-case every word so that counting is case-insensitive, then tally
# the occurrences and show the ten most frequent words.
words = list(map(str.lower, words))
c = Counter(words)
c.most_common(10)

### Your Turn: Removing stopwords and looking at the top ten most common words

Stopwords are filler words such as articles and conjunctions, which don't carry much information on their own. You have seen some examples in the previous section. Can you remove the stopwords from our previous list of words and show the top ten remaining words?

You can retrieve a list of common stopwords using the standard __`nltk`__ library as follows.

In [None]:
import nltk 

stopwords = nltk.corpus.stopwords.words('english')
stopwords[:10]

Now use the above list and remove the stopwords and show the top ten words!

In [None]:
words = ____
c = ________
c.__________