# Python for NLP

Based on: Sarkar: Text Analytics with Python (2nd Edition)
Extended and updated by Heiko Rölke

## Working with strings

In [None]:
# bind a string object to a name
new_string = "This is a String"

# every Python object has an identity, a type and a value
print(f"ID: {id(new_string)}")  # object identifier (address)
print(f"Type: {type(new_string)}")  # object type
print(f"Value: {new_string}")  # object value

In [None]:
# simple string
simple_string = 'Hello!' + " I'm a simple string"
print(simple_string)

In [None]:
# multi-line string, note the \n (newline) escape character automatically created
multi_line_string = """Hello I'm
a multi-line
string!"""

multi_line_string

In [None]:
print(multi_line_string)

In [None]:
# Normal string with escape sequences leading to a wrong file path!
escaped_string = "C:\the_folder\new_dir\file.txt"
print(escaped_string)  # will cause errors if we try to open a file here

In [None]:
# raw string keeping the backslashes in its normal form
raw_string = r'C:\the_folder\new_dir\file.txt'
print(raw_string)

In [None]:
# unicode string literals
string_with_unicode = 'H\u00e8llo!'
print(string_with_unicode)

In [None]:
more_unicode = 'I love Pizza 🍕!  Shall we book a cab 🚕 to get pizza?'
print(more_unicode)

In [None]:
print(string_with_unicode + '\n' + more_unicode)

In [None]:
' '.join([string_with_unicode, more_unicode])

## String operations

### Concatenation

In [None]:
'Hello 😊' + ' and welcome ' + 'to Python 🐍!'

In [None]:
'Hello 😊' ' and welcome ' 'to Python 🐍!'

In [None]:
# concatenation of variables and literals
s1 = 'Python 💻!'
'Hello 😊 ' + s1

In [None]:
# implicit concatenation only works between string literals; placing a
# literal next to a variable is a SyntaxError (use + as in the cell above)
'Hello 😊 ' s1

In [None]:
# some more ways of concatenating strings
s2 = '--🐍Python🐍--'
s2 * 5

In [None]:
# a notebook cell only displays the value of its last expression,
# so both results are combined into one tuple to make them visible
s1 + s2, (s1 + s2) * 3

In [None]:
# concatenating several strings together in parentheses
s3 = ('This '
      'is another way '
      'to concatenate '
      'several strings!')
s3

In [None]:
# checking for substrings in a string
'way' in s3

In [None]:
'python' in s3

In [None]:
# computing total length of the string
len(s3)

## Indexing and Slicing

In [None]:
s = 'PYTHON'
# list every character together with its zero-based position
for index, character in enumerate(s):
    print(f'Character -> {character} has index-> {index}')

In [None]:
s[0], s[1], s[2], s[3], s[4], s[5]

In [None]:
s[-1], s[-2], s[-3], s[-4], s[-5], s[-6]

In [None]:
## String slicing
s[:]

In [None]:
s[1:4]

In [None]:
s[:3], s[3:]

In [None]:
s[-3:]

In [None]:
s[:3] + s[3:]

In [None]:
s[:3] + s[-3:]

### String slicing with offsets

In [None]:
s[::1]  # no offset

In [None]:
s[::2]  # print every 2nd character in string

In [None]:
s[::-1]  # reverses the string

### strings are immutable

In [None]:
# strings are immutable hence assignment throws error
s[0] = 'X'

In [None]:
print('Original String id:', id(s))
# creates a new string
s = 'X' + s[1:]
print(s)
print('New String id:', id(s))

## Useful String methods

### Case conversions

In [None]:
s = 'python is great'
s.capitalize()

In [None]:
s.upper()

In [None]:
s.title()

### String replace

In [None]:
s.replace('python', 'NLP')

### Numeric checks

In [None]:
'12345'.isdecimal()

In [None]:
'apollo11'.isdecimal()

### Alphabet checks

In [None]:
'python'.isalpha()

In [None]:
'number1'.isalpha()

### Alphanumeric checks

In [None]:
'total'.isalnum()

In [None]:
'abc123'.isalnum()

In [None]:
'1+1'.isalnum()

### String splitting and joining

In [None]:
s = 'I,am,a,comma,separated,string'
s.split(',')

In [None]:
' '.join(s.split(','))

### Stripping whitespace characters

In [None]:
s = '   I am surrounded by spaces    '
s

In [None]:
s.strip()

In [None]:
sentences = 'Python is great. NLP is also good.'
sentences.split('.')

In [None]:
print('\n'.join(sentences.split('.')))

In [None]:
# strip every fragment and keep only those that still contain text;
# testing the stripped fragment also drops whitespace-only pieces
# (e.g. the one produced by a trailing ". "), not just empty strings
print('\n'.join([sentence.strip()
                 for sentence in sentences.split('.')
                 if sentence.strip()]))

## String formatting

### Simple string formatting expressions - very old style

In [None]:
'Hello %s' % ('Python!')

In [None]:
'Hello %s %s' % ('World!', 'How are you?')

### Formatting expressions with different data types - very old style 

(C-like)

In [None]:
'We have %d %s containing %.2f gallons of %s' % (2, 'bottles', 2.5, 'milk')

In [None]:
'We have %d %s containing %.2f gallons of %s' % (5.21, 'jugs', 10.86763, 'juice')

### Formatting strings using the format method - old style

In [None]:
'Hello {} {}, it is a great {} to meet you at {}'.format('Mr.', 'Jones', 'pleasure', 5)

In [None]:
'Hello {} {}, it is a great {} to meet you at {} o\' clock'.format('Sir', 'Arthur', 'honor', 9)

### Alternative ways of using string format

In [None]:
'I have a {food_item} and a {drink_item} with me'.format(drink_item='soda', food_item='sandwich')

In [None]:
'The {animal} has the following attributes: {attributes}'.format(animal='dog', attributes=['lazy', 'loyal'])

### New: f-Strings

In [None]:
s_neu = f"2 + 2 ergibt {2+2}"
s_neu

In [None]:
f"This is the string: {s_neu}"

## Regular Expressions

In [None]:
s1 = 'Python is an excellent language'
s2 = 'I love the Python language. I also use Python to build applications at work!'

In [None]:
import re

pattern = 'python'
# match only returns a match if regex match is found at the beginning of the string
re.match(pattern, s1)

In [None]:
# pattern is in lower case hence ignore case flag helps
# in matching same pattern with different cases
re.match(pattern, s1, flags=re.IGNORECASE)

In [None]:
# printing matched string and its indices in the original string
m = re.match(pattern, s1, flags=re.IGNORECASE)
print(f'Found match {m.group(0)} ranging from index {m.start()} - {m.end()} in the string "{s1}"')

In [None]:
# match does not work when pattern is not there in the beginning of string s2
re.match(pattern, s2, re.IGNORECASE)

In [None]:
# illustrating find and search methods using the re module
re.search(pattern, s2, re.IGNORECASE)

In [None]:
re.findall(pattern, s2, re.IGNORECASE)

In [None]:
match_objs = re.finditer(pattern, s2, re.IGNORECASE)
match_objs

In [None]:
# next(match_objs)

In [None]:
print("String:", s2)
for m in match_objs:
    print(f'Found match "{m.group(0)}" ranging from index {m.start()} - {m.end()}')

In [None]:
# illustrating pattern substitution using sub and subn methods
re.sub(pattern, 'Java', s2, flags=re.IGNORECASE)

In [None]:
re.subn(pattern, 'Java', s2, flags=re.IGNORECASE)

In [None]:
# dealing with unicode matching using regexes
s = u'H\u00e8llo! this is Python 🐍'
s

In [None]:
re.findall(r'\w+', s)

In [None]:
re.findall(r"[A-Z]\w+", s, re.UNICODE)

In [None]:
# One character class made of several code point ranges. Inside [...] every
# character is a literal member, so the ranges are simply listed one after
# another: quotes and "|" must not be used as separators, otherwise the
# pattern would also match the characters ' and |.
emoji_pattern = (
    r"["
    r"\U0001F300-\U0001F5FF"
    r"\U0001F600-\U0001F64F"
    r"\U0001F680-\U0001F6FF"
    r"\u2600-\u26FF"
    r"\u2700-\u27BF"
    r"]"
)
re.findall(emoji_pattern, s, re.UNICODE)

## Exercises

1. Play around with the examples above by changing strings, index and slice numbers etc. Make yourself comfortable with the Python syntax (5 - 10 minutes)
2. Copy a longer text from a website or write one yourself. (>= 100 words)
    1. Go through the text and print all words beginning with a capital letter.
    2. Do the same for all words in lowercase
    3. Store the first two characters of each word in a new string. Remember to add spaces.
    4. Do the same for the last three characters of each word.
3. Use the text from the last exercise. Use regex to filter out all vowels.
4. Invert exercise 3 (keep only vowels).

If you are already knowledgeable in Python and bored by the tasks above, solve them using list comprehensions. 
Come up with more difficult exercises like:
1. Use two longer texts. Which words occur in both texts?
2. Go from words to character-level. Find the longest common character sub-sequence of two texts. This sub-sequence may spread over whitespace and several words.
3. ...  

# numpy and pandas

The most important data handling libraries in Python.

## NumPy

NumPy delivers fast and memory efficient n-dimensional arrays.


In [None]:
# the name "np" is some kind of best practise
import numpy as np

In [None]:
# a one-dimensional array (vector) with 4 elements; its shape is (4,),
# i.e. it has a single axis rather than separate rows and columns
a1 = np.array([1,2,3,4])
a1

In [None]:
a2 = np.array([[1,2], [3,4], [5,6]])
a2

### Fast creation of arrays

In [None]:
np.zeros(3)

In [None]:
np.ones((4,4))

In [None]:
# be careful - may contain junk data
np.empty(3)

In [None]:
# supports a variant of the "range" function
np.arange(5)

In [None]:
# start, stop, step
np.arange(3,21,2).reshape((3,3))

### Some example operations

In [None]:
np.sort(np.array([3,5,4,1,2,7,8,9,6]))

Note that there are various ways of sorting along axes (dimensions), partial sorting, etc. We will not go into detail here.

In [None]:
a1 = np.array([[1, 2]])
a2 = np.array([[5, 6]])

np.concatenate((a1, a2))

In [None]:
np.concatenate((a1, a2), axis=1)  # this does only work for fitting dimensions - try it

In [None]:
a3 = np.array([[1, 1], [2, 2]])
a4 = np.array([[3, 3], [4, 4]])

np.vstack((a3,a4))

In [None]:
np.hstack((a3,a4))

### Indexing, slicing, filtering

In [None]:
a1 = np.arange(1,10)
a1

In [None]:
a1[5]

In [None]:
a1[-3]

In [None]:
a1[2:5]

In [None]:
a1[5:]

In [None]:
a2 = np.arange(1,10).reshape((3,3))
a2

In [None]:
# this will not work: a single index selects along the first axis (the rows),
# and a2 has only 3 rows (valid indices 0-2), so this raises an IndexError
a2[5]

In [None]:
a2[1,1]

In [None]:
# you need some time getting used to such a notation (at least this was the case for me...)
a2[:,:]

In [None]:
a2[0,1:3]

In [None]:
# addressing columns/rows directly
a2[:,[0,2]]

Conditions return arrays of boolean values that can be used for filtering/masking.

In [None]:
a2 < 5

In [None]:
a2[a2<5]

The usual logical operators can be used element-wise (`&`, `|`, `~`); wrap each condition in parentheses.

In [None]:
a2[(a2<5) | (a2>7)]

### Mathematical operations

In [None]:
a1 = np.arange(1,5).reshape((2,2))
a2 = np.arange(6,10).reshape((2,2))
a3 = np.ones(2)
print(f"a1 = \n{a1}")
print(f"a2 = \n{a2}")

In [None]:
a1 + a2

In [None]:
# element-wise operations - this is not matrix multiplication!
a1 * a2

In [None]:
# element-wise operations - this is not matrix multiplication!
a1 * a3

In [None]:
a1 * 2

In [None]:
a2.max()

In [None]:
a2.sum()

In [None]:
a2.sum(axis=0)

In [None]:
a2.sum(axis=1)

In [None]:
a2.T

In [None]:
# matrix product: for two 2-D arrays np.dot performs matrix multiplication
# a1 = [[1 2]
#       [3 4]]
# a2 = [[6 7]
#       [8 9]]

np.dot(a1,a2)

## pandas

What NumPy is for numeric data, pandas is for generic data science.
You can think of the DataFrame as some kind of in-memory database with built-in data science functionality. 

In [None]:
import pandas as pd

### read and generate DataFrames

In [None]:
# read from CSV file
# a forward slash is accepted as path separator on Windows, Linux and macOS,
# whereas a backslash is only a separator on Windows
data_1 = pd.read_csv("data/country_data.csv")
data_1

In [None]:
# generate from list
list_2 = [["CHN", "China", 1398.72],
          ["IND", "India", 1351.16],
          ["USA", "United States", 329.74]]

data_2 = pd.DataFrame(list_2)
data_2

In [None]:
# beautify a bit
data_2 = pd.DataFrame(list_2, columns=["ID", "Name", "#Inhabitants"])

data_2

In [None]:
# from dictionary

# every keyword becomes a column name, every list holds that column's values
dict_3 = dict(
    ID=["CHN", "IND", "USA", "IDN", "BRA", "PAK"],
    Name=["China", "India", "US", "Indonesia", "Brazil", "Pakistan"],
    Inhabitants=[1398.72, 1351.16, 329.74, 268.07, 210.32, 205.71],
)
data_3 = pd.DataFrame(data=dict_3)
data_3

In [None]:
### Indexing, Slicing

In [None]:
# select a single column by its name (bracket notation)
data_3["Name"]

In [None]:
# select a named column via attribute access
# (only possible when the column name is a valid Python identifier)
data_3.Inhabitants

In [None]:
# index row
data_3.loc[3]

In [None]:
# indexing with numbers only: iloc

data_3.iloc[3]

no difference here because rows are not named - wait a second

In [None]:
# slice rows
data_3.loc[3:5]  # have a close look!

Row 5 is included: `.loc` slices by label, and label-based slices include both endpoints.

In [None]:
#slice named columns
data_3.loc[:, ["ID", "Name"]]

In [None]:
data_3.iloc[:,2]

In [None]:
data_3.iloc[:,0:1]

Attention: `.iloc` shows the usual Python slicing behaviour (the end position 1 is excluded),

because the slice now refers to integer positions, not to names (labels).

In [None]:
data_3.iloc[:,0:2]

In [None]:
# index/slice and filter in a single .loc call: the boolean mask selects
# the rows, the list of labels selects the columns (avoids chained indexing)
data_3.loc[data_3["Inhabitants"] > 1000, ["Name"]]

In [None]:
# delete
data_4 = data_3.drop("Name", axis="columns")
data_4

In [None]:
### mathematical operations

In [None]:
import numpy as np
# another variant to generate DataFrames
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))
df1

In [None]:
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))
df2

In [None]:
df1 + df1

In [None]:
df1 * 2

In [None]:
df1 ** 2

In [None]:
df1 + df2

In [None]:
# applying a function (vectorised functions): apply() calls f once per column


def f(x):
    """Return the spread (maximum minus minimum) of the values in x."""
    return x.max() - x.min()


df1.apply(f)

### built-in functions

In [None]:
df1.sum()

In [None]:
df1.sum(axis="columns")

In [None]:
# overview
df1.describe()

### Advanced (well, a bit) methods: groupby

In [None]:
data_1

In [None]:
# the first argument determines the grouping, the optional second argument what should be grouped according to the first.
data_1.groupby("CONT")["CONT"].count()

In [None]:
data_1.groupby("CONT").groups["Asia"]

In [None]:
data_1.groupby("CONT").get_group("Asia")

In [None]:
data_1.groupby("CONT")["GDP"].sum()

## Visualization

using matplotlib in combination with pandas
(would also work with numpy)

In [None]:
from matplotlib import pyplot as plt

plt.style.use('dark_background')

data_3.plot(kind="bar", x="ID", y="Inhabitants")


In [None]:
# other data
df3 = pd.DataFrame({"years": [1950, 1960, 1970, 1980, 1990, 2000, 2010],
                    "gdp": [300.2, 543.3, 1075.9, 2862.5, 5979.6, 10289.7, 14958.3]})

# the default diagram type is the line plot:
df3.plot()

In [None]:
# that's not what we wanted to see - any thoughts?

# what does the data look like?
df3

In [None]:
# "years" should be the index
df3.set_index("years", inplace=True)
df3

In [None]:
df3.plot()

In [None]:
# That's better!
# let's enrich the data a bit...
made_up_values = [1500, 2500, 1000, 1500, 1250, 2000, 4000]
df3["some_data"] = made_up_values
df3

In [None]:
df3.plot()

In [None]:
# some different diagram types
df3.plot.bar()

In [None]:
df3.plot.bar(stacked=True)

In [None]:
df3.plot.barh()

In [None]:
df3.plot.area()

In [None]:
df3.plot.pie(subplots=True)

In [None]:
df3["gdp"].plot.pie()

In [None]:
df3.plot.scatter(x="gdp", y="some_data")

In [None]:
# not so useful, we need other data

# generate random data on height and weight
import random

mu = 170  # mean of the height distribution
sigma = 6  # standard deviation of the height distribution
sample = 100  # number of data points to generate
random.seed(0)  # fixed seed: the same pseudo-random values are drawn on every run
height = [random.gauss(mu, sigma) for _ in range(sample)]  # normal/gaussian distribution around 170cm
weight = [(h - 100) * (0.75 + random.random() / 2) for h in height]
# weight is (height - 100) scaled by a uniformly drawn random factor
# between 3/4 (inclusive) and 5/4 (exclusive)

df4 = pd.DataFrame({"height": height, "weight": weight})
df4.plot.scatter(x="height", y="weight")

Now, that's better and ends our short introduction to Python and its data libraries.

Before we go on, some more exercises:

NumPy
1. Create a 4x2 2-D array that contains the numbers from 1-8
2. Filter out all even numbers from the array above
3. Create a new array from the first one above by reshaping it to 2x2x2. Make sure the original array is still available.
4. Create a new array from the first one by swapping the columns.
5. Do the same by swapping the middle rows.


pandas:
Use pandas to analyse the famous Iris dataset.
* Load the dataset into pandas. Check that everything was read correctly, and clean the data if necessary.
* Have a look at the dataset. What would be a feasible analysis goal? Act accordingly in the following exercises.
* Calculate statistical measures like mean, standard deviation etc.
* Use "visual analytics" to come up with hypotheses about the analysis.
* discuss with teacher and other students
