# Language Analytics - Session 2

In [1]:
colours = ["red", "green", "blue"]

In [2]:
colours

['red', 'green', 'blue']

### ```for``` loop

In [3]:
# the name 'colour' is arbitrary and could be replaced with any word (or several words joined by an underscore)
for colour in colours: 
    print(colour)

red
green
blue


In [4]:
for colour in colours:
    print(colour.upper()+"!!")

RED!!
GREEN!!
BLUE!!


In [5]:
# different way to format ^, a little easier to read
for colour in colours:
    shouted_colour = colour.upper()+"!!"
    print(shouted_colour)

RED!!
GREEN!!
BLUE!!


In [6]:
# create new, empty list
shouted_colours = []

# for each colour in the list called colours
for colour in colours:
    # make it uppercase and add exclamation points
    shouted_colour = colour.upper()+"!!"
    # add this to the external list
    shouted_colours.append(shouted_colour)
    # print to screen
    print(shouted_colour)

RED!!
GREEN!!
BLUE!!


In [7]:
shouted_colours

['RED!!', 'GREEN!!', 'BLUE!!']

## ```if/else```

In [8]:
for colour in colours:
    if colour == "green":
        print(colour.upper()+"!!")

GREEN!!


In [9]:
for colour in colours:
    if colour == "green":
        print(colour.upper()+"!!")
    else:
        print(colour)

red
GREEN!!
blue


In [10]:
for colour in colours:
    if colour == "green":
        print(colour.upper()+"!!")
    elif colour == "red":
        print("*"+colour+"*")
    else:
        print(colour)

*red*
GREEN!!
blue


In [11]:
weird_list = ["red", "green", 10, "blue"]

In [12]:
# exception handling
for value in weird_list:
    if type(value) == int:
        print(value/2)
    else:
        print("Not a number!")

Not a number!
Not a number!
5.0
Not a number!


In [13]:
for value in weird_list:
    if type(value) == int:
        print(value/2)
    else:
        # passing over the variable and doing nothing. avoids the error message that wouldve otherwise happened when trying to divide strings
        pass

5.0


# Exception handling

In [14]:
# this can help if you are not sure exactly what variables and types are in the data set 
# tells python to try running the first line and if it encounters an error, it runs the 'except' line instead
for value in weird_list:
    try:
        print(value/2)
    except:
        print("Not a number!")

Not a number!
Not a number!
5.0
Not a number!


# Part 2 - Working with texts (strings)

In [15]:
import os

In [16]:
filepath = os.path.join("..", "..", "..", "cds-lang-data", "100_novels", "corpus", "Dickens_Expectations_1861.txt")

In [17]:
filepath

'../../../cds-lang-data/100_novels/corpus/Dickens_Expectations_1861.txt'

In [18]:
# used to see which directory im working in
# os.getcwd()

'/work/Language Analytics/cds-lang/notebooks'

In [19]:
with open(filepath) as f:
    text=f.read()

In [20]:
print(text)

REAT EXPECTATIONS
 1867 Edition 
by Charles Dickens
Chapter I
My father's family name being Pirrip, and my Christian name Philip, my
infant tongue could make of both names nothing longer or more explicit
than Pip. So, I called myself Pip, and came to be called Pip.
I give Pirrip as my father's family name, on the authority of his
tombstone and my sister, - Mrs. Joe Gargery, who married the blacksmith.
As I never saw my father or my mother, and never saw any likeness
of either of them  for their days were long before the days of
photographs , my first fancies regarding what they were like were
unreasonably derived from their tombstones. The shape of the letters on
my father's, gave me an odd idea that he was a square, stout, dark man,
with curly black hair. From the character and turn of the inscription,
"Also Georgiana Wife of the Above," I drew a childish conclusion that
my mother was freckled and sickly. To five little stone lozenges, each
about a foot and a half long, which were arr

In [21]:
# amount of characters in text
len(text)

991613

# Document length

In [22]:
tokens = text.split()

In [23]:
tokens

['\ufeffREAT',
 'EXPECTATIONS',
 '1867',
 'Edition',
 'by',
 'Charles',
 'Dickens',
 'Chapter',
 'I',
 'My',
 "father's",
 'family',
 'name',
 'being',
 'Pirrip,',
 'and',
 'my',
 'Christian',
 'name',
 'Philip,',
 'my',
 'infant',
 'tongue',
 'could',
 'make',
 'of',
 'both',
 'names',
 'nothing',
 'longer',
 'or',
 'more',
 'explicit',
 'than',
 'Pip.',
 'So,',
 'I',
 'called',
 'myself',
 'Pip,',
 'and',
 'came',
 'to',
 'be',
 'called',
 'Pip.',
 'I',
 'give',
 'Pirrip',
 'as',
 'my',
 "father's",
 'family',
 'name,',
 'on',
 'the',
 'authority',
 'of',
 'his',
 'tombstone',
 'and',
 'my',
 'sister,',
 '-',
 'Mrs.',
 'Joe',
 'Gargery,',
 'who',
 'married',
 'the',
 'blacksmith.',
 'As',
 'I',
 'never',
 'saw',
 'my',
 'father',
 'or',
 'my',
 'mother,',
 'and',
 'never',
 'saw',
 'any',
 'likeness',
 'of',
 'either',
 'of',
 'them',
 'for',
 'their',
 'days',
 'were',
 'long',
 'before',
 'the',
 'days',
 'of',
 'photographs',
 ',',
 'my',
 'first',
 'fancies',
 'regarding',
 'what

### Counting word frequencies

In [24]:
keyword = "love"

In [25]:
# this only counts "love", but not "Love", "love,", or other variations
# can be soled by making the text lowercase and removing punctuation
tokens.count(keyword)

43

In [27]:
# making the text lowercase
tokens = text.lower().split()

In [30]:
tokens.count(keyword)

46

In [31]:
import string

In [32]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [33]:
# removing punctuation from strings using list comprehension
    # for each characters in the token, if that character is not in the list of punctuations, keep it
    # ''.join is used to join the values with no white space in between. 
        # if you use '/', they will be joined with a /. '   ' joins them with 3 spaces, etc etc
# this can be done in a smarter way
# this also removes meaningful punctuation like apostrophes
cleaned_tokens = [''.join(char for char in token if char not in string.punctuation) for token in tokens]

In [34]:
cleaned_tokens

['\ufeffreat',
 'expectations',
 '1867',
 'edition',
 'by',
 'charles',
 'dickens',
 'chapter',
 'i',
 'my',
 'fathers',
 'family',
 'name',
 'being',
 'pirrip',
 'and',
 'my',
 'christian',
 'name',
 'philip',
 'my',
 'infant',
 'tongue',
 'could',
 'make',
 'of',
 'both',
 'names',
 'nothing',
 'longer',
 'or',
 'more',
 'explicit',
 'than',
 'pip',
 'so',
 'i',
 'called',
 'myself',
 'pip',
 'and',
 'came',
 'to',
 'be',
 'called',
 'pip',
 'i',
 'give',
 'pirrip',
 'as',
 'my',
 'fathers',
 'family',
 'name',
 'on',
 'the',
 'authority',
 'of',
 'his',
 'tombstone',
 'and',
 'my',
 'sister',
 '',
 'mrs',
 'joe',
 'gargery',
 'who',
 'married',
 'the',
 'blacksmith',
 'as',
 'i',
 'never',
 'saw',
 'my',
 'father',
 'or',
 'my',
 'mother',
 'and',
 'never',
 'saw',
 'any',
 'likeness',
 'of',
 'either',
 'of',
 'them',
 'for',
 'their',
 'days',
 'were',
 'long',
 'before',
 'the',
 'days',
 'of',
 'photographs',
 '',
 'my',
 'first',
 'fancies',
 'regarding',
 'what',
 'they',
 'we

In [35]:
cleaned_tokens.count(keyword)

60

In [36]:
from collections import Counter

In [37]:
# freq of every word, ordered
Counter(cleaned_tokens)

Counter({'the': 8143,
         'and': 7078,
         'i': 6484,
         'to': 5079,
         'of': 4431,
         'a': 4041,
         'in': 3025,
         'that': 2988,
         'was': 2836,
         'it': 2669,
         'he': 2208,
         'you': 2184,
         'had': 2093,
         'my': 2070,
         'me': 1996,
         'his': 1858,
         'as': 1773,
         'with': 1760,
         'at': 1639,
         'on': 1419,
         'for': 1381,
         '': 1377,
         'said': 1349,
         'her': 1172,
         'him': 1150,
         'have': 1084,
         'but': 1068,
         'not': 1067,
         'be': 1034,
         'she': 888,
         'when': 882,
         'by': 809,
         'were': 797,
         'so': 794,
         'out': 784,
         'if': 781,
         'we': 761,
         'this': 747,
         'all': 734,
         'mr': 711,
         'joe': 692,
         'there': 690,
         'is': 655,
         'no': 643,
         'what': 634,
         'up': 612,
         'been': 609,