##### Reference:
1. Python PDF to Text: https://medium.com/@rqaiserr/how-to-convert-pdfs-into-searchable-key-words-with-python-85aab86c544f
2. clean text file: https://machinelearningmastery.com/clean-text-machine-learning-python/
3. count word frequency: https://programminghistorian.org/en/lessons/counting-frequencies
4. separate nouns, adjectives, verbs, etc. using pos_tag in python: https://www.nltk.org/book/ch05.html
5. meaning of pos_tagged words: https://medium.com/@gianpaul.r/tokenization-and-parts-of-speech-pos-tagging-in-pythons-nltk-library-2d30f70af13b

#### The following code is grouped into eight sections
Section 1: Import packages
<br/>Section 2: option 1 - Convert PDF to Text / option 2 - Read in Text file
<br/>Section 3: Pre-process the Text File
<br/>Section 4: Count Uni-gram Frequency
<br/>Section 5: Separate the part of speech (with stemming the content)
<br/>Section 6: Count Bi-gram Frequency
<br/>Section 7: Count 3-gram Frequency
<br/>Section 8: print everything into Excel tab

### Section 1. import packages

In [1]:
import pandas as pd
import numpy as np
import nltk
import glob
import errno
import os.path
import string
import re
import PyPDF2
from collections import Counter
from openpyxl import load_workbook
import xlsxwriter
from xlsxwriter import Workbook
from nltk.tokenize import word_tokenize
from itertools import tee, islice
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.stem.porter import PorterStemmer
# Fetch the NLTK resources this notebook relies on further down:
# the tokenizer data (word_tokenize), the stop-word lists (stopwords.words)
# and the part-of-speech tagger model (nltk.pos_tag).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hilda\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hilda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\hilda\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

### Section 2: read the converted text file

In [2]:
# Absolute path of the plain-text corpus that is analysed in this notebook.
filename = 'C:\\Users\\hilda\\OneDrive\\文档\\Indiana University Bloomington\\Research Topic - Smart City Initiatives\\Frequency counts\\Bloomingtontexts\\AllinOneBloomington.txt'
# Read the whole file into memory as one string. The context manager closes the
# handle as soon as the read finishes (the previous version left it open).
with open(filename, 'r', encoding='utf-8') as file:
    text = file.read()

### Section 3: clean the text. only keep alphabet characters and make them lowercase

##### 1) make lowercase, remove words less than 4 characters, remove punctuations

In [3]:
# Characters deleted from the corpus: ASCII punctuation plus a hand-picked set
# of extra symbols (presumably debris from the PDF-to-text conversion).
unwanted_chars = string.punctuation+'Œœ˚˜`™˘˝˛˙—\\’“”…©––↣↢·→↡⁰•▪●'
strip_table = str.maketrans('', '', unwanted_chars)

# Flatten line breaks to spaces and lowercase everything first.
clean_text = text.replace('\n',' ').lower()
# Then erase, in this order: runs of digits, and words of 1-3 characters.
for pattern in (r'\d+', r'\b\w{1,3}\b'):
    clean_text = re.sub(pattern, '', clean_text)
# Finally delete every unwanted character in a single pass.
clean_text = clean_text.translate(strip_table)
clean_text



###### 2) Tokenize

In [4]:
# Break the cleaned text string into a list of individual word tokens
# using NLTK's word_tokenize() function.
tokens = word_tokenize(clean_text)
# Bare expression: displays the token list as the cell output.
tokens

['bloomington',
 'joins',
 'green',
 'building',
 'council',
 'leed',
 'cities',
 'communities',
 'program',
 'city',
 'bloomington',
 'indiana',
 'city',
 'bloomington',
 'indiana',
 'mayor',
 'john',
 'hamilton',
 'feedback',
 'type',
 'here',
 'news',
 'release',
 'bloomington',
 'joins',
 'green',
 'building',
 'council',
 'leed',
 'cities',
 'communities',
 'program',
 'share',
 'morton',
 'street',
 'suite',
 'bloomington',
 'connect',
 'facebook',
 'more',
 'information',
 'please',
 'contact',
 'alex',
 'crowley',
 'director',
 'economic',
 'sustainable',
 'development',
 'department',
 'crowleyabloomington',
 'bloomington',
 'joins',
 'green',
 'building',
 'council',
 'leed',
 'cities',
 'communities',
 'program',
 'bloomington',
 'today',
 'city',
 'bloomington',
 'announces',
 'participation',
 'green',
 'building',
 'council',
 'usgbc',
 'leed',
 'cities',
 'communities',
 'grant',
 'program',
 'process',
 'becoming',
 'leedcertiﬁed',
 'city',
 'bloomington',
 'will',
 're

###### 2) remove punctuations, stop words, and city and state names

In [5]:
# Create a list of punctuation marks and stray symbols to eliminate. It also
# holds the empty string, a form feed, a double tab and several private-use
# code points (presumably icon-font glyphs left by the text extraction).
punctuations = ['(',')',';',':','[',']',',','-','Œ','*','$','%','/','.','&','œ','˚','~','!','@','#','˛','•','','\x0c','\ue603','\uf0e0','\uf09a','\uf099','\uf0e1','\uf098','\uf104','\uf106','\t\t','\uf0b7']
# Create a list of English stop words such as "the", "a", "and", etc.
stop_words = stopwords.words('english')
# Create a list of city and state names (lowercase) that we wish to remove.
# NOTE(review): 'in' is presumably the state abbreviation for Indiana - confirm.
city_names = ['bloomington','bloomingtons','in','indiana','indianas']

In [6]:
# Remove stop words, punctuation tokens, city/state names, URLs and over-long
# tokens from the token list to obtain the keywords.
# The three exclusion lists are merged into one set so that every token needs a
# single constant-time lookup instead of three linear list scans.
excluded_words = set(stop_words) | set(punctuations) | set(city_names)
keywords = [
    word for word in tokens
    if word not in excluded_words
    and not word.startswith("http")   # the "http" prefix also matches "https"
    and len(word) <= 20               # tokens longer than 20 characters are dropped
]
# Bare expression: displays the keyword list as the cell output.
keywords

['joins',
 'green',
 'building',
 'council',
 'leed',
 'cities',
 'communities',
 'program',
 'city',
 'city',
 'mayor',
 'john',
 'hamilton',
 'feedback',
 'type',
 'news',
 'release',
 'joins',
 'green',
 'building',
 'council',
 'leed',
 'cities',
 'communities',
 'program',
 'share',
 'morton',
 'street',
 'suite',
 'connect',
 'facebook',
 'information',
 'please',
 'contact',
 'alex',
 'crowley',
 'director',
 'economic',
 'sustainable',
 'development',
 'department',
 'crowleyabloomington',
 'joins',
 'green',
 'building',
 'council',
 'leed',
 'cities',
 'communities',
 'program',
 'today',
 'city',
 'announces',
 'participation',
 'green',
 'building',
 'council',
 'usgbc',
 'leed',
 'cities',
 'communities',
 'grant',
 'program',
 'process',
 'becoming',
 'leedcertiﬁed',
 'city',
 'receive',
 'ﬁnancial',
 'assistance',
 'educational',
 'resources',
 'technical',
 'support',
 'increase',
 'enhance',
 'practices',
 'support',
 'sustainable',
 'healthy',
 'equitable',
 'life',
 

### Section 4: count unigram frequency

##### 3)count word frequency

In [7]:
# Tally how often each single keyword occurs. ngrams(..., 1) wraps every word
# in a 1-tuple, so the counter keys look like ('city',).
unigram_counter = Counter(ngrams(keywords, 1))
# List of (ngram, count) pairs, most frequent first.
word_frequency = unigram_counter.most_common()
word_frequency

[(('city',), 1578),
 (('housing',), 1465),
 (('bike',), 1217),
 (('plan',), 1137),
 (('neighborhood',), 968),
 (('community',), 918),
 (('lane',), 885),
 (('public',), 848),
 (('street',), 758),
 (('transportation',), 715),
 (('development',), 671),
 (('greenway',), 633),
 (('shall',), 560),
 (('provide',), 555),
 (('bicycle',), 546),
 (('services',), 545),
 (('road',), 541),
 (('state',), 498),
 (('program',), 440),
 (('streets',), 437),
 (('traffic',), 430),
 (('protected',), 429),
 (('path',), 421),
 (('needs',), 415),
 (('walnut',), 412),
 (('downtown',), 405),
 (('local',), 378),
 (('service',), 375),
 (('also',), 366),
 (('units',), 351),
 (('facilities',), 349),
 (('areas',), 349),
 (('number',), 344),
 (('area',), 344),
 (('support',), 343),
 (('water',), 340),
 (('multi‐',), 338),
 (('data',), 336),
 (('monroe',), 335),
 (('need',), 327),
 (('control',), 325),
 (('assistance',), 322),
 (('high',), 314),
 (('parking',), 310),
 (('county',), 308),
 (('planning',), 308),
 (('affo

##### 4) make counts into a table (DataFrame)

In [8]:
# Turn the (ngram, count) pairs into a two-column table.
# Each pair from most_common() is (word tuple, count), so the first column holds
# the words and the second the counts. The labels were previously swapped
# ('counts' held the words and 'words' held the counts).
unigram_frequency_pd = pd.DataFrame(word_frequency, columns=['words', 'counts'])
# Keep the explicit column order: words first, then counts.
unigram_frequency_pd = unigram_frequency_pd[['words', 'counts']]

### Section 5: separate nouns, verbs, adjectives, adverbs, etc.

##### 1) Stem words - reduce words into their basic forms. e.g. 'fished','fisher' -> 'fish'

**Note:** stemming is aggressive: not only does 'cities' become 'citi', but 'community' becomes 'commun' too.

In [9]:
# Reduce every keyword to its Porter stem (e.g. 'cities' -> 'citi').
porter = PorterStemmer()
stemmed_keywords = list(map(porter.stem, keywords))
# Bare expression: displays the stemmed list as the cell output.
stemmed_keywords

['join',
 'green',
 'build',
 'council',
 'leed',
 'citi',
 'commun',
 'program',
 'citi',
 'citi',
 'mayor',
 'john',
 'hamilton',
 'feedback',
 'type',
 'news',
 'releas',
 'join',
 'green',
 'build',
 'council',
 'leed',
 'citi',
 'commun',
 'program',
 'share',
 'morton',
 'street',
 'suit',
 'connect',
 'facebook',
 'inform',
 'pleas',
 'contact',
 'alex',
 'crowley',
 'director',
 'econom',
 'sustain',
 'develop',
 'depart',
 'crowleyabloomington',
 'join',
 'green',
 'build',
 'council',
 'leed',
 'citi',
 'commun',
 'program',
 'today',
 'citi',
 'announc',
 'particip',
 'green',
 'build',
 'council',
 'usgbc',
 'leed',
 'citi',
 'commun',
 'grant',
 'program',
 'process',
 'becom',
 'leedcertiﬁ',
 'citi',
 'receiv',
 'ﬁnancial',
 'assist',
 'educ',
 'resourc',
 'technic',
 'support',
 'increas',
 'enhanc',
 'practic',
 'support',
 'sustain',
 'healthi',
 'equit',
 'life',
 'usgbc',
 'leed',
 'program',
 'also',
 'support',
 'citi',
 'exist',
 'effort',
 'track',
 'verifi',
 'p

##### 2) separate part of speech
NN = noun, singular;VB = verb;JJ = adjective;

In [10]:
# Pair every stemmed keyword with a part-of-speech tag (e.g. NN, VB, JJ),
# producing a list of (word, tag) tuples.
# NOTE(review): the tagger is given stems rather than the original words; tags
# such as ('citi', 'JJ') in the output suggest this hurts accuracy - consider
# tagging `keywords` first and stemming afterwards.
tagged_keywords = nltk.pos_tag(stemmed_keywords)  #separate part of speech
# Bare expression: displays the tagged list as the cell output.
tagged_keywords

[('join', 'NN'),
 ('green', 'JJ'),
 ('build', 'NN'),
 ('council', 'NN'),
 ('leed', 'VBD'),
 ('citi', 'JJ'),
 ('commun', 'NN'),
 ('program', 'NN'),
 ('citi', 'NN'),
 ('citi', 'NN'),
 ('mayor', 'NN'),
 ('john', 'NN'),
 ('hamilton', 'NN'),
 ('feedback', 'NN'),
 ('type', 'NN'),
 ('news', 'NN'),
 ('releas', 'NNS'),
 ('join', 'VBP'),
 ('green', 'JJ'),
 ('build', 'NN'),
 ('council', 'NN'),
 ('leed', 'VBD'),
 ('citi', 'JJ'),
 ('commun', 'NN'),
 ('program', 'NN'),
 ('share', 'NN'),
 ('morton', 'NNP'),
 ('street', 'NN'),
 ('suit', 'NN'),
 ('connect', 'VBP'),
 ('facebook', 'NN'),
 ('inform', 'NN'),
 ('pleas', 'NNS'),
 ('contact', 'VBP'),
 ('alex', 'JJ'),
 ('crowley', 'NN'),
 ('director', 'NN'),
 ('econom', 'VBD'),
 ('sustain', 'JJ'),
 ('develop', 'VB'),
 ('depart', 'JJ'),
 ('crowleyabloomington', 'NN'),
 ('join', 'NN'),
 ('green', 'JJ'),
 ('build', 'NN'),
 ('council', 'NN'),
 ('leed', 'VBD'),
 ('citi', 'JJ'),
 ('commun', 'NN'),
 ('program', 'NN'),
 ('today', 'NN'),
 ('citi', 'VBP'),
 ('announc', 

##### 3) save nouns, verbs, and adjectives

In [11]:
# Tag groups that define each part of speech.
noun_tags = {'NN', 'NNP', 'NNS', 'NNPS'}
verb_tags = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}
adj_tags = {'JJ', 'JJR', 'JJS'}

# Every occurrence of a word tagged as a noun (duplicates kept for counting).
nouns = [word for word, tag in tagged_keywords if tag in noun_tags]
# Set of distinct nouns: gives a constant-time membership test below instead of
# scanning the whole `nouns` list once per token.
noun_vocabulary = set(nouns)
# Verb-tagged words, excluding any word that was tagged as a noun anywhere.
verbs = [word for word, tag in tagged_keywords
         if tag in verb_tags and word not in noun_vocabulary]
# Every occurrence of a word tagged as an adjective.
adjs = [word for word, tag in tagged_keywords if tag in adj_tags]

count POS frequency and save into dataframes

In [12]:
# Count each part-of-speech list the same way as the unigrams: wrap every word
# in a 1-tuple, tally, and sort from most to least frequent.
nouns_frequency, verbs_frequency, adjs_frequency = (
    Counter(ngrams(pos_words, 1)).most_common()
    for pos_words in (nouns, verbs, adjs)
)

In [13]:
# Convert the (ngram, count) pairs into two-column tables.
# The first column holds the word tuples and the second the counts, so the
# labels are listed in that order. Previously the nouns table had two columns
# both named 'nouns', and the verbs/adjs tables had their labels swapped.
nouns_frequency_pd = pd.DataFrame(nouns_frequency, columns=['nouns', 'counts'])
verbs_frequency_pd = pd.DataFrame(verbs_frequency, columns=['verbs', 'counts'])
adjs_frequency_pd = pd.DataFrame(adjs_frequency, columns=['adjs', 'counts'])

### Section 6: Count bi-gram frequency 

In [14]:
# Tally every pair of adjacent keywords.
bigram_counter = Counter(ngrams(keywords, 2))
# List of ((word1, word2), count) pairs, most frequent first.
bigram_frequency = bigram_counter.most_common()
bigram_frequency

[(('bike', 'lane'), 796),
 (('neighborhood', 'greenway'), 619),
 (('greenway', 'neighborhood'), 486),
 (('protected', 'bike'), 395),
 (('state', 'road'), 348),
 (('multi‐', 'path'), 338),
 (('lane', 'protected'), 327),
 (('consolidated', 'plan'), 307),
 (('plan', 'control'), 251),
 (('lane', 'multi‐'), 242),
 (('monroe', 'county'), 239),
 (('affordable', 'housing'), 227),
 (('path', 'bike'), 210),
 (('road', 'state'), 168),
 (('lane', 'bike'), 157),
 (('comprehensive', 'plan'), 150),
 (('housing', 'problems'), 148),
 (('walnut', 'walnut'), 147),
 (('housing', 'units'), 111),
 (('dunn', 'dunn'), 108),
 (('lead', 'agency'), 104),
 (('bike', 'lanes'), 94),
 (('rogers', 'rogers'), 92),
 (('road', 'bypass'), 87),
 (('data', 'source'), 87),
 (('complete', 'streets'), 86),
 (('lincoln', 'lincoln'), 86),
 (('grant', 'grant'), 83),
 (('path', 'multi‐'), 81),
 (('public', 'housing'), 79),
 (('economic', 'development'), 75),
 (('attached', 'attached'), 75),
 (('solid', 'waste'), 72),
 (('smith', 

In [15]:
# Turn the ((word1, word2), count) pairs into a two-column table.
# The first element of each pair is the word tuple and the second is its count;
# the labels were previously swapped ('counts' held the words).
bigram_frequency_pd = pd.DataFrame(bigram_frequency, columns=['words', 'counts'])
# Keep the explicit column order: words first, then counts.
bigram_frequency_pd = bigram_frequency_pd[['words', 'counts']]

### Section 7: count 3-gram frequency

In [16]:
# Tally every run of three adjacent keywords.
trigram_counter = Counter(ngrams(keywords, 3))
# List of ((word1, word2, word3), count) pairs, most frequent first.
trigram_frequency = trigram_counter.most_common()
trigram_frequency

[(('neighborhood', 'greenway', 'neighborhood'), 486),
 (('greenway', 'neighborhood', 'greenway'), 486),
 (('protected', 'bike', 'lane'), 371),
 (('lane', 'protected', 'bike'), 327),
 (('bike', 'lane', 'protected'), 326),
 (('consolidated', 'plan', 'control'), 250),
 (('bike', 'lane', 'multi‐'), 242),
 (('lane', 'multi‐', 'path'), 242),
 (('path', 'bike', 'lane'), 203),
 (('multi‐', 'path', 'bike'), 202),
 (('state', 'road', 'state'), 168),
 (('road', 'state', 'road'), 163),
 (('bike', 'lane', 'bike'), 157),
 (('lane', 'bike', 'lane'), 157),
 (('walnut', 'walnut', 'walnut'), 119),
 (('dunn', 'dunn', 'dunn'), 92),
 (('state', 'road', 'bypass'), 87),
 (('multi‐', 'path', 'multi‐'), 81),
 (('path', 'multi‐', 'path'), 81),
 (('rogers', 'rogers', 'rogers'), 77),
 (('lincoln', 'lincoln', 'lincoln'), 73),
 (('grant', 'grant', 'grant'), 68),
 (('affordable', 'housing', 'units'), 63),
 (('smith', 'smith', 'smith'), 63),
 (('road', 'bypass', 'state'), 58),
 (('bypass', 'state', 'road'), 58),
 (('

In [17]:
# Turn the ((word1, word2, word3), count) pairs into a two-column table.
# The first element of each pair is the word tuple and the second is its count;
# the labels were previously swapped ('counts' held the words).
trigram_frequency_pd = pd.DataFrame(trigram_frequency, columns=['words', 'counts'])
# Keep the explicit column order: words first, then counts.
trigram_frequency_pd = trigram_frequency_pd[['words', 'counts']]

### Section 8: print everything into Excel tab named with the name of the city

##### 1) create a new tab named with the city name

In [18]:
# Excel workbook that receives the frequency tables. The path is relative, so
# it is resolved against the notebook's current working directory.
f = 'Bloomington-Frequency.xlsx'

In [19]:
# Open the existing workbook, append one new worksheet and write it back.
# load_workbook() requires the file at `f` to exist already.
wb2 = load_workbook(f)
# NOTE(review): no title is passed, so the sheet gets a default name rather
# than the city name the heading describes - confirm the intended title.
wb2.create_sheet()
# NOTE(review): the following cell reopens the same path with the xlsxwriter
# engine, which presumably writes a brand-new file - verify whether the sheet
# created here is expected to survive that step.
wb2.save(f)

##### 2) save the Dataframes in this tab

In [20]:
# Write all six frequency tables side by side on one worksheet.
# The context manager saves and closes the workbook exactly once on exit. The
# previous code called writer.save() and then workbook.close(), closing the
# same workbook twice, and ExcelWriter.save() no longer exists in pandas >= 2.0.
# NOTE(review): no sheet_name is given, so pandas uses its default sheet name
# rather than a tab named after the city - confirm the intended tab name.
with pd.ExcelWriter(f, engine='xlsxwriter') as writer:
    workbook = writer.book
    frequency_tables = (
        unigram_frequency_pd,
        bigram_frequency_pd,
        trigram_frequency_pd,
        nouns_frequency_pd,
        verbs_frequency_pd,
        adjs_frequency_pd,
    )
    # Each table occupies three columns (index + two data columns); starting a
    # new table every four columns (0, 4, 8, 12, 16, 20) leaves a blank spacer.
    for position, table in enumerate(frequency_tables):
        table.to_excel(writer, startrow=0, startcol=4 * position, header=True)