In [1]:
import glob
from tika import parser
import os
import nltk
from nltk import word_tokenize
from nltk import sent_tokenize
from langdetect import detect
import pandas as pd
import string
import re
from nltk.corpus import stopwords

In [2]:
# document tokenization after text pre-preprocessing to differentiate types then token based on type

input_path = 'C:\\test'
stop_words = set(stopwords.words('english'))

# have df be document, sentences, words, pos
# do keyword searching from list
# contextualise search using pos
d = pd.DataFrame()

In [3]:
# Use Tika to parse the file
def parsewithtika(inputfile):
    parsed = parser.from_file(inputfile)
    # Extract the text content from the parsed file
    psd = parsed["content"]
    return re.sub(r'\s+', ' ', psd)

In [4]:
def tokenmakerwords(inputfile):
    # Create tokens
    tokens = word_tokenize(inputfile)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word
    import string
    stripped = [w.strip(string.punctuation) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words
    words = [w for w in words if w not in stop_words]
    text = nltk.Text(words)
    return text

In [5]:
# Language filter
def filterlanguage(inputfile):
    if detect(inputfile) != 'en':
        return True
    return False

In [6]:
# Word tokens, parts of speech tagging
def wordtokens(dataframe):
    dataframe['words'] = (dataframe['sentences'].apply(lambda x: [word_tokenize(item.strip(string.punctuation).lower())
                                                                  for item in x]))
    dataframe['words'] = (dataframe['words'].apply(lambda x: [[item for item in lst if item.isalpha()
                                                               and item not in stop_words] for lst in x]))
    dataframe['pos'] = dataframe['words'].apply(lambda x: [nltk.pos_tag(item) for item in x])
    return dataframe

In [7]:
# Main loop function
# Iterate over all files in the folder and process each one in turn
for input_file in glob.glob(os.path.join(input_path, '*.*')):
    # Grab the file name
    filename = os.path.basename(input_file)
    fname = os.path.splitext(filename)[0]
    print(filename)

    # Parse the file to get to the text
    parsed = parsewithtika(input_file)

    # Language detection algorithm is non - deterministic, which means that if you try to run it on a text which is
    # either too short or too ambiguous, you might get different results every time you run it
    if filterlanguage(parsed):
        continue

    tokenised = tokenmakerwords(parsed)

    # Ignore any documents with <50 words
    if len(tokenised) < 100:
        continue

    # Sentence fragments
    sentences = sent_tokenize(parsed)

    # Build up dataframe
    temp = pd.Series([filename, sentences])
    d = d.append(temp, ignore_index=True)

031918comments2.authcheckdam.pdf
881961_CHECKLIST-2014_rev62714.pdf
DomesticWireFunds.pdf
Order Confirmation.pdf
Orderconf.pdf
Patient+Type+2+opt-out+letter+v1.0.pdf
r003.pdf
Sample Grade and Receipt Documents.pdf
START_AM2014_QuickFireTwo.pdf
START_ECDB_ViolencePerpetratedbySupportersofAQAM_June2014.pdf
START_ISIL_Lesson1_ObjectivesScenariosforISIL.pdf
START_TranscendingOrganizationIndividualsandtheIslamicState_AnalyticalBrief_June2014.pdf
water-companies-letter-SoS-to-Ofwat-180131.pdf


In [8]:
d.reset_index(drop=True,inplace=True)
d.columns = ['document', 'sentences']

In [10]:
# Word tokenize the sentences, cleanup, parts of speech tagging
wordtokens(d)

Unnamed: 0,document,sentences,words,pos
0,031918comments2.authcheckdam.pdf,[ Section of Taxation Suite 400 1050 Connectic...,"[[section, taxation, suite, connecticut, avenu...","[[(section, NN), (taxation, NN), (suite, NN), ..."
1,881961_CHECKLIST-2014_rev62714.pdf,[ CHECKLIST-2014_rev62714 ORDER CONFIRMATION C...,"[[order, confirmation, checklistorder, confirm...","[[(order, NN), (confirmation, NN), (checklisto..."
2,DomesticWireFunds.pdf,[ ►►►►►PLEASE PRINT◄◄◄◄◄ WIRE TRANSFER PAYMENT...,"[[wire, transfer, payment, order, confirmation...","[[(wire, NN), (transfer, NN), (payment, NN), (..."
3,Order Confirmation.pdf,[ Microsoft Word - Order Confirmation.doc Orde...,"[[microsoft, word, order, order, confirmation,...","[[(microsoft, JJ), (word, NN), (order, NN), (o..."
4,Patient+Type+2+opt-out+letter+v1.0.pdf,[ (Title) (First name) (Surname) (Address line...,"[[title, first, name, surname, address, line, ...","[[(title, NN), (first, RB), (name, JJ), (surna..."
5,r003.pdf,[ 1 Von: auto-confirm@amazon.co.uk Gesendet: S...,"[[von, gesendet, samstag], [juli, betreff, ord...","[[(von, NN), (gesendet, NN), (samstag, NN)], [..."
6,Sample Grade and Receipt Documents.pdf,[ Page 1 Sample Grade and Receipt Documents Ce...,"[[page, sample, grade, receipt, documents, cen...","[[(page, NN), (sample, NN), (grade, VBD), (rec..."
7,START_AM2014_QuickFireTwo.pdf,[ U.S. Attitudes toward Terrorism and Countert...,"[[attitudes, toward, terrorism, counterterrori...","[[(attitudes, NNS), (toward, IN), (terrorism, ..."
8,START_ECDB_ViolencePerpetratedbySupportersofAQ...,[ National Consortium for the Study of Terrori...,"[[national, consortium, study, terrorism, resp...","[[(national, JJ), (consortium, NN), (study, NN..."
9,START_ISIL_Lesson1_ObjectivesScenariosforISIL.pdf,[ Microsoft Word - U_SMA SOCCENT White Paper F...,"[[microsoft, word, soccent, white, paper, fina...","[[(microsoft, JJ), (word, NN), (soccent, NN), ..."


In [13]:
print(d.head())

                                 document  \
0        031918comments2.authcheckdam.pdf   
1      881961_CHECKLIST-2014_rev62714.pdf   
2                   DomesticWireFunds.pdf   
3                  Order Confirmation.pdf   
4  Patient+Type+2+opt-out+letter+v1.0.pdf   

                                           sentences  \
0  [ Section of Taxation Suite 400 1050 Connectic...   
1  [ CHECKLIST-2014_rev62714 ORDER CONFIRMATION C...   
2  [ ►►►►►PLEASE PRINT◄◄◄◄◄ WIRE TRANSFER PAYMENT...   
3  [ Microsoft Word - Order Confirmation.doc Orde...   
4  [ (Title) (First name) (Surname) (Address line...   

                                               words  \
0  [[section, taxation, suite, connecticut, avenu...   
1  [[order, confirmation, checklistorder, confirm...   
2  [[wire, transfer, payment, order, confirmation...   
3  [[microsoft, word, order, order, confirmation,...   
4  [[title, first, name, surname, address, line, ...   

                                                 pos  \

In [12]:
d['mfreq'] = d['words'].apply(lambda x: [nltk.FreqDist(item) for item in x])

In [14]:
for word, frequency in d['mfreq'].most_common(50):
    print(u'{};{}'.format(word, frequency))

AttributeError: 'Series' object has no attribute 'most_common'