In [1]:
import glob
from tika import parser
import os
import nltk
from nltk import word_tokenize
from nltk import sent_tokenize
from langdetect import detect
import pandas as pd
import string
import re
from nltk.corpus import stopwords

In [2]:
# Configuration and shared state for the document-tokenization pipeline.
# Documents are tokenized after text pre-processing.

# Folder that is scanned for input documents.
input_path = 'C:\\test'
# English stop words from the NLTK corpus, kept as a set for membership tests.
stop_words = set(stopwords.words('english'))

# Plan: the frame should hold document, sentences, words and POS tags;
# keyword searching is done from a list, and the search is to be
# contextualised using the POS tags.
d = pd.DataFrame()

In [3]:
# Use Tika to parse the file
def parsewithtika(inputfile):
    """Extract the text of a file with Apache Tika and collapse its whitespace.

    Args:
        inputfile: Path of the file handed to ``parser.from_file``.

    Returns:
        The extracted text with every run of whitespace replaced by a single
        space, or an empty string when Tika returns no content.
    """
    parsed = parser.from_file(inputfile)
    # "content" is None when Tika finds no extractable text (for example an
    # image-only PDF); fall back to '' so re.sub does not raise TypeError.
    psd = parsed.get("content") or ""
    return re.sub(r'\s+', ' ', psd)

In [4]:
def tokenmakerwords(inputfile):
    """Tokenize text into lower-cased alphabetic words with stop words removed.

    Args:
        inputfile: The text to tokenize (a string of text, not a file path).

    Returns:
        An ``nltk.Text`` built from the cleaned tokens, in their original order.
    """
    # Lower-case each token and trim punctuation from both ends. This uses the
    # notebook's top-level `string` import instead of re-importing it here.
    cleaned = (
        token.lower().strip(string.punctuation)
        for token in word_tokenize(inputfile)
    )
    # Keep purely alphabetic tokens that are not English stop words.
    words = [w for w in cleaned if w.isalpha() and w not in stop_words]
    return nltk.Text(words)

In [5]:
# Language filter
def filterlanguage(inputfile):
    """Tell whether a text should be filtered out for not being English.

    Args:
        inputfile: The text to classify (a string of text, not a file path).

    Returns:
        True when the detected language is not 'en' or when no language can be
        detected at all; False when the text is detected as English.
    """
    # Local import: the exception class lives in the langdetect package that
    # the notebook already depends on for `detect`.
    from langdetect.lang_detect_exception import LangDetectException

    try:
        return detect(inputfile) != 'en'
    except LangDetectException:
        # detect() raises when the text has no usable features (empty text,
        # or only digits/punctuation). Treat that as "not English" so one
        # unreadable document does not abort the whole run.
        return True

In [73]:
# Word tokens, parts of speech tagging
def wordtokens(dataframe):
    """Add token, POS-tag, cleaned-word and frequency columns to a frame.

    Args:
        dataframe: DataFrame with a 'sentences' column whose cells are lists
            of sentence strings.

    Returns:
        The same DataFrame object, modified in place, with these columns added:
        'words'    - one token list per sentence,
        'pos'      - one list of (token, tag) pairs per sentence,
        'allwords' - flat list of lower-cased alphabetic non-stop-word tokens,
        'mfreq'    - ``nltk.FreqDist`` over 'allwords'.
    """
    dataframe['words'] = dataframe['sentences'].apply(
        lambda sents: [word_tokenize(sent) for sent in sents]
    )
    dataframe['pos'] = dataframe['words'].apply(
        lambda sents: [nltk.pos_tag(tokens) for tokens in sents]
    )

    def _clean(sents):
        # Flatten the per-sentence token lists, trim punctuation, lower-case,
        # then keep alphabetic tokens that are not stop words.
        flat = (tok.strip(string.punctuation).lower() for tokens in sents for tok in tokens)
        return [tok for tok in flat if tok.isalpha() and tok not in stop_words]

    # Read from the `dataframe` argument rather than the notebook-global `d`,
    # so the function works for any frame passed in.
    dataframe['allwords'] = dataframe['words'].apply(_clean)
    dataframe['mfreq'] = dataframe['allwords'].apply(nltk.FreqDist)
    return dataframe

In [6]:
# Main loop
# Iterate over all files in the folder and process each one in turn,
# collecting one [filename, sentences] row per accepted document.

# Documents with fewer cleaned word tokens than this are skipped.
MIN_TOKENS = 100

rows = []
for input_file in glob.glob(os.path.join(input_path, '*.*')):
    # Grab the file name
    filename = os.path.basename(input_file)
    fname = os.path.splitext(filename)[0]
    print(filename)

    # Parse the file to get to the text
    parsed = parsewithtika(input_file)

    # Language detection is non-deterministic: on a text that is too short or
    # too ambiguous, the result may differ from run to run.
    if filterlanguage(parsed):
        continue

    tokenised = tokenmakerwords(parsed)

    # Ignore any documents with fewer than MIN_TOKENS words
    if len(tokenised) < MIN_TOKENS:
        continue

    # Sentence fragments
    sentences = sent_tokenize(parsed)
    rows.append([filename, sentences])

# Build the frame once from the collected rows. DataFrame.append was removed in
# pandas 2.0, and growing a frame row by row is quadratic. The columns are
# labelled 0 and 1, exactly as the per-row append produced.
d = pd.DataFrame(rows, columns=[0, 1])

031918comments2.authcheckdam.pdf
881961_CHECKLIST-2014_rev62714.pdf
DomesticWireFunds.pdf
Order Confirmation.pdf
Orderconf.pdf
Patient+Type+2+opt-out+letter+v1.0.pdf
r003.pdf
Sample Grade and Receipt Documents.pdf
START_AM2014_QuickFireTwo.pdf
START_ECDB_ViolencePerpetratedbySupportersofAQAM_June2014.pdf
START_ISIL_Lesson1_ObjectivesScenariosforISIL.pdf
START_TranscendingOrganizationIndividualsandtheIslamicState_AnalyticalBrief_June2014.pdf
water-companies-letter-SoS-to-Ofwat-180131.pdf


In [7]:
# Renumber the rows from zero, then give the two columns their names.
d = d.reset_index(drop=True)
d.columns = ['document', 'sentences']

In [74]:
# Word-tokenize the sentences, clean up the tokens and tag parts of speech.
# wordtokens adds its columns to d and returns the frame, which the cell
# then displays as its last expression.
wordtokens(d)

Unnamed: 0,document,sentences,words,pos,allwords,mfreq
0,031918comments2.authcheckdam.pdf,[ Section of Taxation Suite 400 1050 Connectic...,"[[Section, of, Taxation, Suite, 400, 1050, Con...","[[(Section, NN), (of, IN), (Taxation, NNP), (S...","[section, taxation, suite, connecticut, avenue...","{'section': 25, 'taxation': 9, 'suite': 1, 'co..."
1,881961_CHECKLIST-2014_rev62714.pdf,[ CHECKLIST-2014_rev62714 ORDER CONFIRMATION C...,"[[CHECKLIST-2014_rev62714, ORDER, CONFIRMATION...","[[(CHECKLIST-2014_rev62714, JJ), (ORDER, NNP),...","[order, confirmation, checklistorder, confirma...","{'order': 11, 'confirmation': 2, 'checklistord..."
2,DomesticWireFunds.pdf,[ ►►►►►PLEASE PRINT◄◄◄◄◄ WIRE TRANSFER PAYMENT...,"[[►►►►►PLEASE, PRINT◄◄◄◄◄, WIRE, TRANSFER, PAY...","[[(►►►►►PLEASE, NN), (PRINT◄◄◄◄◄, NNP), (WIRE,...","[wire, transfer, payment, order, confirmation,...","{'wire': 1, 'transfer': 3, 'payment': 3, 'orde..."
3,Order Confirmation.pdf,[ Microsoft Word - Order Confirmation.doc Orde...,"[[Microsoft, Word, -, Order, Confirmation.doc,...","[[(Microsoft, NNP), (Word, NNP), (-, :), (Orde...","[microsoft, word, order, order, confirmation, ...","{'microsoft': 1, 'word': 1, 'order': 6, 'confi..."
4,Patient+Type+2+opt-out+letter+v1.0.pdf,[ (Title) (First name) (Surname) (Address line...,"[[(, Title, ), (, First, name, ), (, Surname, ...","[[((, (), (Title, NN), (), )), ((, (), (First,...","[title, first, name, surname, address, line, a...","{'title': 2, 'first': 1, 'name': 1, 'surname':..."
5,r003.pdf,[ 1 Von: auto-confirm@amazon.co.uk Gesendet: S...,"[[1, Von, :, auto-confirm, @, amazon.co.uk, Ge...","[[(1, CD), (Von, NNS), (:, :), (auto-confirm, ...","[von, gesendet, samstag, juli, betreff, order,...","{'von': 1, 'gesendet': 1, 'samstag': 1, 'juli'..."
6,Sample Grade and Receipt Documents.pdf,[ Page 1 Sample Grade and Receipt Documents Ce...,"[[Page, 1, Sample, Grade, and, Receipt, Docume...","[[(Page, NN), (1, CD), (Sample, NNP), (Grade, ...","[page, sample, grade, receipt, documents, cent...","{'page': 25, 'sample': 1, 'grade': 52, 'receip..."
7,START_AM2014_QuickFireTwo.pdf,[ U.S. Attitudes toward Terrorism and Countert...,"[[U.S., Attitudes, toward, Terrorism, and, Cou...","[[(U.S., NNP), (Attitudes, NNP), (toward, IN),...","[attitudes, toward, terrorism, counterterroris...","{'attitudes': 2, 'toward': 2, 'terrorism': 20,..."
8,START_ECDB_ViolencePerpetratedbySupportersofAQ...,[ National Consortium for the Study of Terrori...,"[[National, Consortium, for, the, Study, of, T...","[[(National, NNP), (Consortium, NNP), (for, IN...","[national, consortium, study, terrorism, respo...","{'national': 26, 'consortium': 22, 'study': 23..."
9,START_ISIL_Lesson1_ObjectivesScenariosforISIL.pdf,[ Microsoft Word - U_SMA SOCCENT White Paper F...,"[[Microsoft, Word, -, U_SMA, SOCCENT, White, P...","[[(Microsoft, NNP), (Word, NNP), (-, :), (U_SM...","[microsoft, word, soccent, white, paper, final...","{'microsoft': 1, 'word': 1, 'soccent': 1, 'whi..."


In [49]:
# Preview the first rows; a bare last-line expression gets the notebook's
# rich table display instead of the plain-text repr that print() produces.
d.head()

                                 document  \
0        031918comments2.authcheckdam.pdf   
1      881961_CHECKLIST-2014_rev62714.pdf   
2                   DomesticWireFunds.pdf   
3                  Order Confirmation.pdf   
4  Patient+Type+2+opt-out+letter+v1.0.pdf   

                                           sentences  \
0  [ Section of Taxation Suite 400 1050 Connectic...   
1  [ CHECKLIST-2014_rev62714 ORDER CONFIRMATION C...   
2  [ ►►►►►PLEASE PRINT◄◄◄◄◄ WIRE TRANSFER PAYMENT...   
3  [ Microsoft Word - Order Confirmation.doc Orde...   
4  [ (Title) (First name) (Surname) (Address line...   

                                               words  \
0  [[Section, Taxation, Suite, Connecticut, Avenu...   
1  [[ORDER, CONFIRMATION, CHECKLISTORDER, CONFIRM...   
2  [[WIRE, TRANSFER, PAYMENT, ORDER, CONFIRMATION...   
3  [[Microsoft, Word, Order, Order, Confirmation,...   
4  [[Title, First, name, Surname, Address, line, ...   

                                                 pos  \

In [30]:
# NOTE(review): this only references the bound method `agg` without calling
# it, so the cell shows the method's repr (which embeds the Series). It looks
# like leftover exploration - confirm whether an aggregation was intended.
d['mfreq'].agg

<bound method Series.aggregate of 0     [{'section': 1, 'taxation': 1, 'suite': 1, 'co...
1     [{'order': 2, 'confirmation': 2, 'checklistord...
2     [{'wire': 1, 'transfer': 2, 'payment': 3, 'ord...
3     [{'microsoft': 1, 'word': 1, 'order': 2, 'conf...
4     [{'title': 1, 'first': 1, 'name': 1, 'surname'...
5     [{'von': 1, 'gesendet': 1, 'samstag': 1}, {'ju...
6     [{'page': 2, 'sample': 1, 'grade': 11, 'receip...
7     [{'attitudes': 2, 'toward': 2, 'terrorism': 5,...
8     [{'national': 2, 'consortium': 2, 'study': 2, ...
9     [{'microsoft': 1, 'word': 1, 'soccent': 1, 'wh...
10    [{'start': 2, 'analytical': 2, 'brief': 2, 'ju...
11    [{'rt': 1, 'hon': 1, 'michael': 1, 'gove': 1, ...
Name: mfreq, dtype: object>

In [66]:
# Keywords to look for in each document's 'allwords' list. They are written
# in lower case because the 'allwords' tokens are lower-cased.
keywords = ['isis', 'terrorism', 'bomb', 'consortium']

In [67]:
from collections import defaultdict

# Map each keyword to the documents whose cleaned word list contains it.
# A keyword that matches nothing never becomes a key.
word_matches = defaultdict(list)
for keyword in keywords:
    for doc_name, doc_words in zip(d['document'], d['allwords']):
        if keyword not in doc_words:
            continue
        hits = word_matches[keyword]
        # Record each document at most once per keyword.
        if doc_name not in hits:
            hits.append(doc_name)

for keyword, documents in word_matches.items():
    print(keyword, documents)


isis ['START_TranscendingOrganizationIndividualsandtheIslamicState_AnalyticalBrief_June2014.pdf']
terrorism ['START_AM2014_QuickFireTwo.pdf', 'START_ECDB_ViolencePerpetratedbySupportersofAQAM_June2014.pdf', 'START_ISIL_Lesson1_ObjectivesScenariosforISIL.pdf', 'START_TranscendingOrganizationIndividualsandtheIslamicState_AnalyticalBrief_June2014.pdf']
bomb ['START_ECDB_ViolencePerpetratedbySupportersofAQAM_June2014.pdf']
consortium ['START_AM2014_QuickFireTwo.pdf', 'START_ECDB_ViolencePerpetratedbySupportersofAQAM_June2014.pdf', 'START_TranscendingOrganizationIndividualsandtheIslamicState_AnalyticalBrief_June2014.pdf']
