In [1]:
import glob
from tika import parser
import os
import nltk
from nltk import word_tokenize
from nltk import sent_tokenize
from langdetect import detect
import pandas as pd
import string
import re
from nltk.corpus import stopwords
from collections import defaultdict
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from nltk.stem import PorterStemmer
import spacy
import en_core_web_sm  # or any other model you downloaded via spacy download or pip

In [2]:
nlp = en_core_web_sm.load()


pstemmer = PorterStemmer()

input_path = 'C:\\t2'
stop_words = set(stopwords.words('english'))
keywords = ['IS', 'terrorism', 'bomb', 'is', 'the', 'consortium']
filterkeywords = [w for w in keywords if w not in stop_words]
poskeywords = nltk.pos_tag(filterkeywords)

# If the first keyword is a verb, move it and reparse the list
if poskeywords[0][1] == 'VBZ':
    filterkeywords.insert(1, filterkeywords.pop(0))
    poskeywords = nltk.pos_tag(filterkeywords)

stemkeywords = nltk.pos_tag([pstemmer.stem(t) for t in filterkeywords])


# Set up Dataframe
d = pd.DataFrame()

# Create a list to use for clustering
doclist = []
word_matches = defaultdict(list)
globalents = []

In [3]:
# Use Tika to parse the file
def parsewithtika(inputfile):
    parsed = parser.from_file(inputfile)
    # Extract the text content from the parsed file
    psd = parsed["content"]
    return re.sub(r'\s+', ' ', psd)

In [4]:
# Return NLTK text from the document - used to filter out short documents but may
# also be used for further processing in future dev
def tokenmakerwords(inputfile):
    # Create tokens
    tokens = word_tokenize(inputfile)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word
    import string
    stripped = [w.strip(string.punctuation) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words
    words = [w for w in words if w not in stop_words]
    text = nltk.Text(words)
    return text

In [5]:
# Language filter
def filterlanguage(inputfile):
    if detect(inputfile) != 'en':
        return True
    return False

In [6]:
def pos(x):
    return [(token.text, token.tag_) for token in x ]

In [7]:
# find limit, go back to old version, or pipelines, or indexing, try apply
# flatten list here so don't need to do both
def spacy_pos(x):
    pos_sent = []
    for sentence in x:
        processed_spacy = nlp(sentence)
        for ent in processed_spacy.ents:
            globalents.append(ent.text + " " + ent.label_)
        pos_sent.append(pos(processed_spacy))
    return pos_sent

In [8]:
# Word tokens, parts of speech tagging
def wordtokens(dataframe):
    dataframe['words'] = (dataframe['sentences'].apply(lambda x: [word_tokenize(item) for item in x]))
    dataframe['pos'] = dataframe['sentences'].map(spacy_pos)
    dataframe['posnltk'] = dataframe['words'].apply(lambda x: [nltk.pos_tag(item) for item in x])
    dataframe['allwords'] = dataframe['words'].apply(lambda x: [item.strip(string.punctuation).lower()
                                                                for sublist in x for item in sublist])
    dataframe['allwords'] = (dataframe['allwords'].apply(lambda x: [item for item in x if item.isalpha()
                                                                    and item not in stop_words]))
    dataframe['mfreq'] = dataframe['allwords'].apply(nltk.FreqDist)
    dataframe['poslist'] = dataframe['pos'].apply(lambda x: [item for sublist in x for item in sublist])
    dataframe['mfreqpos'] = dataframe['poslist'].apply(nltk.FreqDist)
    dataframe['stemwords'] = dataframe['words'].apply(lambda x: [pstemmer.stem(item) for sublist in x
                                                                 for item in sublist])
    dataframe['stemwords'] = (dataframe['stemwords'].apply(lambda x: [item for item in x if item.isalpha()
                                                                      and item not in stop_words]))
    dataframe['mfreqstem'] = dataframe['stemwords'].apply(nltk.FreqDist)

    return dataframe

In [9]:
# Score documents based on cleansed dataset - so should discount stopwords and be sensible
def scoring(dataframe, list):
    for word in keywords:
        for idx, row in dataframe.iterrows():
            if word in row['allwords']:
                if not row['document'] in list[word]:
                    list[word].append(row['document'])
                    dataframe.loc[idx, 'score'] += (row['mfreq'][word] * 0.75)
    return dataframe

In [10]:
# Score documents based on pos - should be most exact match
def scoringpos(dataframe, list):
    for (w1, t1) in poskeywords:
        for idx, row in dataframe.iterrows():
            if (w1, t1) in row['poslist']:
                if not row['document'] in list[w1]:
                    list[w1].append(row['document'])
                    dataframe.loc[idx, 'score'] += row['mfreqpos'][(w1, t1)]
    return dataframe

In [11]:
# Score documents based on cleansed dataset - so should discount stopwords and be sensible
def scoringstem(dataframe, list):
    for word in stemkeywords:
        for idx, row in dataframe.iterrows():
            if word in row['stemwords']:
                if not row['document'] in list[word]:
                    list[word].append(row['document'])
                    dataframe.loc[idx, 'score'] += (row['mfreqstem'][word] * 0.5)
    return dataframe

In [12]:
# Find keywords using POS
def contextkeywords(dataframe):
    print('\n')
    print('Here are the exact keyword matches in context: ')
    for (w1, t1) in poskeywords:
        for idx, row in dataframe.iterrows():
            for index, r in enumerate(row['pos']):
                if (w1, t1) in r:
                    print(row['document'] + ' - ' + ' '.join(row['words'][index]))
    return dataframe

In [13]:
# Sort using a dirty model
def dirtyscoring(dataframe):
    dataframe['score2'] = 0
    dataframe['w2'] = dataframe['words'].apply(lambda x: [item for sublist in x for item in sublist])
    dataframe['mfreq2'] = dataframe['w2'].apply(nltk.FreqDist)

    word_matches = defaultdict(list)
    for word in keywords:
        for idx, row in dataframe.iterrows():
            if word in row['w2']:
                dataframe.loc[idx, 'score2'] += row['mfreq2'][word]
                if not row['document'] in word_matches[word]:
                    word_matches[word].append(row['document'])
    print('\n')
    print('The following keyword hits occurred in the uncleansed data:')

    for key, val in word_matches.items():
        print("Keyword: " + key + ". Found in these documents: ")
        print(val)

    return dataframe

In [14]:
def printkeywordmatches(list):
    for key, val in list.items():
        print("Keyword: " + key + ". Found in these documents: ")
        print(val)

In [15]:
def tokenize_and_stem(text):
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [pstemmer.stem(t) for t in filtered_tokens]
    return stems

In [16]:
# Cluster documents and demonstrate prediction
# TODO - calculate ideal k value
def clustering(documents):
    vectorizer = TfidfVectorizer(stop_words='english', max_df=0.8, min_df=0.2, use_idf=True,
                                 tokenizer=tokenize_and_stem, ngram_range=(1, 3))
    X = vectorizer.fit_transform(doclist)

    true_k = 4
    model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
    model.fit(X)

    print("Top terms per cluster:")
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()
    for i in range(true_k):
        print("Cluster %d:" % i),
        for ind in order_centroids[i, :10]:
            print(' %s' % terms[ind]),
        print

    print("\n")
    print("Prediction")

    Y = vectorizer.transform(["this is a document about islamic state "
                              "and terrorists and bombs IS jihad terrorism isil"])
    prediction = model.predict(Y)
    print("A document with 'bad' terms would be in:")
    print(prediction)

    Y = vectorizer.transform(["completely innocent text just about kittens and puppies"])
    prediction = model.predict(Y)
    print("A document with 'good' terms would be in:")
    print(prediction)

In [17]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))


def nmflda(documentlist):
    no_features = 1000

    # NMF is able to use tf-idf
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
    tfidf = tfidf_vectorizer.fit_transform(documentlist)
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

    # LDA can only use raw term counts for LDA because it is a probabilistic graphical model
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
    tf = tf_vectorizer.fit_transform(documentlist)
    tf_feature_names = tf_vectorizer.get_feature_names()

    no_topics = 5

    # Run NMF
    nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

    # Run LDA
    lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5,
                                    learning_method='online', learning_offset=50.,random_state=0).fit(tf)

    no_top_words = 10
    print("NMF Topics: ")
    display_topics(nmf, tfidf_feature_names, no_top_words)
    print("LDA Topics: ")
    display_topics(lda, tf_feature_names, no_top_words)

In [19]:
# Main loop function
# Iterate over all files in the folder and process each one in turn
print('Starting processing - the following files have been processed:')
for input_file in glob.glob(os.path.join(input_path, '*.*')):
    # Grab the file name
    filename = os.path.basename(input_file)
    fname = os.path.splitext(filename)[0]
    print(filename)

    # Parse the file to get to the text
    parsed = parsewithtika(input_file)

    # Language detection algorithm is non - deterministic, which means that if you try to run it on a text which is
    # either too short or too ambiguous, you might get different results every time you run it
    if filterlanguage(parsed):
        continue

    tokenised = tokenmakerwords(parsed)

    # Ignore any documents with <50 words
    if len(tokenised) < 100:
        continue

    # Create doclist for use in topic modelling
    doclist.append(parsed)
    # Sentence fragments
    sentences = sent_tokenize(parsed)

    # Build up dataframe
    temp = pd.Series([filename, sentences])
    d = d.append(temp, ignore_index=True)

print('\n')
d.reset_index(drop=True, inplace=True)
d.columns = ['document', 'sentences']


# Word tokenize the sentences, cleanup, parts of speech tagging
wordtokens(d)
d['score'] = 0

# Now we score in a calculated manner:
# Score 1 for matching word (case sensitive and POS)
scoringpos(d, word_matches)
# Score 0.75 for matching word (case insensitive,  stop words removed)
scoring(d, word_matches)
# Score 0.5 for matching stem of word (case insensitive, stop words removed)
scoringstem(d, word_matches)
# Print out the results of keyword matching
printkeywordmatches(word_matches)
# Find words in context with POS
contextkeywords(d)

Starting processing - the following files have been processed:
01,-,Good,bank,statement.pdf


2018-09-18 11:16:09,372 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/1.18/tika-server-1.18.jar.md5 to C:\Users\lisaj\AppData\Local\Temp\tika-server.jar.md5.
2018-09-18 11:16:10,003 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...


031918comments2.authcheckdam.pdf
881961_CHECKLIST-2014_rev62714.pdf
bank-reconciliation-example.pdf
Bishop_Book_4_eBook.pdf
britain_mag_media_pack.pdf
c07Chemicalreactions_WEB.pdf
cassandra_thedefinitiveguide.pdf
children result( Individula and together ) v1 7-3-16.docx
Correct bank statement.pdf
D3S_EN.pdf
datascienceatthecommandline.pdf
dis5790_parrainage_mmf_a5_4.pdf
DomesticWireFunds.pdf
DTM_AprMay_2018.pdf
dubai 1 2.pdf
Early social interaction project for childen with autism   begining in the second year of life (1) 2.pdf
eng[1].htm
eula.1036.txt
Factors-Affecting-Rate-of-Reaction.pdf
Fireworks!-ConcertInPark08.pdf
HERO5Black_UM_ENG_REVC_Web.pdf
iphone  en.pdf
Kaplan, Andreas - Users of the world, unite.pdf
Kuwait job.docx
learningspark.pdf
log.txt
manual_charge_2_en_US.pdf
Memes-and-the-evolution-of-religion-We-need-memetics-too.pdf
Mohamed Salem  Religion, Spirituality and Psychiatry.pdf
MSAB_License_Management_Brazilian Portuguese.pdf
MSAB_License_Management_Chinese.pdf
MSAB_L

STARTCongressionalTestimony_StateofAQandAffiliates_WilliamBraniff.pdf - Using data provided to the Department of State , these groups were attributed responsibility for approximately 5,000 fatalities : the Taliban ( more than 2,000 fatalities ) , Boko Haram ( more than 1,100 fatalities ) , al‐Qa ’ ida in Iraq ( more than 830 fatalities ) , Tehrik‐e Taliban Pakistan ( more than 500 fatalities ) , al‐Qa ’ ida in the Arabian Peninsula ( more than 280 fatalities ) , and al‐Shabaab ( more than 280 fatalities ) .5 Based on preliminary terrorism incident data for January through June of 2013 , and again using the Department of State ’ s inclusion standards , the eight most lethal organizations in that time‐period include the Taliban , al‐Qa ’ ida in Iraq , Tehrik‐i‐Taliban Pakistan , Boko Haram , Lashkar‐e‐Jhangvi , al‐Nusrah Front , al‐Shabaab , and al‐Mua ’ qi ’ oon Biddam Brigade .
STARTCongressionalTestimony_StateofAQandAffiliates_WilliamBraniff.pdf - To help interpret these data on terro

START_CSTAB_ReactionsWaronTerrorism_Feb2017.pdf - Yes 3 2 2 1 1 2 2 3 4 6 2 1 Do you think that government´s anti-terrorism policies single out Muslims in the U.S. for increased surveillance and monitoring , or don´t you think so ?
START_CSTAB_ReactionsWaronTerrorism_Feb2017.pdf - Opinions relating to the war on terrorism Table 5 shows that in both 2007 and 2011 about half of U.S. Muslims ( 2007 49-81 % ; 2011 39-50 % ) did not believe that the war on terrorism ( WOT ) is a sincere effort to reduce international terrorism .
START_CSTAB_ReactionsWaronTerrorism_Feb2017.pdf - Two groups showed a substantial decline in doubts about the war on terrorism ( African Americans 81 % in 2007 vs. 50 % in 2011 , Iranians 66 % in 2007 vs. 40 % in 2011 ) ; indeed every origin group showed a numeric decline in doubts about war on terrorism .
START_CSTAB_ReactionsWaronTerrorism_Feb2017.pdf - The three terrorism-related items have non-negligible missing rates , raising the possibility that responses may

START_CSTAB_USMuslimOpinionsAboutISISSyriaUSElection_June2017.pdf - This deliverable is part of the National Consortium for the Study of Terrorism and Responses to Terrorism ( START ) project , “ Tracking Attitudes within American Subcultures. ” This research was supported by the Department of Homeland Security Science and Technology Directorate ’ s Office of University Programs through Award Number 2012-ST-061-CS0001 , Center for the Study of Terrorism and Behavior ( CSTAB ) 2.12 made to START to investigate the understanding and countering of terrorism within the U.S .
START_CSTAB_USMuslimOpinionsAboutISISSyriaUSElection_June2017.pdf - START uses state‐of‐the‐art theories , methods and data from the social and behavioral sciences to improve understanding of the origins , dynamics and social and psychological impacts of terrorism .
START_CSTAB_USMuslimOpinionsAboutISISSyriaUSElection_June2017.pdf - Opinions about the War on Terrorism and Suicide Bombing The survey also included severa

START_Smith_GeospatialTemporalPatternsofLoneActorTerrorism.pdf - Terrorism Research Center in Fulbright College University of Arkansas September 19 , 2014 National Consortium for the Study of Terrorism and Responses to Terrorism Methodology : Since its inception in 1988 , the ATS has used official federal terrorism-related court cases to construct a dataset for the analysis of terrorism incidents and preventions , precursor activities of these incidents , and subsequent court cases .
START_Smith_GeospatialTemporalPatternsofLoneActorTerrorism.pdf - Sources of data include : • Court case documents from federal indictments resulting from official FBI terrorism investigations .
START_TerrorismEnergyAttacks_ResearchBrief_June2015.pdf - LOCATIONS AND ATTACK TYPES As is the case with terrorism in general , attacks on energy- and mining-related targets are geographically concentrated .
START_TerrorismEnergyAttacks_ResearchBrief_June2015.pdf - START uses state‐of‐the‐art theories , methods and 

Unnamed: 0,document,sentences,words,pos,posnltk,allwords,mfreq,poslist,mfreqpos,stemwords,mfreqstem,score
0,031918comments2.authcheckdam.pdf,[ Section of Taxation Suite 400 1050 Connectic...,"[[Section, of, Taxation, Suite, 400, 1050, Con...","[[( , SP), (Section, NN), (of, IN), (Taxation,...","[[(Section, NN), (of, IN), (Taxation, NNP), (S...","[section, taxation, suite, connecticut, avenue...","{'section': 25, 'taxation': 9, 'suite': 1, 'co...","[( , SP), (Section, NN), (of, IN), (Taxation, ...","{(' ', 'SP'): 1, ('Section', 'NN'): 14, ('of',...","[section, taxat, suit, connecticut, avenu, NW,...","{'section': 25, 'taxat': 9, 'suit': 1, 'connec...",0.00
1,881961_CHECKLIST-2014_rev62714.pdf,[ CHECKLIST-2014_rev62714 ORDER CONFIRMATION C...,"[[CHECKLIST-2014_rev62714, ORDER, CONFIRMATION...","[[( , SP), (CHECKLIST-2014_rev62714, NN), (ORD...","[[(CHECKLIST-2014_rev62714, JJ), (ORDER, NNP),...","[order, confirmation, checklistorder, confirma...","{'order': 11, 'confirmation': 2, 'checklistord...","[( , SP), (CHECKLIST-2014_rev62714, NN), (ORDE...","{(' ', 'SP'): 1, ('CHECKLIST-2014_rev62714', '...","[order, confirm, checklistord, confirm, checkl...","{'order': 12, 'confirm': 4, 'checklistord': 1,...",0.00
2,bank-reconciliation-example.pdf,[ a bank Mrs Jones (Treasurer) Groovy Group Le...,"[[a, bank, Mrs, Jones, (, Treasurer, ), Groovy...","[[( , SP), (a, DT), (bank, NN), (Mrs, NNP), (J...","[[(a, DT), (bank, NN), (Mrs, NNP), (Jones, NNP...","[bank, mrs, jones, treasurer, groovy, group, l...","{'bank': 9, 'mrs': 1, 'jones': 1, 'treasurer':...","[( , SP), (a, DT), (bank, NN), (Mrs, NNP), (Jo...","{(' ', 'SP'): 1, ('a', 'DT'): 1, ('bank', 'NN'...","[bank, mr, jone, treasur, groovi, group, leice...","{'bank': 9, 'mr': 1, 'jone': 1, 'treasur': 1, ...",0.00
3,Bishop_Book_4_eBook.pdf,[ Chapter 4 - An Introduction to Chemistry: An...,"[[Chapter, 4, -, An, Introduction, to, Chemist...","[[( , SP), (Chapter, NN), (4, CD), (-, HYPH), ...","[[(Chapter, NN), (4, CD), (-, :), (An, DT), (I...","[chapter, introduction, chemistry, introductio...","{'chapter': 42, 'introduction': 19, 'chemistry...","[( , SP), (Chapter, NN), (4, CD), (-, HYPH), (...","{(' ', 'SP'): 1, ('Chapter', 'NN'): 31, ('4', ...","[chapter, An, introduct, chemistri, An, introd...","{'chapter': 42, 'An': 22, 'introduct': 19, 'ch...",0.00
4,britain_mag_media_pack.pdf,[ CHELSEA MAGAZINE COMPANY THE LTD britain-mag...,"[[CHELSEA, MAGAZINE, COMPANY, THE, LTD, britai...","[[( , SP), (CHELSEA, NNP), (MAGAZINE, NNP), (C...","[[(CHELSEA, NNP), (MAGAZINE, NNP), (COMPANY, N...","[chelsea, magazine, company, ltd, official, gl...","{'chelsea': 18, 'magazine': 32, 'company': 10,...","[( , SP), (CHELSEA, NNP), (MAGAZINE, NNP), (CO...","{(' ', 'SP'): 1, ('CHELSEA', 'NNP'): 14, ('MAG...","[chelsea, magazin, compani, ltd, As, offici, g...","{'chelsea': 18, 'magazin': 33, 'compani': 10, ...",0.00
5,c07Chemicalreactions_WEB.pdf,[ Every single living thing on Earth depends o...,"[[Every, single, living, thing, on, Earth, dep...","[[( , SP), (Every, DT), (single, JJ), (living,...","[[(Every, DT), (single, JJ), (living, NN), (th...","[every, single, living, thing, earth, depends,...","{'every': 9, 'single': 2, 'living': 2, 'thing'...","[( , SP), (Every, DT), (single, JJ), (living, ...","{(' ', 'SP'): 1, ('Every', 'DT'): 2, ('single'...","[everi, singl, live, thing, earth, depend, che...","{'everi': 9, 'singl': 2, 'live': 4, 'thing': 2...",1.00
6,cassandra_thedefinitiveguide.pdf,[ Cassandra: The Definitive Guide Jeff Carpent...,"[[Cassandra, :, The, Definitive, Guide, Jeff, ...","[[( , SP), (Cassandra, NNP), (:, :), (The, DT)...","[[(Cassandra, NN), (:, :), (The, DT), (Definit...","[cassandra, definitive, guide, jeff, carpenter...","{'cassandra': 1309, 'definitive': 11, 'guide':...","[( , SP), (Cassandra, NNP), (:, :), (The, DT),...","{(' ', 'SP'): 1, ('Cassandra', 'NNP'): 1267, (...","[cassandra, definit, guid, jeff, carpent, eben...","{'cassandra': 1304, 'definit': 32, 'guid': 24,...",0.00
7,children result( Individula and together ) v1 ...,[ The impact of adult interactive style on the...,"[[The, impact, of, adult, interactive, style, ...","[[( , SP), (The, DT), (impact, NN), (of, IN), ...","[[(The, DT), (impact, NN), (of, IN), (adult, N...","[impact, adult, interactive, style, spontaneou...","{'impact': 12, 'adult': 5, 'interactive': 5, '...","[( , SP), (The, DT), (impact, NN), (of, IN), (...","{(' ', 'SP'): 1, ('The', 'DT'): 45, ('impact',...","[impact, adult, interact, style, spontan, comm...","{'impact': 12, 'adult': 8, 'interact': 59, 'st...",0.00
8,Correct bank statement.pdf,"[ If you submit an electronic statement, the b...","[[If, you, submit, an, electronic, statement, ...","[[( , SP), (If, IN), (you, PRP), (submit, VBP)...","[[(If, IN), (you, PRP), (submit, VBP), (an, DT...","[submit, electronic, statement, bank, must, st...","{'submit': 1, 'electronic': 1, 'statement': 10...","[( , SP), (If, IN), (you, PRP), (submit, VBP),...","{(' ', 'SP'): 1, ('If', 'IN'): 2, ('you', 'PRP...","[If, submit, electron, statement, bank, must, ...","{'If': 2, 'submit': 3, 'electron': 1, 'stateme...",0.00
9,D3S_EN.pdf,[ DIGITAL CAMERA En User's Manual Printed in J...,"[[DIGITAL, CAMERA, En, User, 's, Manual, Print...","[[( , SP), (DIGITAL, NNP), (CAMERA, NNP), (En,...","[[(DIGITAL, NNP), (CAMERA, NNP), (En, NNP), (U...","[digital, camera, en, user, manual, printed, j...","{'digital': 37, 'camera': 655, 'en': 4, 'user'...","[( , SP), (DIGITAL, NNP), (CAMERA, NNP), (En, ...","{(' ', 'SP'): 1, ('DIGITAL', 'NNP'): 2, ('CAME...","[digit, camera, En, user, manual, print, japan...","{'digit': 40, 'camera': 674, 'En': 4, 'user': ...",0.00


In [18]:
# Main loop function
# Iterate over all files in the folder and process each one in turn
print('Starting processing - the following files have been processed:')
for input_file in glob.glob(os.path.join(input_path, '*.*')):
    # Grab the file name
    filename = os.path.basename(input_file)
    fname = os.path.splitext(filename)[0]
    print(filename)

    # Parse the file to get to the text
    parsed = parsewithtika(input_file)
    print(len(parsed))

Starting processing - the following files have been processed:
01,-,Good,bank,statement.pdf


2018-09-19 10:36:34,211 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...


363
031918comments2.authcheckdam.pdf
34396
881961_CHECKLIST-2014_rev62714.pdf
2514
bank-reconciliation-example.pdf
1845
Bishop_Book_4_eBook.pdf
75570
britain_mag_media_pack.pdf
13616
c07Chemicalreactions_WEB.pdf
51411
cassandra_thedefinitiveguide.pdf
756403
children result( Individula and together ) v1 7-3-16.docx
66123
Correct bank statement.pdf
2576
D3S_EN.pdf
484618
datascienceatthecommandline.pdf
356858
dis5790_parrainage_mmf_a5_4.pdf
11042
DomesticWireFunds.pdf
3022
DTM_AprMay_2018.pdf
94202
dubai 1 2.pdf
69698
Early social interaction project for childen with autism   begining in the second year of life (1) 2.pdf
74913
eng[1].htm
6094
eula.1036.txt
8645
Factors-Affecting-Rate-of-Reaction.pdf
2885
Fireworks!-ConcertInPark08.pdf
10420
HERO5Black_UM_ENG_REVC_Web.pdf
73400
iphone  en.pdf
288565
Kaplan, Andreas - Users of the world, unite.pdf
47240
Kuwait job.docx
1428
learningspark.pdf
538206
log.txt
5144
manual_charge_2_en_US.pdf
56531
Memes-and-the-evolution-of-religion-We-need-mem

In [19]:
# Sort by scoring
d = d.sort_values('score', ascending=False)

# Print sorted documents
print('\n')
print('Here are the scores based on cleansed data:')
print(d[['document', 'score']])



Here are the scores based on cleansed data:
                                             document  score
56    START_CSTAB_ReactionsWaronTerrorism_Feb2017.pdf  87.25
51       STARTSymposium2015_CounterterrorismPanel.pdf  66.75
75      START_Webber_EvaluatingJihadistNarratives.pdf  47.25
59      START_DHS_SyriaBarometerSurvey_30June2016.pdf  42.75
57  START_CSTAB_USMuslimOpinionsAboutISISSyriaUSEl...  41.25
58  START_DemystifyingGrayZoneConflict_Libya_Nov20...  38.25
73  START_UnderstandingLawEnforcementIntelligenceP...  34.00
52  STARTSymposium2015_IndividualRadicalizationPan...  26.00
71  START_Smith_GeospatialTemporalPatternsofLoneAc...  18.00
74  START_UnderstandingLoneActorTerrorism_Research...  16.50
47  STARTBackgroundReport_TerrorisminOlympicsSochi...  14.75
54  START_CSTAB_ECDB_25YearsofIdeologicalHomicideV...  12.25
55  START_CSTAB_JihadiIndustryAssessingOrganizatio...  11.00
48  STARTCongressionalTestimony_StateofAQandAffili...  10.50
68  START_McCauley_PsychologyofLoneActo

In [20]:
dirtyscoring(d)

d = d.sort_values('score2', ascending=False)
print('\n')
print('Here are the scores based on uncleansed data:')
print(d[['document', 'score2']])



The following keyword hits occurred in the uncleansed data:
Keyword: IS. Found in these documents: 
['881961_CHECKLIST-2014_rev62714.pdf', 'britain_mag_media_pack.pdf', 'c07Chemicalreactions_WEB.pdf', 'cassandra_thedefinitiveguide.pdf', 'eula.1036.txt', 'HERO5Black_UM_ENG_REVC_Web.pdf', 'sql-code-smells.pdf', 'START_CSTAB_JihadiIndustryAssessingOrganizationalLeadershipCyberProfiles_July2017.pdf', 'START_CSTAB_USMuslimOpinionsAboutISISSyriaUSElection_June2017.pdf', 'START_DemystifyingGrayZoneConflict_Libya_Nov2016.pdf', 'START_ISIL_Lesson2_AnOrganizationalProfileoftheIslamicState.pdf', 'START_LessonsLearnedfromMentalHealthAndEducation_EducatorSummary_Oct2015.pdf', 'tips_users_chemicals_workplace_en.pdf']
Keyword: terrorism. Found in these documents: 
['Religion-Security-Global-Uncertainties.pdf', 'STARTBackgroundReport_TerrorisminOlympicsSochiRussia_Jan2014.pdf', 'STARTCongressionalTestimony_StateofAQandAffiliates_WilliamBraniff.pdf', 'STARTResearchBrief_Anatomizing.pdf', 'STARTResear

In [23]:
# Print results of K Means Cluster and prediction modelling
clustering(doclist)

Top terms per cluster:
Cluster 0:
 terror
 attack
 terrorist
 u.s.
 studi
 studi terror
 consortium
 respons
 nation consortium studi
 consortium studi
Cluster 1:
 set
 phone
 account
 charg
 increas
 video
 rate
 press
 select
 document
Cluster 2:
 commun
 religi
 social
 cultur
 initi
 media
 wa
 e
 user
 children
Cluster 3:
 water
 data
 order
 statement
 file
 key
 balanc
 line
 compani
 chang


Prediction
A document with 'bad' terms would be in:
[0]
A document with 'good' terms would be in:
[3]


In [24]:
# Print results of NMF vs LDA topic modelling
nmflda(doclist)



NMF Topics: 
Topic 0:
terrorism consortium study responses national homeland start security department far
Topic 1:
payment 00 account bank order statement card balance date information
Topic 2:
attacks terrorist start 2014 attack targets terrorism percent 2012 umd
Topic 3:
religion religious secular cultural et research university al human life
Topic 4:
reaction chemical acid water energy reactions sodium calcium temperature elements
LDA Topics: 
Topic 0:
tap data line command iphone use settings com http using
Topic 1:
data spark cassandra example use using cluster terrorism node chapter
Topic 2:
button camera press information al exposure focus menu mode image
Topic 3:
pre post intervention social child initiations communication 00 children use
Topic 4:
water chemical reaction information acid reactions used aq calcium equation


In [25]:
print(globalents)






In [21]:
nlp("When is is not IS?")

When is is not IS?

In [22]:
spacy_pos("When is is not IS?")

[[('W', 'NN')],
 [('h', 'PRP')],
 [('e', 'NN')],
 [('n', 'CC')],
 [(' ', 'SP')],
 [('i', 'PRP')],
 [('s', 'POS')],
 [(' ', 'SP')],
 [('i', 'PRP')],
 [('s', 'POS')],
 [(' ', 'SP')],
 [('n', 'CC')],
 [('o', 'IN')],
 [('t', 'NN')],
 [(' ', 'SP')],
 [('I', 'PRP')],
 [('S', 'NN')],
 [('?', '.')]]

In [29]:
spacy_pos(["Can you tell me why you joined IS?"])

[[('Can', 'MD'),
  ('you', 'PRP'),
  ('tell', 'VB'),
  ('me', 'PRP'),
  ('why', 'WRB'),
  ('you', 'PRP'),
  ('joined', 'VBD'),
  ('IS', 'NNP'),
  ('?', '.')]]

In [30]:
spacy_pos(["What is the matter?"])

[[('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('matter', 'NN'), ('?', '.')]]