In [1]:
import glob
from tika import parser
import os
import nltk
from nltk import word_tokenize
from nltk import sent_tokenize
from langdetect import detect
import pandas as pd
import string
import re
from nltk.corpus import stopwords

In [2]:
# Notebook-wide configuration and shared state.
# Plan: tokenise each document after text pre-processing, then analyse the tokens.

# Folder scanned for input documents (hardcoded absolute Windows path).
input_path = 'C:\\test'
# NLTK English stop-word list, held as a set and used by the token filters.
stop_words = set(stopwords.words('english'))

# Shared frame for the whole analysis; it starts empty.
# Intended columns: document, sentences, words, pos.
# Planned analysis: keyword search from a list, contextualised using POS tags.
d = pd.DataFrame()

In [3]:
# Use Tika to parse the file
def parsewithtika(inputfile):
    """Extract the text of a document with Tika and collapse its whitespace.

    Parameters
    ----------
    inputfile : str
        Path of the file handed to ``parser.from_file``.

    Returns
    -------
    str
        The parsed ``"content"`` with every run of whitespace replaced by a
        single space, or ``''`` when the parse result carries no content.
    """
    parsed = parser.from_file(inputfile)
    # "content" may be None (or absent) when no text could be extracted,
    # e.g. for image-only files; re.sub would raise TypeError on None.
    content = parsed.get("content") if parsed else None
    if not content:
        return ''
    return re.sub(r'\s+', ' ', content)

In [4]:
def tokenmakerwords(inputfile):
    """Turn raw document text into an ``nltk.Text`` of cleaned word tokens.

    Each token from ``word_tokenize`` is lower-cased and has leading and
    trailing ASCII punctuation stripped; it is kept only if what remains is
    purely alphabetic and is not in ``stop_words``.

    Parameters
    ----------
    inputfile : str
        The document text (not a path, despite the name).

    Returns
    -------
    nltk.Text
        The surviving tokens, in their original order.
    """
    kept = []
    for raw_token in word_tokenize(inputfile):
        candidate = raw_token.lower().strip(string.punctuation)
        # isalpha() also rejects the empty string left by pure-punctuation tokens
        if candidate.isalpha() and candidate not in stop_words:
            kept.append(candidate)
    return nltk.Text(kept)

In [5]:
# Language filter
def filterlanguage(inputfile):
    """Decide whether a document should be skipped for not being English.

    Parameters
    ----------
    inputfile : str
        The document text to classify (not a path, despite the name).

    Returns
    -------
    bool
        ``True`` when the document should be filtered out: the text is
        empty/blank, or ``detect`` reports a language other than ``'en'``.
    """
    # Blank text gives the detector nothing to work with (langdetect is
    # expected to raise on it), so drop such documents instead of crashing.
    if not inputfile or not inputfile.strip():
        return True
    return detect(inputfile) != 'en'

In [6]:
# Word tokens, parts of speech tagging
def wordtokens(dataframe):
    """Add token, POS-tag, cleaned-word and frequency columns to a frame.

    The frame is modified in place and also returned.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must have a ``'sentences'`` column whose cells are lists of sentence
        strings.

    Returns
    -------
    pandas.DataFrame
        The same object, with these columns added:

        * ``words``    - one token list per sentence (``word_tokenize``)
        * ``pos``      - one list of ``(token, tag)`` pairs per sentence
        * ``allwords`` - flat list of lower-cased, punctuation-stripped,
          alphabetic, non-stop-word tokens for the whole document
        * ``mfreq``    - ``nltk.FreqDist`` built from ``allwords``
    """
    def flatten_and_clean(tokenised_sentences):
        # Flatten sentence -> token nesting, normalise, then filter.
        cleaned = (token.strip(string.punctuation).lower()
                   for sentence in tokenised_sentences
                   for token in sentence)
        return [token for token in cleaned
                if token.isalpha() and token not in stop_words]

    dataframe['words'] = dataframe['sentences'].apply(
        lambda sentences: [word_tokenize(sentence) for sentence in sentences])
    dataframe['pos'] = dataframe['words'].apply(
        lambda sentences: [nltk.pos_tag(tokens) for tokens in sentences])
    # Derive everything from the frame that was passed in, never from the
    # notebook-global frame, so the function works for any input.
    dataframe['allwords'] = dataframe['words'].apply(flatten_and_clean)
    dataframe['mfreq'] = dataframe['allwords'].apply(nltk.FreqDist)
    return dataframe

In [7]:
# Main loop function
# Iterate over all files in the folder and process each one in turn
# Minimum number of cleaned word tokens a document needs in order to be kept.
MIN_TOKENS = 100

# Rows are collected in a plain list and the frame is built once at the end;
# growing a DataFrame row by row is quadratic and DataFrame.append no longer
# exists in pandas 2.x.
records = []
for input_file in glob.glob(os.path.join(input_path, '*.*')):
    # '*.*' can also match directories whose names contain a dot.
    if not os.path.isfile(input_file):
        continue

    filename = os.path.basename(input_file)
    print(filename)

    # Parse the file to get to the text
    parsed = parsewithtika(input_file)

    # Skip documents not detected as English. Language detection is
    # non-deterministic: on text that is too short or too ambiguous it can
    # give different results from run to run.
    if filterlanguage(parsed):
        continue

    # Ignore documents with fewer than MIN_TOKENS cleaned word tokens
    if len(tokenmakerwords(parsed)) < MIN_TOKENS:
        continue

    # One row per document: its file name and its list of sentences
    records.append({'document': filename, 'sentences': sent_tokenize(parsed)})

# Rebuilding the frame from scratch keeps this cell safe to re-run, and the
# explicit columns keep the frame well-formed even when no document qualifies.
d = pd.DataFrame(records, columns=['document', 'sentences'])

children result( Individula and together ) v1 7-3-16.docx


2018-08-22 10:18:02,774 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...


dubai 1 2.pdf
Early social interaction project for childen with autism   begining in the second year of life (1) 2.pdf
Factors-Affecting-Rate-of-Reaction.pdf
Kaplan, Andreas - Users of the world, unite.pdf
Kuwait job.docx
Memes-and-the-evolution-of-religion-We-need-memetics-too.pdf
Mohamed Salem  Religion, Spirituality and Psychiatry.pdf
MSAB_License_Management_Brazilian Portuguese.pdf
MSAB_License_Management_Chinese.pdf
MSAB_License_Management_English.pdf
MSAB_License_Management_French.pdf
MSAB_License_Management_German.pdf
MSAB_License_Management_Japanese.pdf
MSAB_License_Management_Russian.pdf
MSAB_License_Management_Spanish.pdf
MSAB_License_Management_Turkish.pdf
Periodic-Table-Chemical-Reactions-Summary1.pdf
Philosophy of Religion.pdf
Religion-Security-Global-Uncertainties.pdf
Results and comments.docx
STARTBackgroundReport_TerrorisminOlympicsSochiRussia_Jan2014.pdf
STARTCongressionalTestimony_StateofAQandAffiliates_WilliamBraniff.pdf
START_AM2014_QuickFireTwo.pdf
START_CSTAB_ECDB

In [8]:
# Renumber the rows 0..n-1 and give the two columns their names.
d = d.reset_index(drop=True)
d.columns = ['document', 'sentences']

In [9]:
# Word-tokenise the sentences, clean the tokens and POS-tag them.
# wordtokens adds its columns to `d` and returns the frame, so as the cell's
# last expression the returned frame is also what gets displayed.
wordtokens(d)

Unnamed: 0,document,sentences,words,pos,allwords,mfreq
0,children result( Individula and together ) v1 ...,[ The impact of adult interactive style on the...,"[[The, impact, of, adult, interactive, style, ...","[[(The, DT), (impact, NN), (of, IN), (adult, N...","[impact, adult, interactive, style, spontaneou...","{'impact': 12, 'adult': 5, 'interactive': 5, '..."
1,dubai 1 2.pdf,[ Arab Social Media Report: Civil Movements: T...,"[[Arab, Social, Media, Report, :, Civil, Movem...","[[(Arab, JJ), (Social, NNP), (Media, NNP), (Re...","[arab, social, media, report, civil, movements...","{'arab': 121, 'social': 84, 'media': 65, 'repo..."
2,Early social interaction project for childen w...,"[ 03., Wetherby p67 TECSE 26:2 67–82 (2006) 67...","[[03, .], [Wetherby, p67, TECSE, 26:2, 67–82, ...","[[(03, CD), (., .)], [(Wetherby, NNP), (p67, N...","[wetherby, tecse, early, social, interaction, ...","{'wetherby': 31, 'tecse': 1, 'early': 76, 'soc..."
3,Factors-Affecting-Rate-of-Reaction.pdf,[ BY NC ND © COMPOUND INTEREST 2016 - WWW.COMP...,"[[BY, NC, ND, ©, COMPOUND, INTEREST, 2016, -, ...","[[(BY, NNP), (NC, NNP), (ND, NNP), (©, NNP), (...","[nc, nd, compound, interest, twitter, compound...","{'nc': 1, 'nd': 1, 'compound': 1, 'interest': ..."
4,"Kaplan, Andreas - Users of the world, unite.pdf","[ Author's personal copy Users of the world, u...","[[Author, 's, personal, copy, Users, of, the, ...","[[(Author, NN), ('s, POS), (personal, JJ), (co...","[author, personal, copy, users, world, unite, ...","{'author': 12, 'personal': 22, 'copy': 11, 'us..."
5,Kuwait job.docx,[ · Home · All Jobs · My Account · Blog · Cont...,"[[·, Home, ·, All, Jobs, ·, My, Account, ·, Bl...","[[(·, JJ), (Home, NNP), (·, NNP), (All, NNP), ...","[home, jobs, account, blog, contact, submit, j...","{'home': 1, 'jobs': 1, 'account': 1, 'blog': 1..."
6,Memes-and-the-evolution-of-religion-We-need-me...,[ Microsoft Word - Memes and the evolution of ...,"[[Microsoft, Word, -, Memes, and, the, evoluti...","[[(Microsoft, NNP), (Word, NNP), (-, :), (Meme...","[microsoft, word, memes, evolution, religion, ...","{'microsoft': 1, 'word': 2, 'memes': 16, 'evol..."
7,"Mohamed Salem Religion, Spirituality and Psyc...","[ Religion & Culture ‘Religion, Spirituality a...","[[Religion, &, Culture, ‘, Religion, ,, Spirit...","[[(Religion, NNP), (&, CC), (Culture, NNP), (‘...","[religion, culture, religion, spirituality, ps...","{'religion': 65, 'culture': 6, 'spirituality':..."
8,MSAB_License_Management_English.pdf,[ My Document MSAB License Management 2 MSAB O...,"[[My, Document, MSAB, License, Management, 2, ...","[[(My, PRP$), (Document, NNP), (MSAB, NNP), (L...","[document, msab, license, management, msab, of...","{'document': 1, 'msab': 133, 'license': 309, '..."
9,Periodic-Table-Chemical-Reactions-Summary1.pdf,[ Microsoft Word - Periodic Table & Chemical R...,"[[Microsoft, Word, -, Periodic, Table, &, Chem...","[[(Microsoft, NNP), (Word, NNP), (-, :), (Peri...","[microsoft, word, periodic, table, chemical, r...","{'microsoft': 1, 'word': 3, 'periodic': 4, 'ta..."


In [44]:
# Inspect one document row and its cleaned-word frequency table.
sample_row = d.iloc[20]
print(sample_row)
# FreqDist.pprint writes to stdout itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" after the table).
sample_row.mfreq.pprint(maxlen=1000)

document            START_JihadistTerroristPlotsUS_Dec2017.pdf
sentences    [ Jihadist Terrorist Plots in the United State...
words        [[Jihadist, Terrorist, Plots, in, the, United,...
pos          [[(Jihadist, NNP), (Terrorist, NNP), (Plots, N...
allwords     [jihadist, terrorist, plots, united, states, s...
mfreq        {'jihadist': 11, 'terrorist': 5, 'plots': 35, ...
score                                                        7
Name: 20, dtype: object
FreqDist({'plots': 35, 'research': 18, 'foiled': 17, 'successful': 16, 'united': 15, 'states': 15, 'jihadist': 11, 'plot': 11, 'start': 10, 'attacks': 10, 'stage': 10, 'failed': 9, 'perpetrators': 9, 'cases': 9, 'attack': 8, 'brief': 7, 'university': 6, 'implementation': 6, 'public': 6, 'early': 6, 'discovery': 6, 'intent': 6, 'terrorist': 5, 'maryland': 5, 'completed': 5, 'countries': 5, 'whether': 5, 'number': 5, 'government': 5, 'data': 5, 'homeland': 5, 'partially': 5, 'also': 5, 'involved': 5, 'percent': 5, 'security': 5, 'd

In [11]:
# NOTE(review): this references the bound method `agg` without calling it, so
# the cell only displays the method's repr. Looks like leftover exploration;
# confirm whether an aggregation was intended.
d['mfreq'].agg

<bound method Series.aggregate of 0     {'impact': 12, 'adult': 5, 'interactive': 5, '...
1     {'arab': 121, 'social': 84, 'media': 65, 'repo...
2     {'wetherby': 31, 'tecse': 1, 'early': 76, 'soc...
3     {'nc': 1, 'nd': 1, 'compound': 1, 'interest': ...
4     {'author': 12, 'personal': 22, 'copy': 11, 'us...
5     {'home': 1, 'jobs': 1, 'account': 1, 'blog': 1...
6     {'microsoft': 1, 'word': 2, 'memes': 16, 'evol...
7     {'religion': 65, 'culture': 6, 'spirituality':...
8     {'document': 1, 'msab': 133, 'license': 309, '...
9     {'microsoft': 1, 'word': 3, 'periodic': 4, 'ta...
10    {'philosophy': 14, 'religion': 23, 'religious'...
11    {'open': 60, 'university': 95, 'religion': 382...
12    {'results': 27, 'chapter': 12, 'presents': 5, ...
13    {'start': 25, 'background': 11, 'report': 13, ...
14    {'microsoft': 1, 'word': 1, 'william': 2, 'bra...
15    {'attitudes': 2, 'toward': 2, 'terrorism': 20,...
16    {'years': 16, 'ideological': 49, 'homicide': 3...
17    {'nation

In [12]:
# Per-document keyword score, starting at zero for every row.
d['score'] = 0


In [13]:
# Lower-case search terms used for keyword scoring.
# NOTE(review): 'is' is presumably in the NLTK English stop-word list, in which
# case it is filtered out of `allwords` and can only ever match raw tokens.
keywords = ['isis', 'terrorism', 'bomb', 'consortium', 'is']

In [33]:
from collections import defaultdict

# Score = total occurrences of all keywords among a document's cleaned words.
# The column is computed in a single assignment (a missing keyword counts as
# 0 in the frequency table), so re-running the cell cannot double-count.
d['score'] = d['mfreq'].apply(lambda freq: sum(freq[word] for word in keywords))

# keyword -> names of the documents that contain it (each listed once).
# Only keywords with at least one match get an entry.
word_matches = defaultdict(list)
for word in keywords:
    for document, freq in zip(d['document'], d['mfreq']):
        # mfreq is built from allwords, so this is the same membership test
        # as scanning the allwords list, without the linear scan.
        if word in freq and document not in word_matches[word]:
            word_matches[word].append(document)

for key, val in word_matches.items():
    print(key, val)
    



isis ['START_CSTAB_USMuslimOpinionsAboutISISSyriaUSElection_June2017.pdf', 'START_TranscendingOrganizationIndividualsandtheIslamicState_AnalyticalBrief_June2014.pdf']
terrorism ['Religion-Security-Global-Uncertainties.pdf', 'STARTBackgroundReport_TerrorisminOlympicsSochiRussia_Jan2014.pdf', 'STARTCongressionalTestimony_StateofAQandAffiliates_WilliamBraniff.pdf', 'START_AM2014_QuickFireTwo.pdf', 'START_CSTAB_ECDB_25YearsofIdeologicalHomicideVictimizationUS_March2016.pdf', 'START_CSTAB_USMuslimOpinionsAboutISISSyriaUSElection_June2017.pdf', 'START_ECDB_ViolencePerpetratedbySupportersofAQAM_June2014.pdf', 'START_ISIL_Lesson1_ObjectivesScenariosforISIL.pdf', 'START_JihadistTerroristPlotsUS_Dec2017.pdf', 'START_TranscendingOrganizationIndividualsandtheIslamicState_AnalyticalBrief_June2014.pdf', 'START_UnderstandingLawEnforcementIntelligenceProcesses_July2014.pdf']
bomb ['Religion-Security-Global-Uncertainties.pdf', 'STARTBackgroundReport_TerrorisminOlympicsSochiRussia_Jan2014.pdf', 'START_E

In [15]:
# Print every sentence that contains the token 'IS' tagged as a proper noun
# (NNP), prefixed with its document name. A sentence is printed once per
# matching token, each time followed by blank lines.
for _, doc_row in d.iterrows():
    # 'pos' holds one tagged sentence per entry of 'words', in the same order
    for sentence_tokens, tagged_sentence in zip(doc_row['words'], doc_row['pos']):
        for token, tag in tagged_sentence:
            if (token, tag) == ('IS', 'NNP'):
                print(doc_row['document'] + ' - ' + ' '.join(sentence_tokens))
                print('\n')
        


START_CSTAB_USMuslimOpinionsAboutISISSyriaUSElection_June2017.pdf - Do you blame ISIS ( Islamic State of Iraq and Syria , also known as Islamic State or IS ) for making life more difficult for U.S. Muslims ?


START_CSTAB_USMuslimOpinionsAboutISISSyriaUSElection_June2017.pdf - Never thought about it U.S. Muslims should not get involved in fighting against Al Assad I would not do it myself , but I would not condemn anyone who did It ’ s morally justified to go to fight against Al Assad Joining the jihad in Syria is required for any Muslim who can do it Do you blame ISIS ( Islamic State of Iraq and Syria , also known as Islamic State or IS ) for making life more difficult for U.S. Muslims ?


START_CSTAB_USMuslimOpinionsAboutISISSyriaUSElection_June2017.pdf - [ S ] Do you blame ISIS ( Islamic State of Iraq and Syria , also known as Islamic State or IS ) for making life more difficult for U.S. Muslims ?


START_CSTAB_USMuslimOpinionsAboutISISSyriaUSElection_June2017.pdf - [ S ] Overall , 

In [59]:
# Rank documents by keyword score, highest first. `d` itself is rebound to the
# sorted frame, so later cells see the rows in this order (original index
# labels are kept).
d = d.sort_values('score', ascending=False)
print(d[['document', 'score']])

                                             document  score
17  START_CSTAB_USMuslimOpinionsAboutISISSyriaUSEl...    190
18  START_ECDB_ViolencePerpetratedbySupportersofAQ...     99
22  START_UnderstandingLawEnforcementIntelligenceP...     93
16  START_CSTAB_ECDB_25YearsofIdeologicalHomicideV...     40
13  STARTBackgroundReport_TerrorisminOlympicsSochi...     28
14  STARTCongressionalTestimony_StateofAQandAffili...     27
15                      START_AM2014_QuickFireTwo.pdf     27
21  START_TranscendingOrganizationIndividualsandth...     16
11         Religion-Security-Global-Uncertainties.pdf     10
20         START_JihadistTerroristPlotsUS_Dec2017.pdf      7
19  START_ISIL_Lesson1_ObjectivesScenariosforISIL.pdf      1
10                         Philosophy of Religion.pdf      1
12                          Results and comments.docx      0
23             tech & cultural appropriation 2012.pdf      0
0   children result( Individula and together ) v1 ...      0
1                       

In [58]:
# Rank documents by the raw-token keyword score, highest first.
# NOTE(review): in notebook order the 'score2' column is only created by cells
# further down (execution counts 51-57 vs 58 here), so this cell fails on a
# fresh Restart & Run All; it should be moved below them.
d = d.sort_values('score2', ascending=False)
print(d[['document', 'score2']])

                                             document  score2
11         Religion-Security-Global-Uncertainties.pdf     446
12                          Results and comments.docx     226
22  START_UnderstandingLawEnforcementIntelligenceP...     117
17  START_CSTAB_USMuslimOpinionsAboutISISSyriaUSEl...      87
4     Kaplan, Andreas - Users of the world, unite.pdf      85
23             tech & cultural appropriation 2012.pdf      64
2   Early social interaction project for childen w...      58
18  START_ECDB_ViolencePerpetratedbySupportersofAQ...      55
1                                       dubai 1 2.pdf      51
8                 MSAB_License_Management_English.pdf      41
7   Mohamed Salem  Religion, Spirituality and Psyc...      36
0   children result( Individula and together ) v1 ...      34
13  STARTBackgroundReport_TerrorisminOlympicsSochi...      31
14  STARTCongressionalTestimony_StateofAQandAffili...      28
19  START_ISIL_Lesson1_ObjectivesScenariosforISIL.pdf      27
10      

In [51]:
# Second per-document keyword score, starting at zero for every row.
d['score2'] = 0

In [56]:
# Flatten each document's per-sentence token lists into one flat token list.
# Tokens are taken as-is: no lower-casing, punctuation stripping or stop-word
# filtering is applied here.
d['w2'] = d['words'].apply(lambda x: [item for sublist in x for item in sublist])

In [55]:
# Frequency table over the flattened raw tokens of each document.
d['mfreq2'] = d['w2'].apply(nltk.FreqDist)
# Remove the flattened-token column from `d` (in place) once the frequency
# table has been built from it.
d.drop('w2', axis=1, inplace=True)

In [57]:

# Imported here so the cell does not rely on another cell having run first.
from collections import defaultdict

# Raw-token variant of the keyword score: total occurrences of all keywords
# in each document's unfiltered token frequency table. Computed in a single
# assignment, so re-running the cell cannot double-count.
d['score2'] = d['mfreq2'].apply(lambda freq: sum(freq[word] for word in keywords))

# keyword -> names of the documents that contain it (each listed once).
# Membership is tested on mfreq2 rather than on the 'w2' column: mfreq2 is
# built from w2, and w2 is dropped from the frame right after that.
word_matches = defaultdict(list)
for word in keywords:
    for document, freq in zip(d['document'], d['mfreq2']):
        if word in freq and document not in word_matches[word]:
            word_matches[word].append(document)

for key, val in word_matches.items():
    print(key, val)

terrorism ['START_CSTAB_USMuslimOpinionsAboutISISSyriaUSElection_June2017.pdf', 'START_ECDB_ViolencePerpetratedbySupportersofAQAM_June2014.pdf', 'START_UnderstandingLawEnforcementIntelligenceProcesses_July2014.pdf', 'START_CSTAB_ECDB_25YearsofIdeologicalHomicideVictimizationUS_March2016.pdf', 'STARTBackgroundReport_TerrorisminOlympicsSochiRussia_Jan2014.pdf', 'START_AM2014_QuickFireTwo.pdf', 'STARTCongressionalTestimony_StateofAQandAffiliates_WilliamBraniff.pdf', 'START_TranscendingOrganizationIndividualsandtheIslamicState_AnalyticalBrief_June2014.pdf', 'Religion-Security-Global-Uncertainties.pdf', 'START_JihadistTerroristPlotsUS_Dec2017.pdf']
bomb ['START_ECDB_ViolencePerpetratedbySupportersofAQAM_June2014.pdf', 'STARTBackgroundReport_TerrorisminOlympicsSochiRussia_Jan2014.pdf', 'Religion-Security-Global-Uncertainties.pdf', 'START_JihadistTerroristPlotsUS_Dec2017.pdf']
is ['START_CSTAB_USMuslimOpinionsAboutISISSyriaUSElection_June2017.pdf', 'START_ECDB_ViolencePerpetratedbySupporterso