In [32]:
import json
import operator
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from nltk import Tree
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [33]:
def preprocess(doc):
    sent_list = sent_tokenize(doc)
    sent_tagged = []
    for sent in sent_list:
        words = nltk.word_tokenize(sent)
        sent_tagged.append(nltk.pos_tag(words))
    return sent_tagged

In [34]:
def parse(sent_tagged):
    grammar = r"""
      NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
      PP: {<IN><NP>}               # Chunk prepositions followed by NP
      VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments

      """
    #      CLAUSE: {<NP><VP>}           # Chunk NP, VP
    chunkParser = nltk.RegexpParser(grammar)
    trees = [chunkParser.parse(sublist) for sublist in sent_tagged]
    return trees

In [35]:
def phrase(trees):
    continuous_chunk = []
    current_chunk = []

    for t in trees:
        for subtree in t:
            if type(subtree) == Tree:
                current_chunk.append(" ".join([token for token, pos in subtree.leaves()]))
            elif current_chunk:
                named_entity = " ".join(current_chunk)
                if named_entity not in continuous_chunk:
                    continuous_chunk.append(named_entity)
                    current_chunk = []
            else:
                continue
    return continuous_chunk

In [36]:
word_list = []
keywords = open('keywords.txt',"r")
for line in keywords:
    w = line
    if w[-1] == '\n':
        w = w[:-1]
    word_list.append(w)
print (word_list)

['diverse', 'inclusion', 'inclusive', 'female', 'women', 'diversity', 'underrepresent', 'underrepresented', 'black', 'hispanic', 'latin', 'native', 'veterans', 'veteran', 'leadership', 'culture ', 'equal', 'equality', 'minorities', 'LGBTQ', 'rights', 'sexual', 'gender', 'discriminated', 'discriminate', 'freedom', 'disabilities', 'age', 'inclusive', 'african', 'indians', 'muslim', 'workforce', 'workplace', 'backgrounds', 'bias', 'differences', 'race', 'color', 'ancestry', 'sex', 'perspective']


In [37]:
def extract_phrase(doc):
    return phrase(parse(preprocess(doc)))

In [38]:
f = open("wal.txt","r") 
text = f.read()
#res = extract_phrase(text)
sent_list = sent_tokenize(text)
sent_list = [s.lower() for s in sent_list]
diverse_sent_list = []
for sent in sent_list:
    diverse_count = 0
    words = sent.split()
    for w in words:
        if w.lower() in word_list:
            diverse_count += 1
    if diverse_count > 0:
        diverse_sent_list.append(sent)
print (diverse_sent_list)
#print (res)

['we’re proud to support them, too, by offering jobs at all levels – and investing in our workforce through training and skills development so that entry-level jobs lead to careers.', 'service to the customer customer first: listen to, anticipate and serve customer wants and needs frontline focused: support and empower associates to serve customers every day\ninnovative and agile: be creative, take smart risks and move with speed\nrespect for the individual\nlisten: be visible and available, collaborate with others and be open to feedback\nlead by example: be humble, teach and trust others to do their jobs, give honest and direct feedback\ninclusive: seek and embrace differences in people, ideas and experiences\nstrive for excellence\n\nhigh performance: set and achieve aggressive goals\naccountable: take ownership, celebrate successes and be responsible for results\nstrategic: make clear choices, anticipate changing conditions and plan for the future\nact with integrity\n\nhonest: tel

In [39]:
######## bi-gram phrases###########
def extract(sent_tagged):  
    phrases = []
    for words in sent_tagged:        
        for k in range(0,len(words)-2):
            p1 = ''
            p2 = ''
            if( words[k][1]=="JJ" and words[k+1][1]=="JJ" and words[k+2][1]!="NN" and words[k+2][1]!="NNS"):
                p1 = words[k][0]
                p2 = words[k+1][0]
#                phrases.append(words[k][0] + ' ' + words[k+1][0])
            elif( words[k][1]=="JJ" and words[k+1][1]=="NN" ) or ( words[k][1]=="JJ" and words[k+1][1]=="NNS" ):
                p1 = words[k][0]
                p2 = words[k+1][0]
 #               phrases.append(words[k][0] + ' ' + words[k+1][0])
            elif( words[k][1]=="NN" and words[k+1][1]=="JJ" and words[k+2][1]!="NN" and words[k+2][1]!="NNS") or ( words[k][1]=="NNS" and words[k+1][1]=="JJ" and words[k+2][1]!="NN" and words[k+2][1]!="NNS"):
                p1 = words[k][0]
                p2 = words[k+1][0]
#                phrases.append(words[k][0] + ' ' + words[k+1][0])
            elif( words[k][1]=="RB" and words[k+1][1]=="JJ" and words[k+2][1]!="NN" and words[k+2][1]!="NNS") or ( words[k][1]=="RBR" and words[k+1][1]=="JJ" and words[k+2][1]!="NN" and words[k+2][1]!="NNS") or ( words[k][1]=="RBS" and words[k+1][1]=="JJ" and words[k+2][1]!="NN" and words[k+2][1]!="NNS"):
                p1 = words[k][0]
                p2 = words[k+1][0]
#                phrases.append(words[k][0] + ' ' + words[k+1][0])
            if len(p1) > 1 and len(p2) > 1:
                phrases.append(p1.lower() + ' ' + p2.lower())
    phrases = list(set(phrases))
    return phrases
          

def extract_pattern(doc):
    return extract(preprocess(doc))

In [40]:
extract_pattern(text)

['common purpose',
 'medical expenses',
 'comprehensive training',
 'first day',
 'many forms',
 'additional options',
 'clear choices',
 'eligible associates',
 'immersive training',
 'next step',
 'general merchandise',
 'tough calls',
 'better able',
 'retail industry',
 'free access',
 'preventive care',
 'medical condition',
 'fiscal year',
 'fresh fruits',
 'parental leave',
 'annual maximum',
 'continued support',
 'soft skills',
 'exclusive discounts',
 'superior customer',
 'eligible network',
 'long history',
 'entry-level positions',
 'medical plans',
 'competitive pay',
 'free confidential',
 'accidental death',
 'professional development',
 'associate discount',
 'frontline work',
 'holiday time',
 'enhanced maternity',
 'new parent',
 'affordable options',
 'foster-care placement',
 'two-week family',
 'working supercenter',
 're investing',
 'higher-paying jobs',
 'personal time',
 'legal parents',
 'sick time',
 'increased responsibility',
 'high performance',
 'aggress

In [11]:
###### apple ##########

f = open("apple.txt","r") 
text2 = f.read()
extract_pattern(text)
      
##### bi-gram ###### 
 
  

['common purpose',
 'medical expenses',
 'comprehensive training',
 'first day',
 'many forms',
 'additional options',
 'clear choices',
 'eligible associates',
 'immersive training',
 'next step',
 'general merchandise',
 'tough calls',
 'better able',
 'retail industry',
 'free access',
 'preventive care',
 'medical condition',
 'fiscal year',
 'fresh fruits',
 'parental leave',
 'annual maximum',
 'continued support',
 'soft skills',
 'exclusive discounts',
 'superior customer',
 'eligible network',
 'long history',
 'entry-level positions',
 'medical plans',
 'competitive pay',
 'free confidential',
 'accidental death',
 'professional development',
 'associate discount',
 'frontline work',
 'holiday time',
 'enhanced maternity',
 'new parent',
 'affordable options',
 'foster-care placement',
 'two-week family',
 'working supercenter',
 're investing',
 'higher-paying jobs',
 'personal time',
 'legal parents',
 'sick time',
 'increased responsibility',
 'high performance',
 'aggress

In [12]:
extract_phrase(text2)

###### NP, VP #######

['Inclusion',
 'Diversity Inclusion',
 'DiversityApple Open',
 'Humanity',
 'plural',
 'singular',
 'The',
 'way the world',
 'everybody',
 'Nobody',
 'the film A few updates',
 'd',
 'share',
 'Diverse teams',
 'innovation possible',
 'female representation',
 're proud of the progress',
 're making',
 'For example',
 'percent',
 'employees',
 'women',
 'That ’',
 'an increase',
 'percentage points',
 'Women at Apple',
 '%',
 '% %',
 '% % %',
 '% % years old Overall representation Grace',
 'software engineer',
 'laptop',
 'Quote',
 'a product',
 'work for a big group of people',
 'by a diverse group of people',
 'Grace',
 'software engineer Read Grace ’ s story',
 'on the contributions',
 'outstanding women leaders',
 'Twenty-nine percent',
 'leaders',
 'women a',
 'percentage point increase from July',
 'July',
 'during that same period of time',
 'the percentage of leaders',
 'women percentage points',
 'future generation of leaders',
 'an',
 'percentage of women',
 'Today',
 'perce

In [13]:
f = open("apple.txt","r") 
text2 = f.read()
sent_list = sent_tokenize(text2)
sent_list = [s.lower() for s in sent_list]
diverse_sent_list = []
for sent in sent_list:
    diverse_count = 0
    words = sent.split()
    for w in words:
        if w.lower() in word_list:
            diverse_count += 1
    if diverse_count > 0:
        diverse_sent_list.append(sent)
print (diverse_sent_list)

['\ninclusion & diversity\ninclusion & diversityapple\nopen.', 'diverse teams make innovation possible.', 'our female representation is steadily increasing, and we’re proud of the progress we’re making.', 'women at apple\n\n30%\n31%\n32%\n32%\n31%\n33%\n35%\n36%\n2014\n2015\n2016\n2017\nunder 30 years old\noverall representation\ngrace, software engineer, working on her laptop.', 'quote by\nif we want a product to appeal to and work for a big group of people, it needs to be built by a diverse group of people.', 'grace, software engineer\nread grace’s story\nwe depend on the contributions from our outstanding women leaders.', 'and during that same period of time, the percentage of leaders under 30 who are women has increased by 3 percentage points.', '29%\nof leaders at apple are women\n39%\nof our leaders under 30 are women\nkim, vice president of o.s.', 'quote by\ni’m passionate about welcoming more women into the tech industry.', 'kim, vice president of os programs\nread kim’s story\

In [14]:
import sys
import json
import gzip
import operator
import glob
from multiprocessing import Process
import zipfile
import io
import numpy as np
import pandas as pd


In [17]:
folder = '/home/ubuntu/diversity_mining/Letters_Raw/'
for filename in glob.glob(folder + '*.txt'):
#    print (filename)
    inputfile = open(filename,'r')
    doc = inputfile.read()
    print (doc)
    
    
    
    break

2017 Annual Report

BOARD OF DIRECTORS

 TIMOTHY J. BERNLOHR
Managing Member
TJB Management Consulting, LLC
Executive Committee, Compensation Committee,
Nominating and Corporate Governance Committee

 J. POWELL BROWN
President and Chief Executive Officer
Brown & Brown, Inc.
Audit Committee, Finance Committee

MICHAEL E. CAMPBELL
Former Chairman, President
and Chief Executive Officer
Arch Chemicals, Inc.
Compensation Committee,  
Nominating and Corporate Governance
Committee

 TERRELL K. CREWS
Former Executive Vice President
and Chief Financial Officer
Monsanto Corporation
Audit Committee, Finance Committee

RUSSELL M. CURREY
President
Boxwood Capital, LLC
Audit Committee, Finance Committee

 JOHN A. LUKE JR.
(Non-Executive Chairman)
Former Chairman and Chief Executive Officer
MeadWestvaco Corporation
Executive Committee

GRACIA C. MARTORE
Former President and Chief Executive Officer
TEGNA, Inc.
Executive Committee, Audit Committee,
Compensation Committee

 JAMES E. NEVELS
(Lead Indepe

In [18]:
ph = extract_pattern(doc)

In [19]:
len(ph)

2156

In [20]:
len(set(ph))

2156

In [21]:
ph

['fiscal years',
 'sufficient information',
 'effective date',
 'foreign subsidiaries',
 'global equity',
 'public companies',
 'corporate credit',
 'such proceedings',
 'interim goodwill',
 'such matters',
 'fine particulate',
 'many shares',
 'certain employees',
 'fact necessary',
 'public bond',
 'medical cost',
 'rates applicable',
 'such accruals',
 'year-to-date lump',
 'commercial card',
 'total reclassifications',
 'third-party disposal',
 'adversely affected',
 'cap-and-trade requirements',
 'similar programs',
 'obligations due',
 'single-employer pension',
 'greenfield containerboard',
 'years due',
 'complex laws',
 'other parties',
 'same reason',
 'collective bargaining',
 'other materials',
 'complex judgments',
 'other regulation',
 'following items',
 'third-party transactions',
 'basic earnings',
 'anticipated changes',
 'underlying accounts',
 'non-service cost',
 'following table',
 'environmental matters',
 'interrelated rulemakings',
 'certainty economic',
 'unde

In [43]:
folder = '/home/ubuntu/diversity_mining/Diversity_Data/'
#count = 0
phrase_list = []
for filename in glob.glob(folder + '*.txt'):
#    print (filename)
    inputfile = open(filename, 'r')
    
    text2 = inputfile.read()
    sent_list = sent_tokenize(text2)
    sent_list = [s.lower() for s in sent_list]
    diverse_sent_list = []
    for sent in sent_list:
        diverse_count = 0
        words = sent.split()
        for w in words:
            if w.lower() in word_list:
                diverse_count += 1
        if diverse_count > 0:
            diverse_sent_list.append(sent)
#    print (diverse_sent_list)
    sent_tagged = []
    for sent in diverse_sent_list:
        words = nltk.word_tokenize(sent)
        sent_tagged.append(nltk.pos_tag(words))
    


#     inputfile = open(filename,'r')
#     doc = inputfile.read()
    phrase_list.extend(extract(sent_tagged))
    

In [44]:
len(phrase_list)

3274

In [45]:
folder = '/home/ubuntu/diversity_mining/Letters_Raw/'
for filename in glob.glob(folder + '*.txt'):
    inputfile = open(filename, 'r')
    
    text2 = inputfile.read()
    sent_list = sent_tokenize(text2)
    sent_list = [s.lower() for s in sent_list]
    diverse_sent_list = []
    for sent in sent_list:
        diverse_count = 0
        words = sent.split()
        for w in words:
            if w.lower() in word_list:
                diverse_count += 1
        if diverse_count > 0:
            diverse_sent_list.append(sent)
#    print (diverse_sent_list)
    sent_tagged = []
    for sent in diverse_sent_list:
        words = nltk.word_tokenize(sent)
        sent_tagged.append(nltk.pos_tag(words))
    


#     inputfile = open(filename,'r')
#     doc = inputfile.read()
    phrase_list.extend(extract(sent_tagged))
#    print (filename)
#     inputfile = open(filename,'r')
#     doc = inputfile.read()
#     phrase_list.extend(extract_pattern(doc))
    

In [46]:
len(phrase_list)

34313

In [47]:
phrase_list = list(set(phrase_list))

In [51]:
len(phrase_list)

22031

In [49]:
phrase_list[:100]

['integrated water',
 'cutaneous manifestations',
 'smith brent',
 'eric charbonneau',
 'ullom james',
 'share/restricted share',
 'alex kasee',
 'corral quezada',
 'stezowski jr',
 'female consumers',
 'bryan kaai',
 'citibank capital',
 'average number',
 'jellison vice',
 'hayden richards',
 'michelle hatfield',
 'nebraska gainey',
 'leigh workman',
 'intelligent platform',
 'non-taxable return',
 'small differences',
 'indicated loss',
 'acquired inventory',
 'basic earnings',
 'extensive board',
 'nonperforming asset',
 'global conventional',
 'hsbc north',
 'undue hardship',
 'global focus',
 'chinese channel',
 'fiscal reporting',
 'further performance',
 'employee-giving campaigns',
 'colorectal screening',
 'christian gallups',
 'reinhardt john',
 'subramanian yogesh',
 'new chairs',
 'anderson tanner',
 's. chen�6',
 'multiple platforms',
 'financial experience',
 'different bonus',
 'other differences',
 'stewart edward',
 'john c.',
 'wright matthew',
 'pro rata',
 'colleen

In [50]:
f = open('dic_v1_1.txt',"w")
for s in phrase_list:
    f.write(s)
    f.write('\n')

In [62]:
from nltk.corpus import brown
from nltk import WordNetLemmatizer
from math import log 
wnl=WordNetLemmatizer()
from nltk.corpus import *
nltk.download('gutenberg')
nltk.download('brown')

[nltk_data] Downloading package gutenberg to /home/ubuntu/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package brown to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

In [66]:
genre=['editorial', 'fiction', 'government', 'hobbies',
'humor', 'learned', 'lore', 'mystery', 'news', 'reviews', 'romance',
'science_fiction']
sents=gutenberg.sents(gutenberg.fileids())+brown.sents(categories=genre)

In [67]:
len(sents)

142330

In [69]:
sents[100]

['You',
 'do',
 'not',
 'think',
 'I',
 'could',
 'mean',
 '_you_',
 ',',
 'or',
 'suppose',
 'Mr',
 '.',
 'Knightley',
 'to',
 'mean',
 '_you_',
 '.']

In [72]:
for sent in sents:
        
        sent_tagged.append(nltk.pos_tag(sent))
phrases = extract(sent_tagged)

In [73]:
len(phrases)

82038

In [89]:
unrelated = []
for p in phrases:
    w = p.split()
    if w[0].lower() in word_list or w[1].lower() in word_list:
        continue
    if w[0].isalpha() and w[1].isalpha():
        unrelated.append(p)


In [90]:
len(unrelated)

75867

In [91]:
unrelated = list(set(unrelated))

In [92]:
len(unrelated)

75867

In [94]:
neg = []
for i in range(75867):
    if i % 8 == 0:
        neg.append(unrelated[i])

In [95]:
neg

['basophilic nuclei',
 'however incredible',
 'blemish daily',
 'next sentence',
 'actually incapable',
 'next century',
 'loose stone',
 'single sunset',
 'partisan year',
 'new house',
 'final battle',
 'cursed world',
 'dirty ones',
 'entire personnel',
 'many iournies',
 'upon condition',
 'orthodontic appliances',
 'more evident',
 'little scissors',
 'final transcription',
 'whole sera',
 'bay hunter',
 'relative frequencies',
 'mutual funds',
 'utterly incurable',
 'few macaroons',
 'sound economies',
 'specific adjustment',
 'however preposterous',
 'external system',
 'angry pleasure',
 'little portion',
 'right frame',
 'altogether alien',
 'real line',
 'fragmentary phrases',
 'constant arc',
 'antagonistic influences',
 'superb lofty',
 'actual attained',
 'other landmarks',
 'respective duties',
 'brazen lamp',
 'exquisite realization',
 'british corporation',
 'other burden',
 'atavistic sense',
 'vaguely possible',
 'entire nation',
 'direct path',
 'girls eat',
 'foolis

In [96]:
f = open('unrelated.txt',"w")
for s in neg:
    f.write(s)
    f.write('\n')

In [113]:
pos = []
f = open('initial.txt','r')
for p in f:
    w = p.split()
    if len(w) == 2:
        if p[-1] == '\n' and '*' not in p:
            pos.append(p[:-1].lower())
        

In [114]:
pos

['resource groups',
 'targeted recruiting',
 'targeted recruitment',
 'equal access',
 'mentoring program',
 'diversity training',
 'diversity goal',
 'selection systems',
 'active participation',
 'education programs',
 'active engagement',
 'safe space',
 'equal selection',
 'equal opportunity',
 'diverse applicants',
 'smooth integration',
 'fair practices',
 'diverse leadership',
 'diverse incumbents',
 'affirmative action',
 'equal opportunity',
 'fair distribution',
 'labor rights',
 'equal wages',
 'pay gap',
 'fair rewards',
 'fair recognition',
 'equal work',
 'demographic group',
 'address concerns',
 'no discrimination',
 'personnel action',
 'developmental opportunity',
 'incident reports',
 'identity disclosure',
 'employee programs',
 'every employee',
 'significant resources',
 'staff time',
 'diverse workforce',
 'open forum',
 'managerial support',
 'diverse programs',
 'group effort',
 'diverse statistics',
 'addressed concerns',
 'diversity initiative',
 'compelling 

In [115]:
f = open('dic_v2_0.txt',"r")
for s in f:
    if s[-1] == '\n':
        s = s[:-1]
    pos.append(s)

In [116]:
len(pos)

612

In [117]:
pos = list(set(pos))

In [118]:
len(pos)

583

In [119]:
pos

['different programs',
 'free resources',
 'diverse strategies',
 'inclusive teams',
 'little different',
 'various information',
 'collective distribution',
 'cultural differences',
 'culture different',
 'periods different',
 'show courtesy',
 'different approach',
 'free expression',
 'varying expertise',
 'potential involvement',
 'varying mindsets',
 'actual differences',
 'active programs',
 'principal differences',
 'improved acceptance',
 'various environmental',
 'entirely different',
 'sustainable difference',
 'active employee',
 'different experience',
 'diverse knowledge',
 'many different',
 'diverse skills',
 'cumulative difference',
 'increased understanding',
 'various opportunities',
 'significant involvement',
 'usually different',
 'primary difference',
 'integrate knowledge',
 'relative difference',
 'inclusive thinking',
 'continuing involvement',
 'therefore different',
 'collective efforts',
 'various integrated',
 'small differences',
 'diverse distribution',
 

In [120]:
f = open('related.txt',"w")
for s in pos:
    f.write(s)
    f.write('\n')

In [121]:
cand = []
f = open('dic_v1_0.txt',"r")
for p in f:
    if p[-1] == '\n':
        p = p[:-1]
    ws = p.split()
    if ws[0].isalpha() and ws[1].isalpha():
        cand.append(p.lower())

In [122]:
len(cand)

83719

In [123]:
f = open('cand.txt',"w")
for s in cand:
    f.write(s)
    f.write('\n')