In [15]:
import os # Helps work with directories
import nltk # Helps with NLP work
from nltk.corpus import stopwords # Get our stop words
from collections import Counter # Helps with word counts
import re # Work with regular expressions
import matplotlib.pyplot as plt # Viz package
import numpy as np # numpy is used for deeper data analysis in python
import pandas as pd # Used for data analysis
import collections # Used for iterating on various datasets

import functools

### Show current working directory

In [16]:
# Display the current working directory before it is changed below.
os.getcwd()

'C:\\Users\\dakersey\\Documents\\NLS\\NLS_Final\\Policy_Texts'

### Specify directories

In [17]:
# Locations of the raw policy extracts and of the cleaned output.
# Each can be overridden with an environment variable so the notebook can run
# on a machine other than the original author's; when the variable is unset
# the original hard-coded path is used, so existing behaviour is unchanged.
input_dir = os.environ.get(
    'NLS_INPUT_DIR',
    'C:\\Users\\dakersey\\Documents\\NLS\\NLS_Final\\Policy_Texts',
)
output_dir = os.environ.get(
    'NLS_OUTPUT_DIR',
    'C:\\Users\\dakersey\\Documents\\NLS\\NLS_Final\\Clean_Text_New',
)

### Change directory

In [18]:
# Make input_dir the working directory; relative paths used later in the
# notebook (e.g. the final to_csv call) resolve against it.
os.chdir(input_dir)

In [19]:
def bind_rows(dfs, ignore_index=True):
    """Stack DataFrames vertically into a single DataFrame.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Frames to stack, in order. Any iterable (including a generator) works.
    ignore_index : bool, default True
        Passed to ``pd.concat``; when True the result gets a fresh 0..n-1 index.

    Returns
    -------
    pandas.DataFrame
        All rows of all inputs; an empty DataFrame when ``dfs`` is empty.
    """
    # Materialise once so generators are supported and emptiness can be tested.
    frames = list(dfs)
    if not frames:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame()
    # A single concat copies each row once; concatenating pairwise re-copies
    # the accumulated rows at every step.
    return pd.concat(frames, ignore_index=ignore_index)

### Format dataframe

- Read each policy extract and combine the text so that every city occupies one row of a dataframe

In [20]:
header_list = ['Policy']
files = []
skipped = []  # (file name, error) for every extract that could not be read

# Sorted so the row order, and therefore the concatenated text per city, is
# the same on every run; os.listdir returns entries in arbitrary order.
for i in sorted(os.listdir(input_dir)):
    # Test the file name itself rather than the joined path, so a directory
    # whose name contains '_Extract' cannot make every file qualify.
    if '_Extract' not in i:
        continue
    txt = os.path.join(input_dir, i)
    city = i.split('_')[0]  # text before the first underscore of the file name
    try:
        # NOTE(review): read_csv splits on commas; lines that contain commas
        # may not land wholly in 'Policy' - confirm the extract format.
        f = pd.read_csv(txt, names=header_list)
    except (OSError, UnicodeDecodeError,
            pd.errors.EmptyDataError, pd.errors.ParserError) as err:
        # Still best-effort, but report the file instead of dropping it silently.
        skipped.append((i, err))
        print('Skipping %s: %s' % (i, err))
        continue
    f['City'] = city
    f['Policy'] = f['Policy'].astype(str)
    f = f[['City','Policy']]
    files.append(f)

if not files:
    raise FileNotFoundError("No readable '_Extract' files found in %r" % input_dir)

df = bind_rows(files)
# One row per city. Rows are joined with a space so the last word of one row
# and the first word of the next are not fused into a single token.
df = df.groupby('City', as_index=False).agg({'Policy': ' '.join})

### Define functions for cleaning text

In [21]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
# English stop words from NLTK, de-duplicated through a set (so the order of
# the resulting list is arbitrary); used for membership tests in clean_text.
stopWords = list(set(stopwords.words('english')))

# Lower-case contractions mapped to their expanded forms.
contractions_dict = {'didn\'t': 'did not','don\'t': 'do not',
                     'wouldn\'t': 'would not', 'won\'t': 'will not',
                    'can\'t': 'cannot', 'i\'ve': 'i have', 'i\'m': 'i am'}

def expand_contractions(s, contractions_dict=contractions_dict):
    """Replace every contraction found in ``s`` with its expansion.

    Parameters
    ----------
    s : str
        Text to process. Matching is case-sensitive, so callers should
        lower-case the text first when the mapping keys are lower-case.
    contractions_dict : dict of str -> str, optional
        Mapping of contraction to replacement. Defaults to the module-level
        ``contractions_dict``.

    Returns
    -------
    str
        ``s`` with each occurrence of a key replaced by its value.
    """
    if not contractions_dict:
        # An empty alternation would match the empty string at every position.
        return s
    # Escape keys so characters such as '.' are matched literally.
    contractions_re = re.compile(
        '(%s)' % '|'.join(re.escape(key) for key in contractions_dict)
    )
    # Look replacements up in the mapping that was passed in, not the global.
    return contractions_re.sub(lambda match: contractions_dict[match.group(0)], s)

def replace(match):
    """Return the expansion for a regex match, using the module-level mapping.

    Kept for backward compatibility; ``expand_contractions`` no longer relies on it.
    """
    return contractions_dict[match.group(0)]

### Clean text function
def clean_text(document):
    """Normalise one policy document into a space-separated string of tokens.

    The input is converted to ``str``, newlines are removed, the text is
    lower-cased, contractions are expanded and the result is tokenised.
    Stop words and tokens that do not start with a letter are dropped, and
    non-word characters are stripped from the tokens that remain.
    """
    text = str(document)
    # Newline removal
    text = text.replace('\n', ' ').replace('\r', '')
    # Lowercase (splitting and re-joining also collapses runs of whitespace)
    text = " ".join(piece.lower() for piece in text.split())
    text = expand_contractions(text)

    kept = []
    for token in word_tokenize(text):
        if token in stopWords:
            continue  # drop stop words
        if not re.search('^[a-zA-Z]+', token):
            continue  # drop numbers and anything else not starting with a letter
        # Remove non-alphanumeric characters from inside the token
        kept.append(re.sub(r'\W+', '', token))

    joined = ' '.join(kept)
    joined = re.sub('  ', ' ', joined)
    return joined.strip()

### Clean each policy

In [22]:
# Overwrite the raw policy text with its cleaned form; df keeps exactly the
# columns City and Policy, in that order.
df['Policy'] = df['Policy'].apply(clean_text)

### Define words indicative of policy presence

In [23]:
# Keyword lists, one per policy dimension. They are matched as substrings of
# the cleaned policy text, so every entry must be spelled the way it appears
# after cleaning, and must appear only once so it is not counted twice.
chokeholds = ["trachea", "windpipe", "throat", "neck", "chokehold", "stranglehold", "choke", "strangle", 'gag', 'suffocate',
              'muffle', 'strangulate', 'neckhold', 'head']
# clean_text strips non-word characters, so "de-escalation" appears in the
# cleaned text as "deescalation"; the hyphenated spelling could never match.
descalate = ["deescalation", 'mediation', 'preparation', 'training', 'appropriate', 'consent', 'peace', 'restraint', 
             'alternative', 'advisement', 'persuasion', 'verbal', 'containment'] 
# 'data' was listed twice; the duplicate is removed.
reporting = ["report", "record", "information", "data", "summary", "public", "notification", 'notify', 'annual',
                 'quarterly', 'monthly', 'manager', 'community', 'document', 'comprehensive']
intervention = ["intervene", 'intercede', "bystander", "arbitration", 'conciliation', 'negotiation',
                     "reconciliation", "intercession", "harmony", 'accord', 'professional', 'warning', 'disengagement']
tech = ["technology", "digital", "mobile", 'electronic', 'computer']
# 'violence' was listed twice; the duplicate is removed.
violence = ["violence", "death", "assault", "violent", "danger", "harm", "fire", "gun", 'weapon','knife',
            'critical', 'collision', 'blood', 'baton', 'abuse', 'imminent', 'offense', 'lethal', 'discharge', 'threat', 'deadly', 'force']
mov_vehicle = ["vehicle", "car", "moving", "driving", 'wheels', 'driver']
chuk = ['head']

In [24]:
def search_words(file, words):
    """Count how many distinct entries of ``words`` occur in ``file``.

    Parameters
    ----------
    file : str
        Text to search. Matching is by substring, so "fire" also matches
        "firearm".
    words : iterable of str
        Keywords to look for. Repeated entries are counted once.

    Returns
    -------
    int
        Number of distinct keywords found; 0 when ``words`` is empty.
    """
    # De-duplicate so a keyword listed twice does not inflate the score.
    return sum(1 for word in set(words) if word in file)

### Create dimensions

In [25]:
# Output column -> keyword list scored by search_words, in column order.
_dimension_keywords = [
    ('chokehold_ban', chokeholds),
    ('descalate', descalate),
    ('reporting', reporting),
    ('intervention', intervention),
    ('tech', tech),
    ('violence', violence),
    ('moving_vehicle', mov_vehicle),
]

# Each dimension is the number of its keywords found in the city's policy text.
for _column, _keywords in _dimension_keywords:
    df[_column] = df['Policy'].apply(lambda text: search_words(text, _keywords))

In [26]:
# Remove cities whose cleaned policy text is empty, plus the cities listed below.
_excluded_cities = ['Stockton','SanBernardino','Lincoln','Fremont','Bakersfield','FortWorth','Jacksonville']
_rows_to_drop = df.index[(df['Policy'] == "") | df['City'].isin(_excluded_cities)]
df.drop(index=_rows_to_drop, inplace=True)

In [27]:
# The text itself is no longer needed; only City and the dimension scores remain.
del df['Policy']

In [28]:
# Number of distinct cities left in the table after the filtering above.
df['City'].nunique()

39

In [31]:
# Save the per-city dimension scores. The path is relative, so the file is
# written to the current working directory (presumably input_dir, set by the
# earlier os.chdir call - verify no intermediate cell changes it); output_dir
# is not used here.
# NOTE(review): 'dimenstion' looks like a misspelling of 'dimension' - confirm
# whether downstream consumers expect this exact file name before renaming.
df.to_csv('dimenstion_table.csv', index=False)