In [48]:
import spacy
import os
import pandas as pd
from spacy.matcher import Matcher
from google.colab import files

In [49]:
# Vague-term lexicon: one list per vagueness category. Multi-word entries are
# whitespace-separated phrases.

# Category "C"
Condition = [
    'depending',
    'necessary',
    'appropriate',
    'inappropriate',
    'as needed',
    'as applicable',
    'otherwise reasonably',
    'sometimes',
    'from time to time',
]

# Category "G"
Generalization = [
    'generally',
    'mostly',
    'widely',
    'general',
    'commonly',
    'usually',
    'normally',
    'typically',
    'largely',
    'often',
    'primarily',
    'among other things',
]

# Category "M"
Modality = [
    'may',
    'might',
    'can',
    'could',
    'would',
    'likely',
    'possible',
    'possibly',
]

# Category "N"
Numeric_quantifier = [
    'anyone',
    'certain',
    'everyone',
    'numerous',
    'some',
    'most',
    'few',
    'much',
    'many',
    'various',
    'including but not limited to',
]

In [50]:
# Directory holding the policy documents; every "*.txt" entry in it is loaded.
Path = "/content/Costco/"
# os.listdir returns entries in arbitrary order; sort them so file_names /
# txt_files (and every table derived from them) come out in the same order on
# every run.
filelist = sorted(os.listdir(Path))
txt_files = []   # full text of each policy, parallel to file_names
file_names = []  # names of the loaded .txt files
for i in filelist:
    if i.endswith(".txt"):
        file_names.append(i)
        # Explicit encoding: open()'s default depends on the platform locale.
        with open(os.path.join(Path, i), 'r', encoding="utf-8") as f:
            txt_files.append(f.read())

## Bradley-Terry Coefficients

In [51]:
# Bradley-Terry coefficient for every non-empty combination of the four
# category letters (C, G, M, N). Each key lists its letters in alphabetical
# order; entries are written in ascending order of coefficient.
bt_coef = dict(
    CN=1.619,
    C=1.783,
    CM=1.864,
    CMN=2.125,
    CG=2.345,
    CGN=2.443,
    MN=2.569,
    N=2.710,
    M=2.865,
    CGMN=2.899,
    CGM=2.968,
    GN=3.281,
    GMN=3.506,
    G=3.550,
    GM=4.045,
)

## Policy String

In [52]:
# One row per loaded policy file. The two parallel lists are stacked as rows
# and transposed, which leaves integer column labels 0 and 1; those are then
# renamed to the final column names.
Policies = (
    pd.DataFrame([file_names, txt_files])
    .transpose()
    .rename(columns={0: "File_name", 1: "text_content"})
)

In [53]:
# Preview only the first rows: text_content holds whole policy documents, so
# displaying the full frame does not scale past a handful of files.
Policies.head()

Unnamed: 0,File_name,text_content
0,Costco.txt,PERSONAL INFORMATION WE COLLECT\nThe personal ...


In [54]:
# with open("/content/Telegram.txt", 'r') as f:
#   text = f.read()
# print(text)

## Index Tracking


In [55]:
def len_str(x):
  """Return the length of ``x``; intended for use as a sort key."""
  size = len(x)
  return size

In [56]:
# Every vague term, in category order: Condition, Generalization, Modality,
# Numeric_quantifier.
keepWords = Condition + Generalization + Modality + Numeric_quantifier

# Position of each term in the *unsorted* list. make_df labels its count
# columns in this same order, so the mapping has to be captured before the
# sort below reorders keepWords.
index_tracker = {term: position for position, term in enumerate(keepWords)}

# Longest terms first (the sort is stable, so equal-length terms keep their
# relative order). This is the order in which patterns are later registered
# with the matcher.
# list.sort() returns None, and binding that to `_` would clobber IPython's
# "last output" variable, so the result is not assigned.
keepWords.sort(key=len, reverse=True)

## Matcher

In [57]:
# Load the English pipeline. The 'en' shortcut link exists only in spaCy v2;
# v3 needs the full package name, so try that first and fall back to the
# shortcut when the package is not installed under that name.
# NOTE(review): assumes the 'en' link pointed at en_core_web_sm - confirm.
try:
  nlp = spacy.load('en_core_web_sm')
except OSError:
  nlp = spacy.load('en')

matcher = Matcher(nlp.vocab)
for i in keepWords:
  # One token spec per whitespace-separated word, compared on the lowercased
  # token text so matching is case-insensitive.
  rule = [ {"LOWER": j } for j in i.split() ]
  # The term itself is the match key. Patterns are passed as a list: this form
  # is accepted by spaCy >= 2.2.2 and is the only form v3 accepts (the old
  # `add(key, None, pattern)` call fails there).
  matcher.add(i, [rule])

## Functions

In [58]:
def generateVec(sentence):
  """Count vague-term matches in one sentence and classify the sentence.

  Returns ``None`` when the matcher finds nothing. Otherwise returns a list
  with one count per vague term (positions given by ``index_tracker``),
  followed by the category string (the letters among C, G, M, N that
  occurred, in that order) and the ``bt_coef`` value for that string.
  """
  doc = nlp(sentence)

  counts = [0] * len(keepWords)
  seen = {"C": 0, "G": 0, "M": 0, "N": 0}

  # Index of the last token of the most recently visited match; a match is
  # counted only if it starts after that token.
  last_token = -1
  for match_id, start, end in matcher(doc):
    if start > last_token:
      term = nlp.vocab.strings[match_id]
      counts[index_tracker[term]] += 1
      if term in Condition:
        letter = "C"
      elif term in Modality:
        letter = "M"
      elif term in Numeric_quantifier:
        letter = "N"
      else:
        # Anything not in the three lists above is treated as Generalization.
        letter = "G"
      seen[letter] = 1
    last_token = end - 1

  # Still at the sentinel value: the matcher produced no matches at all.
  if last_token == -1:
    return None

  category = "".join(letter for letter, flag in seen.items() if flag)
  counts.append(category)
  counts.append(bt_coef[category])

  return counts

In [59]:
def generateMatrix(text_string):
  """Build one ``generateVec`` row per sentence that contains a vague term.

  The text is split into sentences with the spaCy pipeline (``.sents``);
  sentences for which ``generateVec`` returns ``None`` are dropped.

  Returns a list of rows, in sentence order (possibly empty).
  """
  final = []
  tok = nlp(text_string)
  for i in tok.sents:
    # Compute the row once and reuse it: every generateVec call re-runs the
    # NLP pipeline on the sentence.
    vector = generateVec(i.text)
    if vector is not None:
      final.append(vector)
  return final

In [60]:
def make_df(intext):
  """Return the per-sentence vagueness table for ``intext`` as a DataFrame.

  One row per sentence kept by ``generateMatrix``. Columns are every vague
  term (Condition, Generalization, Modality, Numeric_quantifier, in that
  order) followed by "Category" and "BT Coeff".

  When no sentence contains a vague term the result is an empty frame that
  still carries all columns; assigning the labels after construction raised
  ``ValueError`` in that case because the frame had no columns to rename.
  """
  columns = Condition + Generalization + Modality + Numeric_quantifier + ["Category", "BT Coeff"]
  return pd.DataFrame(generateMatrix(intext), columns=columns)

## Generating Outputs

In [61]:
# Directory that receives one CSV per policy. Create it up front so a missing
# folder does not make every to_csv call fail and get recorded as "NA".
OUTPUT_DIR = "/content/output/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

coefs = []
for file_name, policy_text in zip(Policies["File_name"], Policies["text_content"]):
  # e.g. "Costco.txt" -> "/content/output/Costco.csv"
  csv_path = os.path.join(OUTPUT_DIR, os.path.splitext(file_name)[0] + ".csv")
  try:
    policy_df = make_df(policy_text)
    policy_df.to_csv(csv_path)
    # The policy's score is the mean BT coefficient over its vague sentences.
    coefs.append(policy_df["BT Coeff"].mean())
  except Exception as err:
    # Keep processing the remaining policies, but report which one failed and
    # why instead of hiding the error.
    print("Could not score", file_name, "->", repr(err))
    coefs.append("NA")

Policies["vague_score"] = coefs

In [62]:
# Status message; printed unconditionally once this cell runs.
print("CSVs GENERATED")

CSVs GENERATED


In [63]:
# !zip -r result.zip output

# Manual Testing

In [68]:
in_text = "my name may be rishabh khanna as needed or as applicable from TIME to time everyone. My name depending on the context is generAlly confusing. My name is Rishabh Khanna."
# Build the table once: every make_df call re-runs the NLP pipeline on the
# whole text.
bt_scores = make_df(in_text)["BT Coeff"]
# Mean BT coefficient over the sentences that contain a vague term. Built-in
# sum adds left to right starting from 0, the same accumulation order as a
# hand-written loop.
temp = sum(bt_scores) / len(bt_scores)
print(temp)

2.2350000000000003


In [65]:
# Save the Policies table as "Policies.csv" (relative path, so it lands in the
# current working directory).
Policies.to_csv("Policies.csv")

In [66]:
# Print each policy's score, then show summary statistics for the column.
for policy_name, score in zip(Policies["File_name"], Policies["vague_score"]):
  print(policy_name, ": ", score)

Policies["vague_score"].describe()

Costco.txt :  2.773666666666669


count    1.000000
mean     2.773667
std           NaN
min      2.773667
25%      2.773667
50%      2.773667
75%      2.773667
max      2.773667
Name: vague_score, dtype: float64