In [1]:
import spacy
import os
import pandas as pd
from spacy.matcher import Matcher
from google.colab import files

In [2]:
Condition = ['depending', 'necessary', 'appropriate',
                 'inappropriate', 'as needed', 'as applicable',
                 'otherwise reasonably', 'sometimes',
                 'from time to time']
Generalization = ['generally', 'mostly', 'widely',
                  'general', 'commonly',
                  'usually', 'normally', 'typically',
                  'largely', 'often', 'primarily',
                  'among other things']
Modality = ['may', 'might', 'can', 'could', 'would',
            'likely', 'possible', 'possibly']
Numeric_quantifier = ['anyone', 'certain', 'everyone',
                      'numerous', 'some', 'most', 'few',
                      'much', 'many', 'various',
                      'including but not limited to']

In [3]:
Path = "/content/Privacy Policy/"
filelist = os.listdir(Path)
txt_files = []
file_names = []
for i in filelist:
    if i.endswith(".txt"):
        file_names.append(i)
        with open(Path + i, 'r') as f:
            txt_files.append(f.read())

## Policy String

In [4]:
Policies = pd.DataFrame([file_names, txt_files]).T
Policies.columns = ["File_name", "text_content"]

In [5]:
Policies

Unnamed: 0,File_name,text_content
0,CrayonIconPack.txt,LineX Privacy PolicySkip to main contentJND AP...
1,SimpleGalleryPro:Video&PhotoManager&Editor.txt,Privacy policyGeneralSimple Gallery is develop...
2,BYJU'Sâ€“TheLearningApp.txt,Terms and Conditions of BYJU'S - The Learning ...
3,SAXVideoPlayer-AllinoneHdFormatpro2021.txt,HomeSearch this sitePrivacy PolicyProtecting y...
4,"ShareKaro-Share&FileTransferApp,Shareit.txt",Privacy PolicyPrivacy PolicyLast Updated: [29/...
...,...,...
72,QuickHealTotalSecurity.txt,PRIVACY POLICY- QUICK HEAL MOBILE DEVICE SECUR...
73,"MVBitmaster,MVmastervideostatusmaker-MVBit.txt",MvBitUSER AGREEMENT AND PRIVACY POLICY1. CONSE...
74,KWGTKustomWidgetProKey.txt,Privacy Policy | Kustom Heavy IndustriesAre yo...
75,"AmazonShopping,UPI,MoneyTransfer,BillPayment.txt",Amazon.in Help: Amazon.in Privacy NoticeSkip t...


In [6]:
# with open("/content/Telegram.txt", 'r') as f:
#   text = f.read()
# print(text)

## Index Tracking


In [7]:
def len_str(x):
  return len(x)

In [8]:
keepWords = Condition + Generalization + Modality + Numeric_quantifier
# print(keepWords)

index_tracker = {}
for num, i in enumerate(keepWords):
  index_tracker[i] = num
_ = keepWords.sort(key=len_str, reverse=True)

## Matcher

In [9]:
nlp = spacy.load('en')

matcher = Matcher(nlp.vocab)
for i in keepWords:
  rule = [ {"LOWER": j } for j in i.split() ]
  matcher.add(i, None, rule)

## Functions

In [10]:
def generateVec(sentence):
  text = nlp(sentence)

  final = []
  for i in range(len(keepWords)):
    final.append(0)

  buffer_start = -1
  for word, match_start, match_end in matcher(text):
    if buffer_start < match_start:
      # print(nlp.vocab.strings[word])
      final[index_tracker[nlp.vocab.strings[word]]] += 1
    buffer_start = match_end - 1

  if buffer_start==-1:
    return None

  return final

In [11]:
def generateMatrix(text_string):
  final = []
  tok = nlp(text_string)
  for i in tok.sents:
    vector = generateVec(i.text)
    if vector != None:
      final.append(generateVec(i.text))
  return final

In [12]:
def make_df(intext):
  visualization = pd.DataFrame(generateMatrix(intext))
  keepWords = Condition + Generalization + Modality + Numeric_quantifier
  visualization.columns = keepWords
  return visualization

## Generating Outputs

In [13]:
for i in Policies.iterrows():
  # print(i[1][0])
  temp = i[1][0][:-4]
  temp = "/content/output/" + temp + ".csv"
  try:
    make_df(i[1][1]).to_csv(temp)
  except:
    pass

In [14]:
print("CSVs GENERATED")

CSVs GENERATED


In [15]:
!zip -r result.zip output

  adding: output/ (stored 0%)
  adding: output/Snapchat.csv (deflated 92%)
  adding: output/JioSaavnMusic&Radioâ€“JioTunes,Podcasts,Songs.csv (deflated 92%)
  adding: output/PLAYit-ANewAll-in-OneVideoPlayer.csv (deflated 78%)
  adding: output/SAXVideoPlayer-AllinoneHdFormatpro2021.csv (deflated 50%)
  adding: output/InternetOptimizerPro|No-Ads.csv (deflated 80%)
  adding: output/GameBoosterVIP-FreeFireGFX-LagFix.csv (deflated 65%)
  adding: output/GaanaMusicHindiSongFreeTamilTeluguMP3App.csv (deflated 89%)
  adding: output/FacebookLite.csv (deflated 90%)
  adding: output/SnackyTakatak.csv (deflated 84%)
  adding: output/AJIOOnlineShopping-HandpickedCuratedFashion.csv (deflated 89%)
  adding: output/FlipkartOnlineShoppingApp.csv (deflated 87%)
  adding: output/WhatsAppBusiness.csv (deflated 89%)
  adding: output/MyJio:ForEverythingJio.csv (deflated 87%)
  adding: output/QuickHealTotalSecurity.csv (deflated 84%)
  adding: output/TorquePro(OBD2&Car).csv (deflated 60%)
  adding: output/Fac

# Manual Testing

In [16]:
in_text = "my name may be rishabh khanna may may from TIME to time everyone. My name depending on the context is generAlly confusing. My name is Rishabh Khanna."
make_df(in_text)

Unnamed: 0,depending,necessary,appropriate,inappropriate,as needed,as applicable,otherwise reasonably,sometimes,from time to time,generally,mostly,widely,general,commonly,usually,normally,typically,largely,often,primarily,among other things,may,might,can,could,would,likely,possible,possibly,anyone,certain,everyone,numerous,some,most,few,much,many,various,including but not limited to
0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
