# Bootstrapping Corpora

The corpus generating functionality in words.ipynb allows for corpora with various characteristics to be created.  However, we want to be able randomly generate corpora based on trial i.d. for bootstrapping purposes.

In [3]:
import pandas as pd
import matplotlib as plt
from collections import defaultdict
import random
import nlp_tools
import spacy,operator
nlp=spacy.load('en')
%matplotlib inline

In [4]:
worddatafile="../../voa/OBV2/obv_words_v2_28-01-2017.tsv"
trialdatafile="../../voa/OBV2/obv_defendants_trials.tsv"

worddata=pd.DataFrame.from_csv(worddatafile,sep='\t')
trialdata=pd.DataFrame.from_csv(trialdatafile,sep='\t')

  if self.run_code(code, result):


In [5]:
def make_countdict(alldata):
    countdict={}
    blacklist=['words','obc_hiscoCode']

    for heading in alldata.columns:
        #print('Generating counts for ' +heading)
        if heading not in blacklist:
            countdict[heading]=defaultdict(int)
            selection = alldata[heading]
            for item in selection:
                #print(item)
                countdict[heading][item]+=1
        else:
            #print('skipping')
            pass

    return countdict

def validated(reqlist,valuedata):
    
    reqdict={}
    for (field,value) in reqlist:
        
        parts=field.split(':')
        if len(parts)==1:
            if field in valuedata.keys():
                if isinstance(value, list):
                    ok=[]
                    for v in value:
                        if v in valuedata[field].keys():
                            ok.append(v)
                    if len(ok)>0:
                        reqdict[field]=ok
                elif value in valuedata[field].keys():
                    reqdict[field]=value
                
            
        else:
            if (parts[1]=="max" or parts[1]=="min") and parts[0] in valuedata.keys():
                
                if isinstance(value,list):
                   print("Error: min and max cannot be list")
                
                elif value in valuedata[parts[0]].keys() and isinstance(value,int):
                    reqdict[field]=value
            
    return reqdict


def find_trials(worddf,trialdf,reqlist,join='obo_trial'):
    
    trialreqdict=validated(reqlist,make_countdict(trialdf))
    wordsreqdict=validated(reqlist,make_countdict(worddf))
    
    print(trialreqdict)
    print(wordsreqdict)
    ok=True
    for (req,_value) in reqlist:
        if req in trialreqdict.keys() or req in wordsreqdict.keys():
            pass
        else:
            print("Requirement {} not satisfied".format(req))
            ok=False
     
    if not ok:
        return None
        
    
    trials=trialdf
    for req in trialreqdict.keys():
        parts=req.split(':')
        value =trialreqdict[req]
        if len(parts)>1:
            if parts[1]=='max':
                trials=trials[trials[parts[0]]<=value]
            elif parts[1]=='min':
                trials=trials[trials[parts[0]]>=value]
        elif isinstance(value,list):
            trials=trials[trials[req].isin(value)]                  
        else:                      
            trials=trials[trials[req]==value]
            
            
    selection=[line for line in trials[join]]
    return selection



In [6]:
def bootstrap1(wdf,tdf,reqs):
    trials=find_trials(wdf,tdf,reqs)
    #print(len(trials),trials)
    c=bootstrap_corpus(wdf,trials,reqs)
    print(c)
    

def bootstrap_corpus(worddata,trials,reqs):
    N=len(trials)
    corpus=[]
    #N=1
    allreqdict=validated(reqs,make_countdict(worddata))
    for i in range(0,N):
        atrial=random.choice(trials)
        wdf=worddata[worddata['obo_trial']==atrial]
        for req in allreqdict.keys():
            parts=req.split(':')
            value=allreqdict[req]
            if len(parts)>1:
                if parts[1]=='max':
                    wdf=wdf[wdf[parts[0]]<=value]
                elif parts[1]=='min':
                    wdf=wdf[wdf[parts[0]]>=value]
            elif isinstance(value,list):
                wdf=wdf[wdf[req].isin(value)]
            else:
                wdf=wdf[wdf[req]==value]
            
            
          
        corpus+=[line for line in wdf['words']]
    return corpus
        
 
    
    
    

In [7]:
allreqlist=[('deft_offcat','theft'),('year:min',1800),('year:max',1820),('obv_role',['def','wv'])]

bootstrap1(worddata,trialdata,allreqlist)

{'deft_offcat': 'theft', 'year:min': 1800, 'year:max': 1820}
{'year:min': 1800, 'year:max': 1820, 'obv_role': ['def', 'wv']}


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [8]:
#For a given set of corpora, find the frequency distribution of the k highest frequency words
#Output total size of corpus and sorted list of term, frequency pairs

def find_hfw_dist(corpora,k=100000):
    #add worddicts for individual corpora
    #sort and output highest frequency words
    #visualise
    
    sumdict={}
    corpussize=0
    for acorpus in corpora:
        for(key,value) in acorpus.allworddict.items():
            sumdict[key.lower()]=sumdict.get(key.lower(),0)+value
            corpussize+=value
      
    print("Size of corpus is {}".format(corpussize))
    candidates=sorted(sumdict.items(),key=operator.itemgetter(1),reverse=True)
    #print(candidates[:50])
    #print(len(sumdict))
    #print(sumdict)
    return corpussize,candidates[:k]



In [9]:
def compare(corpusA,corpusB,indicatordict):
    sizeA,hfwA=find_hfw_dist([corpusA])
    sizeB=corpusB.wordtotal
    
    for (word,freqA) in hfwA:
        freqB=corpusB.allworddict.get(word,0)
        probA=freqA/sizeA
        probB=freqB/sizeB
        if probA>probB:
            indicatordict[word]=indicatordict.get(word,0)+1
    return indicatordict
        
def bootstrap_compare(corpusAreqs,allreqs=allreqlist,worddata=worddata,trialdata=trialdata,repeats=10,prop=100):
    print("Finding trials to meet requirements")
    trialsB=find_trials(worddata,trialdata,allreqs)
    print(len(trialsB))
    trialsA=find_trials(worddata,trialdata,allreqs+corpusAreqs)
    print(len(trialsA))
    indicatordict={}
    for i in range(0,repeats):
        print("Bootstrapping corpusB repetition {}".format(i))
        corpB=bootstrap_corpus(worddata,trialsB,allreqs)
        print("Analysing corpus")
        corpusB=nlp_tools.corpus(corpB,nlp,prop=prop,ner=False,loadfiles=False)
        for j in range(0,repeats):
            print("Bootstrapping corpusA repetition {}".format(j))
            corpA=bootstrap_corpus(worddata,trialsA,allreqs+corpusAreqs)
            print("Analysing corpus")
            corpusA=nlp_tools.corpus(corpA,nlp,prop=prop,ner=False,loadfiles=False)
            print("Comparing corpora")
            indicatordict=compare(corpusA,corpusB,indicatordict)

    print("Generating candidates")
    N=repeats*repeats
    candidates=[(term,(value+1)/(N+1)) for (term,value) in indicatordict.items()]
    sortedlist=sorted(candidates,key=operator.itemgetter(1),reverse=True)
    return sortedlist

In [10]:
Areqs=[('obc_sex','f')]
#candidates=bootstrap_compare(Areqs,repeats=10)

In [12]:
#print(candidates[:10])

## Adding a Random Characteristic

Add a column to the words data frame with a random boolean feature which can be used to bootstrap random splits.  This is just for testing purposes - functionality will be added to BootstrapCorpus.py



In [14]:
worddata.head()

Unnamed: 0_level_0,sess_date,year,obo_trial,obo_deftid,obc_u_no,obc_event,obc_speaker,obc_sex,obc_hiscoLabel,obc_hiscoCode,obc_class,obc_role,obv_role,words,obv_words_type,words_count,defendant
obv2wid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,17800112,1780,t17800112-1,,1,17800112-1,17800112-0042,m,Working Proprietor (Guest House),51040.0,higher (1-5),,wv,"I live at No. 7, in Great Suffolk-street, Char...",s,40,MARY DYER
2,17800112,1780,t17800112-1,,2,17800112-2,17800112-?LAW,m,Lawyer,12110.0,higher (1-5),Lawyer,lj,Was that strange woman the prisoner?,q,6,MARY DYER
3,17800112,1780,t17800112-1,,3,17800112-3,17800112-0042,m,Working Proprietor (Guest House),51040.0,higher (1-5),,wv,Yes; she said she only wanted a pint of purl.,a,10,MARY DYER
4,17800112,1780,t17800112-1,,4,17800112-4,17800112-?LAW,m,Lawyer,12110.0,higher (1-5),Lawyer,lj,Do you keep a public-house?,q,5,MARY DYER
5,17800112,1780,t17800112-1,,5,17800112-5,17800112-0042,m,Working Proprietor (Guest House),51040.0,higher (1-5),,wv,"No, a private house. I sent for a constable, a...",a,16,MARY DYER


In [29]:
sLength=len(worddata)
print(sLength)

217376


In [26]:
import numpy as np
t=18
random=pd.Series(['A' if r<t else 'B' for r in 100*np.random.random(sLength)])
print(random[:10])

0    A
1    A
2    B
3    B
4    A
5    B
6    B
7    B
8    A
9    B
dtype: object


In [27]:
worddata=worddata.assign(random=random.values)
worddata.head()

Unnamed: 0_level_0,sess_date,year,obo_trial,obo_deftid,obc_u_no,obc_event,obc_speaker,obc_sex,obc_hiscoLabel,obc_hiscoCode,obc_class,obc_role,obv_role,words,obv_words_type,words_count,defendant,random
obv2wid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,17800112,1780,t17800112-1,,1,17800112-1,17800112-0042,m,Working Proprietor (Guest House),51040.0,higher (1-5),,wv,"I live at No. 7, in Great Suffolk-street, Char...",s,40,MARY DYER,A
2,17800112,1780,t17800112-1,,2,17800112-2,17800112-?LAW,m,Lawyer,12110.0,higher (1-5),Lawyer,lj,Was that strange woman the prisoner?,q,6,MARY DYER,A
3,17800112,1780,t17800112-1,,3,17800112-3,17800112-0042,m,Working Proprietor (Guest House),51040.0,higher (1-5),,wv,Yes; she said she only wanted a pint of purl.,a,10,MARY DYER,B
4,17800112,1780,t17800112-1,,4,17800112-4,17800112-?LAW,m,Lawyer,12110.0,higher (1-5),Lawyer,lj,Do you keep a public-house?,q,5,MARY DYER,B
5,17800112,1780,t17800112-1,,5,17800112-5,17800112-0042,m,Working Proprietor (Guest House),51040.0,higher (1-5),,wv,"No, a private house. I sent for a constable, a...",a,16,MARY DYER,A
