# Bootstrapping Corpora

The corpus-generating functionality in words.ipynb allows corpora with various characteristics to be created.  However, we also want to be able to randomly generate corpora based on trial ID for bootstrapping purposes.

In [39]:
import pandas as pd
import matplotlib as plt
from collections import defaultdict
import random
import nlp_tools
import spacy,operator
# Load the spaCy 'en' pipeline once; the resulting object is passed to nlp_tools.corpus further down.
nlp=spacy.load('en')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Tab-separated inputs: one file of word-level rows, one file of trial-level rows.
worddatafile="../../voa/OBV2/obv_words_v2_28-01-2017.tsv"
trialdatafile="../../voa/OBV2/obv_defendants_trials.tsv"

# DataFrame.from_csv was deprecated and later removed from pandas. read_csv with
# index_col=0 and parse_dates=True reproduces its defaults (first column used as
# the index, index parsed as dates where possible).
worddata=pd.read_csv(worddatafile,sep='\t',index_col=0,parse_dates=True)
trialdata=pd.read_csv(trialdatafile,sep='\t',index_col=0,parse_dates=True)

  if self.run_code(code, result):


In [58]:
def make_countdict(alldata):
    """Count how often each value occurs in every non-skipped column.

    Returns a dict keyed by column heading. Each entry is a defaultdict(int)
    mapping a value seen in that column to its number of occurrences.
    The 'words' and 'obc_hiscoCode' columns are not counted.
    """
    skipped=('words','obc_hiscoCode')
    counts_by_column={}

    for column in alldata.columns:
        if column in skipped:
            continue
        tally=defaultdict(int)
        for cell in alldata[column]:
            tally[cell]+=1
        counts_by_column[column]=tally

    return counts_by_column

def validated(reqlist,valuedata):
    """Keep only the requirements that can be matched against valuedata.

    reqlist is an iterable of (field, value) pairs and valuedata maps a field
    name to a mapping whose keys are the values known for that field.

    * Plain field, scalar value: kept when the value is a known value.
    * Plain field, list value: reduced to the known values; kept if any remain.
    * 'field:min' / 'field:max': kept when the field is known and the value is
      an int that is itself one of the known values. A list value only prints
      an error message.

    Returns a dict mapping each accepted requirement name to its value.
    """
    accepted={}
    for field,value in reqlist:
        name,*modifiers=field.split(':')

        if not modifiers:
            # Plain equality / membership requirement.
            if field not in valuedata:
                continue
            known=valuedata[field].keys()
            if isinstance(value,list):
                present=[v for v in value if v in known]
                if present:
                    accepted[field]=present
            elif value in known:
                accepted[field]=value
            continue

        # Bounded requirement: only ':min' and ':max' on a known field are recognised.
        if modifiers[0] not in ("max","min") or name not in valuedata:
            continue
        if isinstance(value,list):
            print("Error: min and max cannot be list")
        elif value in valuedata[name].keys() and isinstance(value,int):
            accepted[field]=value

    return accepted


def find_trials(worddf,trialdf,reqlist,join='obo_trial'):
    """Return the join-column values of the trials that satisfy reqlist.

    Each requirement is validated separately against the trial table and the
    word table, and both validated dicts are printed. If a requirement is
    accepted by neither table, a message is printed for it and None is returned.
    Otherwise the trial table is filtered by the requirements it accepted and
    the values of its `join` column are returned as a list.
    """
    trialreqdict=validated(reqlist,make_countdict(trialdf))
    wordsreqdict=validated(reqlist,make_countdict(worddf))

    print(trialreqdict)
    print(wordsreqdict)

    # Every requirement has to be answerable by at least one of the two tables.
    unmet=[req for (req,_value) in reqlist
           if req not in trialreqdict and req not in wordsreqdict]
    for req in unmet:
        print("Requirement {} not satisfied".format(req))
    if unmet:
        return None

    matching=trialdf
    for req,value in trialreqdict.items():
        column,*bound=req.split(':')
        if bound:
            if bound[0]=='max':
                mask=matching[column]<=value
            elif bound[0]=='min':
                mask=matching[column]>=value
            else:
                # Unrecognised modifier: leave the selection untouched.
                continue
        elif isinstance(value,list):
            mask=matching[req].isin(value)
        else:
            mask=matching[req]==value
        matching=matching[mask]

    return list(matching[join])



In [66]:
def bootstrap1(wdf,tdf,reqs):
    """Build a single bootstrapped corpus for reqs and print it.

    wdf is the word table, tdf the trial table and reqs a list of
    (field, value) requirements. Nothing is returned. If find_trials cannot
    satisfy the requirements (it returns None), the function returns without
    building a corpus.
    """
    trials=find_trials(wdf,tdf,reqs)
    if trials is None:
        # find_trials has already printed which requirement failed; without a
        # trial list there is nothing to sample from.
        return
    c=bootstrap_corpus(wdf,trials,reqs)
    print(c)
    

def bootstrap_corpus(worddata,trials,reqs,join='obo_trial'):
    """Draw a bootstrap sample of trials and collect their matching words.

    len(trials) trials are drawn from `trials` with replacement
    (random.choice). For each draw, the 'words' entries of the rows in
    worddata that belong to that trial and satisfy the validated requirements
    are appended to the corpus.

    Parameters
    ----------
    worddata : DataFrame with a 'words' column and the `join` column.
    trials   : list of trial identifiers to sample from.
    reqs     : list of (field, value) requirements; only those that validate
               against worddata are applied.
    join     : name of the column in worddata holding the trial identifier.

    Returns
    -------
    list of the collected 'words' entries, in draw order.
    """
    allreqdict=validated(reqs,make_countdict(worddata))

    # None of the filters depend on which trial is drawn, so apply them once
    # here rather than re-scanning the whole table on every draw.
    candidates=worddata[worddata[join].isin(trials)]
    for req,value in allreqdict.items():
        parts=req.split(':')
        if len(parts)>1:
            if parts[1]=='max':
                candidates=candidates[candidates[parts[0]]<=value]
            elif parts[1]=='min':
                candidates=candidates[candidates[parts[0]]>=value]
        elif isinstance(value,list):
            candidates=candidates[candidates[req].isin(value)]
        else:
            candidates=candidates[candidates[req]==value]

    # Row order inside each group is the original row order, so each list is
    # identical to what per-trial filtering would produce.
    words_by_trial={
        trial:list(words)
        for trial,words in candidates.groupby(join,sort=False)['words']
    }

    corpus=[]
    for _ in range(len(trials)):
        atrial=random.choice(trials)
        # A trial with no surviving rows contributes nothing.
        corpus+=words_by_trial.get(atrial,[])
    return corpus
        
 
    
    
    

In [61]:
# Baseline requirements as (field, value) pairs: a ':min' / ':max' suffix
# bounds a field, and a list value accepts any of the listed values.
allreqlist=[
    ('deft_offcat','theft'),
    ('year:min',1800),
    ('year:max',1820),
    ('obv_role',['def','wv']),
]

bootstrap1(worddata,trialdata,allreqlist)

{'year:min': 1800, 'year:max': 1820, 'obv_role': ['def', 'wv']}
{'deft_offcat': 'theft', 'year:min': 1800, 'year:max': 1820}
['I live at Witton , a little village in the parish of Twickenham.', "No, I went out at five o'clock in the morning. I left my wife and two children in the house. My wife returned home first.", "On the 10th of August, I went out about ten o'clock. I locked the door, and took the key with me. I left nobody in the house. I fastened the window with a wooden-pin. I returned home about half after three in the afternoon, I found the window broken, and set wide open. It was a little sash that goes back, sliding in a groove.", 'One of the panes were broken. I secured the window that morning with a wooden-pin.', 'Yes, without the pane had been broken, and they had taken out the peg. The window was on the ground floor. The window was large enough to let a man get in.', "As soon as I got into the door, I found my house robbed, my property was gone, and my box standing open;

In [38]:
#For a given set of corpora, find the frequency distribution of the k highest frequency words
#Output total size of corpus and sorted list of term, frequency pairs

def find_hfw_dist(corpora,k=100000):
    """Merge the word counts of several corpora and rank the k most frequent.

    Each corpus must expose an `allworddict` mapping of word -> count. Words
    are lower-cased before their counts are merged. The combined size is
    printed, and (corpussize, ranked) is returned, where ranked is a list of
    (word, count) pairs in descending count order, truncated to k entries.
    """
    merged={}
    total=0
    for corpus in corpora:
        for word,count in corpus.allworddict.items():
            folded=word.lower()
            merged.setdefault(folded,0)
            merged[folded]+=count
            total+=count

    print("Size of corpus is {}".format(total))
    ranked=sorted(merged.items(),key=lambda pair:pair[1],reverse=True)
    return total,ranked[:k]



In [67]:
def compare(corpusA,corpusB,indicatordict):
    """Tally the words that are relatively more frequent in corpusA than corpusB.

    For every word in corpusA's frequency list (lower-cased by find_hfw_dist),
    its relative frequency in corpusA is compared with its relative frequency
    in corpusB. The word's entry in indicatordict is incremented by one when
    corpusA's is strictly larger.

    corpusA and corpusB must expose `allworddict` (word -> count); corpusB
    must also expose `wordtotal`. indicatordict is updated in place and
    returned.
    """
    sizeA,hfwA=find_hfw_dist([corpusA])
    sizeB=corpusB.wordtotal

    # hfwA is keyed by lower-cased words, so fold corpusB's counts the same
    # way. Otherwise case variants in corpusB are missed by the lookup.
    foldedB={}
    for wordB,countB in corpusB.allworddict.items():
        keyB=wordB.lower()
        foldedB[keyB]=foldedB.get(keyB,0)+countB

    for (word,freqA) in hfwA:
        probA=freqA/sizeA
        # An empty corpusB has probability 0 for every word; avoid dividing by zero.
        probB=foldedB.get(word,0)/sizeB if sizeB else 0
        if probA>probB:
            indicatordict[word]=indicatordict.get(word,0)+1
    return indicatordict
        
def bootstrap_compare(corpusAreqs,allreqs=allreqlist,worddata=worddata,trialdata=trialdata,repeats=10,prop=100):
    """Score words by how often they are more frequent in corpus A than corpus B.

    Corpus B is bootstrapped from the trials meeting allreqs; corpus A from
    the trials meeting allreqs plus corpusAreqs. B is resampled `repeats`
    times, and for each B sample A is resampled `repeats` times and compared
    against it. Each word's score is (wins + 1) / (repeats*repeats + 1).

    Returns a list of (term, score) pairs sorted by descending score.
    """
    def analysed(sample):
        # Wrap a bootstrapped list of texts in an nlp_tools corpus object.
        print("Analysing corpus")
        return nlp_tools.corpus(sample,nlp,prop=prop,ner=False,loadfiles=False)

    print("Finding trials to meet requirements")
    trialsB=find_trials(worddata,trialdata,allreqs)
    print(len(trialsB))
    reqsA=allreqs+corpusAreqs
    trialsA=find_trials(worddata,trialdata,reqsA)
    print(len(trialsA))

    wins={}
    for outer in range(repeats):
        print("Bootstrapping corpusB repetition {}".format(outer))
        corpusB=analysed(bootstrap_corpus(worddata,trialsB,allreqs))
        for inner in range(repeats):
            print("Bootstrapping corpusA repetition {}".format(inner))
            corpusA=analysed(bootstrap_corpus(worddata,trialsA,reqsA))
            print("Comparing corpora")
            wins=compare(corpusA,corpusB,wins)

    print("Generating candidates")
    comparisons=repeats*repeats
    scored=[(term,(count+1)/(comparisons+1)) for term,count in wins.items()]
    return sorted(scored,key=lambda pair:pair[1],reverse=True)

In [68]:
# Corpus A adds the requirement obc_sex == 'f' on top of the baseline requirements.
Areqs=[
    ('obc_sex','f'),
]
candidates=bootstrap_compare(corpusAreqs=Areqs,repeats=10)

Finding trials to meet requirements
{'year:min': 1800, 'year:max': 1820, 'obv_role': ['def', 'wv']}
{'deft_offcat': 'theft', 'year:min': 1800, 'year:max': 1820}
38952
{'year:min': 1800, 'year:max': 1820, 'obv_role': ['def', 'wv'], 'obc_sex': 'f'}
{'deft_offcat': 'theft', 'year:min': 1800, 'year:max': 1820}
6995
Bootstrapping corpusB repetition 0
Analysing corpus
Running basic analysis
Analysing 100%. Chunks of size 186953
Completed 186953 docs (10.000053489665797% complete)
Completed 373906 docs (20.000106979331594% complete)
Completed 560859 docs (30.00016046899739% complete)


KeyboardInterrupt: 

In [None]:
# Show the ten highest-scoring (term, score) pairs returned by bootstrap_compare.
print(candidates[:10])