# Characteristic Words

A notebook to explore the distribution of characterising / high frequency / function words in different corpora.

In [1]:
import spacy
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
import time
from collections import defaultdict
import operator,math
from gensim.models import Word2Vec
import math

import nlp_tools

Functions to find the high frequency (ranked higher than k) distribution for a corpus and display the very highest frequency words (ranked higher than cutoff)

In [2]:
#For a given set of corpora, find the frequency distribution of the k highest frequency words
#Output total size of corpus and sorted list of term, frequency pairs

def find_hfw_dist(corpora,k=100000):
    #add worddicts for individual corpora
    #sort and output highest frequency words
    #visualise
    
    sumdict={}
    corpussize=0
    for acorpus in corpora:
        for(key,value) in acorpus.allworddict.items():
            sumdict[key.lower()]=sumdict.get(key.lower(),0)+value
            corpussize+=value
      
    print("Size of corpus is {}".format(corpussize))
    candidates=sorted(sumdict.items(),key=operator.itemgetter(1),reverse=True)
    #print(candidates[:50])
    #print(len(sumdict))
    #print(sumdict)
    return corpussize,candidates[:k]
        
  

In [3]:
#display the most frequent words in 1 or 2 high frequency word distributions

def display(hfw,cutoff=10,words=[],hfb=None,leg=None):
    width=0.7
    corpussize=hfw[0]
    if words==[]:
        todisplay=hfw[1][:cutoff]
    else:
        todisplay=[(x,y) for (x,y) in hfw[1] if x in words]
        cutoff=len(words)
    barvalues=sorted(todisplay,key=operator.itemgetter(0),reverse=False)
    #print(barvalues)
    xs,ys=[*zip(*barvalues)]
    ps=[y/corpussize for y in ys]
    
    toplot=[ps]
    if hfb!=None:
        width=0.35
        corpussizeb=hfb[0]
        if words==[]:
            todisplayb=hfb[1][:cutoff]
        else:
            todisplayb=[(x,y) for (x,y) in hfb[1] if x in words]
        barb=sorted(todisplayb,key=operator.itemgetter(0),reverse=False)
        
        xb,yb=[*zip(*barb)]
        pb=[y/corpussizeb for y in yb]
        
        toplot.append(pb)
        
    
    N=len(xs)
    ind=np.arange(N)
    fig,ax=plt.subplots(figsize=(cutoff,cutoff/4))
    
    rects1=ax.bar(ind,ps,width,color='r')
    if hfb!=None:
        rects2=ax.bar(ind+width,pb,width,color='b')
        if leg==None:
            ax.legend((rects1,rects2),("Dist1","Dist2"))
        else:
            ax.legend((rects1,rects2),leg)
    ax.set_xticks(ind)
    ax.set_xticklabels(xs)
    ax.set_xlabel('High Frequency Words')
    ax.set_ylabel('Probability')
    
    
    return xs
    


# Differences Between Female and Male Non-Legal Speech in 1800-1820

We are interested in the period 1800-1820.  We are interested in non-legal speech (witness and defendants) in 'theft' trials (which make up the bulk of the trials).  These are split into female and male (by speaker)

In [4]:
p=100
female_nonlegal=['corpus_1800_1820_theft_f_def','corpus_1800_1820_theft_f_wv']
male_nonlegal=['corpus_1800_1820_theft_m_def','corpus_1800_1820_theft_m_wv']

female_corpus=nlp_tools.corpus(female_nonlegal,nlp,prop=p,ner=False)
male_corpus=nlp_tools.corpus(male_nonlegal,nlp,prop=p,ner=False)

NameError: name 'nlp' is not defined

Lets look at the top 20 most frequent words across the whole corpus (i.e., males and males taken together).

In [None]:
hfw_theft=find_hfw_dist([male_corpus,female_corpus],k=100000)
function_words_theft=display(hfw_theft,cutoff=20)

Now lets compare the distributions of these words for males and females.

In [None]:
hff_theft=find_hfw_dist([female_corpus])
hfm_theft=find_hfw_dist([male_corpus])
some_words=display(hff_theft,words=function_words_theft,hfb=hfm_theft,leg=['Female speakers','Male speakers'])

There are minimal differences between the distributions. 'of', 'to', 'him','he' and 'the' appear more often for male speakers.  Some punctuation and 'my' appears more often for female speakers.

We want to be able to find the characteristic words for a corpus (i.e., words which occur more often than one would expect by chance).  We can do this using PMI and/or likelihood lift ratio - but it is conventional in work on comparing corpora to use log-likelihood-ratio (LLR (Dunning, 1993)) as this is less biased towards low frequency events than PMI and doesn't require any parameters to be set.

In [None]:
def makedict(alist):
    adict={}
    for (key,value) in alist:
        adict[key]=adict.get(key,0)+value
    return adict
        

def pmi(wordfreq,refwordfreq,corpussize,refcorpussize):
    if wordfreq*refcorpussize*refwordfreq*corpussize==0:
        score=0
#        print(wordfreq,refwordfreq,corpussize,refcorpussize)
    else:
        score=np.log((wordfreq*refcorpussize)/(refwordfreq*corpussize))
    return score


def rev_pmi(wordfreq,refwordfreq,corpussize,refcorpussize):
    return pmi(refwordfreq-wordfreq, refwordfreq,refcorpussize-corpussize,refcorpussize)
        
def llr(wordfreq,refwordfreq,corpussize,refcorpussize):
    #print(wordfreq,refwordfreq,corpussize,refcorpussize)
    mypmi=pmi(wordfreq,refwordfreq,corpussize,refcorpussize)
    myrevpmi=rev_pmi(wordfreq,refwordfreq,corpussize,refcorpussize)
    #myrevpmi2=rev_pmi2(wordfreq,refwordfreq,corpussize,refcorpussize)
    #print(mypmi,myrevpmi,myrevpmi2)
    llr_score=2*(wordfreq*mypmi+(refwordfreq-wordfreq)*myrevpmi)
    if pmi(wordfreq,refwordfreq,corpussize,refcorpussize)<0:
        return -llr_score
    else:
        return llr_score
    
def likelihoodlift(wordfreq,refwordfreq,corpussize,refcorpussize,alpha):
    beta=0
    if alpha==1:
        return math.log(wordfreq/corpussize)
    elif alpha==0:
        return pmi(wordfreq,refwordfreq,corpussize,refcorpussize)
    else:
        return(alpha*math.log(beta+(wordfreq/corpussize))+(1-alpha)*pmi(wordfreq,refwordfreq,corpussize,refcorpussize))

def mysurprise(wf,rwf,cs,rcs,measure,params):
    if measure=='pmi':
        return pmi(wf,rwf,cs,rcs)
    elif measure=='llr':
        return llr(wf,rwf,cs,rcs)
    elif measure =='likelihoodlift':
        return likelihoodlift(wf,rwf,cs,rcs,params.get('alpha',0.5))
    else:
        print("Unknown measure of surprise")

def improved_compute_surprises(corpusA,corpusB,measure,params={}):
    (corpusAsize,wordlistA)=corpusA
    (corpusBsize,wordlistB)=corpusB
    if 'threshold' in params.keys():
        threshold=params['threshold']
    else:
        threshold=len(wordlistA)
    #dictA=makedict(wordlistA)
    dictB=makedict(wordlistB)
    
    scores=[]
   # print(wordlistA[:threshold])
    for(term,freq) in wordlistA[:threshold]:
        scores.append((term,mysurprise(freq,dictB.get(term,freq+1),corpusAsize,corpusBsize,measure,params)))
    sortedscores=sorted(scores,key=operator.itemgetter(1),reverse=True)
    k=20
    print("Top {} terms are ".format(k))
    print(sortedscores[:k])
    rank=0
    if measure=="llr":
        for (term,score) in sortedscores:
            if score>10.828:
                rank+=1
            else:
                break
        print("{} significantly characterising terms".format(rank))
    return(sortedscores[:rank])

We can use the LLR to compute the most characteristic words for males and for females.  Any with a score over 10.828 are statistically significant at the 0.1% level.  We display the top 20.

In [None]:
malewords=improved_compute_surprises(hfm_theft,hfw_theft,'llr')

In [None]:
femalewords=improved_compute_surprises(hff_theft,hfw_theft,'llr',params={})

For comparison, we can use the likelihood-lift ratio - however now we need to set alpha to decide the weight of likelihood and lift respectively.

In [None]:
improved_compute_surprises(hfm_theft,hfw_theft,'likelihoodlift',params={'alpha':0.01})

In [None]:
improved_compute_surprises(hff_theft,hfw_theft,'likelihoodlift',params={'alpha':0.1})


## Different Periods Corresponding to Different Scribes


We also have 3 subperiods within the 1800-1820 period which correspond to 3 different scribes / publishers.

In [None]:
p1_nonlegal=['corpus_theft_def_1800_1805','corpus_theft_wv_1800_1805']
p2_nonlegal=['corpus_theft_def_1806_1815','corpus_theft_wv_1806_1815']
p3_nonlegal=['corpus_theft_def_1816_1820','corpus_theft_wv_1816_1820']

p1_corpus=nlp_tools.corpus(p1_nonlegal,nlp,prop=p,ner=False)
p2_corpus=nlp_tools.corpus(p2_nonlegal,nlp,prop=p,ner=False)
p3_corpus=nlp_tools.corpus(p3_nonlegal,nlp,prop=p,ner=False)



In [None]:
theft_corpus=nlp_tools.corpus(p1_nonlegal+p2_nonlegal+p3_nonlegal,nlp,prop=p,ner=False)

In [None]:
def autolabel(rects,ax):
    """
    Attach a text label above each bar displaying its height
    """
    
    for rect in rects:
        height = rect.get_height()

        ax.text(rect.get_x() + rect.get_width()/2., height*1.1,
                '%1.1f' % height,
                ha='center', va='bottom')

def display_list(hfw_list,cutoff=10,words=[],leg=None,title=None):
    width=0.7/len(hfw_list)
    toplot=[]
    for hfw in hfw_list:
        corpussize=hfw[0]
        if words==[]:
            todisplay=hfw[1][:cutoff]
        else:
            todisplay=[(x,y) for (x,y) in hfw[1] if x in words]
            cutoff=len(words)
        barvalues=sorted(todisplay,key=operator.itemgetter(0),reverse=False)
        #print(barvalues)
        xs,ys=[*zip(*barvalues)]
        ps=[y*100/corpussize for y in ys]
    
        toplot.append(ps)
        
    N=cutoff
    ind=np.arange(N)
    fig,ax=plt.subplots(figsize=(2*cutoff,cutoff/2))
    rectset=[]
    colors=['r','b','y','g']
    for i,ps in enumerate(toplot):
        rectset.append(ax.bar(ind+i*width,ps,width,color=colors[i]))
    
    if leg!=None:
        ax.legend(rectset,leg)
    ax.set_xticks(ind)
    ax.set_xticklabels(xs)
    ax.set_xlabel('High Frequency Words')
    ax.set_ylabel('Probability')
    ax.set_ylim(0,10)
    for rects in rectset:
        autolabel(rects,ax)
    if title!=None:
        ax.set_title(title)
    
    return xs
    


We can look at the high frequency words across the whole corpus (which approximates the male+female combination in the last section - this is slightly larger, due presumably to some utterances not being labelled as either female or male).

In [None]:
k=100000
hfperiods_theft=find_hfw_dist([p1_corpus,p2_corpus,p3_corpus],k=k)
function_words_theft=display_list([hfperiods_theft],cutoff=20)

In [None]:

hfp1_theft=find_hfw_dist([p1_corpus],k=k)
hfp2_theft=find_hfw_dist([p2_corpus],k=k)
hfp3_theft=find_hfw_dist([p3_corpus],k=k)
some_words=display_list([hfp1_theft,hfp2_theft,hfp3_theft],words=function_words_theft,leg=['1800-1805','1806-1815','1816-1820'],title="High Frequency Word Distributions for Non_legal Speakers in Theft Cases Split by Transcription Period")

Comparing the different corpora, we see that there is a much greater use of ; in the first period and correspondingly, a greater use of . in the later periods.  We also see an increasing use of 'the' throughout the 3 periods and a corresponding decreasing use of 'a'.

In [None]:
p1words=improved_compute_surprises(hfp1_theft,hfperiods_theft,'llr')

In [None]:
p2words=improved_compute_surprises(hfp2_theft,hfperiods_theft,'llr')

In [None]:
p3words=improved_compute_surprises(hfp3_theft,hfperiods_theft,'llr')

## Analysis of Characterising Terms

Having derived a set of characterising terms for a subcorpus, we want to analyse them.  Basic analysis includes average frequency and POS tag distribution.  Further analysis includes clustering semantically.



In [None]:
def autolabel(rects,ax):
    """
    Attach a text label above each bar displaying its height
    """
    
    for rect in rects:
        height = rect.get_height()

        ax.text(rect.get_x() + rect.get_width()/2., height*1.1,
                '%1.1f' % height,
                ha='center', va='bottom')
        
def improved_display_list(xvalues,yvalueslist,labels={}):
    width=0.7/len(yvalueslist)      
    N=len(xvalues)
    ind=np.arange(N)
    fig,ax=plt.subplots(figsize=(20,12))
    rectset=[]
    colors=['r','b','y','g']
    for i,ps in enumerate(yvalueslist):
        rectset.append(ax.bar(ind+i*width,ps,width,color=colors[i]))
    
    leg=labels.get('leg',None)
    title=labels.get('title',None)
    xlabel=labels.get('xlabel','Year')
    ylabel=labels.get('ylabel','Probability')
    ylim=labels.get('ylim',1)
    if leg!=None:
        ax.legend(rectset,leg)
    ax.set_xticks(ind)
    ax.set_xticklabels(xvalues)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_ylim(0,ylim)
    for rects in rectset:
        autolabel(rects,ax)
    if title!=None:
        ax.set_title(title)
    plt.show()

In [None]:
#We have a corpus e.g., male_corpus and a set of characterising terms for that corpus e.g., malewords
def find_pos(term,corpus):
    pospos=['NOUN','VERB','ADJ','ADV','PUNCT']
    counts={}
    for apos in pospos:
        counts[apos]=corpus.wordposdict.get((term,apos),0)
    
    total=sum(counts.values())
    
    gt=corpus.allworddict.get(term,0)
    counts['OTHER']=gt-total
    #print(term,gt,counts)
    if gt>0:
        poses=[(tag,weight/gt) for (tag,weight) in counts.items()]
    else:
        poses=[]
    #print(term,poses)
    return poses
    
def analyse(termset,corpus):
    freqs=[]
    somefreqs=[]
    posdict={}
    someposdict={}
    threshold=20
    for i,(term,relevance) in enumerate(termset):
        freq=corpus.allworddict[term]
        freqs.append(freq)
        if i<threshold:
            somefreqs.append(freq)
        poses=find_pos(term,corpus)
        for mypos,weight in poses:
            posdict[mypos]=posdict.get(mypos,0)+weight
            if i<threshold:
                someposdict[mypos]=someposdict.get(mypos,0)+weight
            
        
    freqarray=np.array(freqs)    
    meanfreq=np.mean(freqarray)
    sdfreq=np.std(freqarray)
    meanprob=meanfreq/corpus.wordtotal
    sdprob=sdfreq/corpus.wordtotal
    print("Mean frequency is {}, sd is {}".format(meanfreq,sdfreq))
    print("Mean probability is {}, sd is {}".format(meanprob,sdprob))
    somefreqarray=np.array(somefreqs)
    meansomefreq=np.mean(somefreqarray)
    sdsomefreq=np.std(somefreqarray)
    meansomeprob=meansomefreq/corpus.wordtotal
    sdsomeprob=sdsomefreq/corpus.wordtotal
    print("For top {} words, mean freq is {}, sd is {}".format(threshold,meansomefreq,sdsomefreq))
    print("For top {} words, mean prob is {}, sd is {}".format(threshold,meansomeprob,sdsomeprob))
    #print(posdict)
    xvalues=posdict.keys()
    totaly=sum(posdict.values())
    totalz=sum(someposdict.values())
    allvalues=[]
    somevalues=[]
    for x in xvalues:
        allvalues.append(posdict.get(x,0))
        somevalues.append(someposdict.get(x,0))
    yvalues=[[100*y/totaly for y in allvalues],[100*z/totalz for z in somevalues]]
    labels={'title':'Distribution of POS in Characterising Terms','xlabel':'Part of Speech','ylabel':'Proportion','leg':['Whole Set',"Top {}-restricted Set".format(threshold)],'ylim':100}
    improved_display_list(xvalues,yvalues,labels)

In [None]:
analyse(malewords,theft_corpus)

In [None]:
analyse(femalewords,theft_corpus)

In [None]:
analyse(p1words,theft_corpus)

In [None]:
analyse(p2words,theft_corpus)

In [None]:
analyse(p3words,theft_corpus)

We can see that a much higher percentage of the characteristic words in the period corpora are punctuation and function words.  In the male/female splits, a higher percentage of the characteristic words are content words.  Looking at the most characterising words, it seems that there may be more characteristic nouns for females and more characteristic verbs for males but this would need further exploration.

## Word2Vec clustering


In [None]:
#from gensim.models import Word2Vec

#parameters
size=300
min_count=10
window=1
sg=1 #1=use skip-gram, otherwise=cbow

In [None]:
start=time.time()
theft_model=Word2Vec(theft_corpus.sentences,min_count=min_count,window=window,size=size,sg=sg,workers=4)
vocab=list(theft_model.wv.vocab.keys())
print("Word2Vec model built with example vocabulary:")
print(str(vocab[0:10]))
end=time.time()
print("Time taken: {}s".format(str(end-start)))

In [None]:
theft_model.wv.most_similar(['.'])

In [None]:
theft_model.wv.most_similar(['prisoner'])

In [None]:
theft_model.wv.most_similar(['warehouse'])

In [None]:
theft_model.wv.most_similar(['husband'])

In [None]:
def nearest_neighbours(wordset,w2vmodel):
    threshold=20
    found=0
    for i,(term,score) in enumerate(wordset):
        try:
            neighbours=w2vmodel.wv.most_similar([term])
            found+=1
            if i<threshold:
                print(term,neighbours)
        except:
            print("{} not in vocab".format(term))
    
    oov=100-(found*100/len(wordset))
    print("Out of vocabulary: {}".format(oov))
    
    

In [None]:
nearest_neighbours(femalewords,theft_model)

In [None]:
nearest_neighbours(malewords,theft_model)

In [None]:
nearest_neighbours(p1words,theft_model)

In [None]:
nearest_neighbours(p2words,theft_model)

In [None]:
nearest_neighbours(p3words,theft_model)

In [None]:
theft_model.wv.similarity('man','woman')

In [None]:
def make_matrix(wordset,model,threshold=0.5):
    
    matrix=[]
    
    for (termA,_score) in wordset:
        row=[]
        for(termB,_score) in wordset:
            try:
                sim=model.wv.similarity(termA,termB)
                if sim<threshold:
                    sim=0
            except:
                sim=0
            row.append(sim)
            
        matrix.append(row)
    return matrix

In [None]:
female_matrix=make_matrix(femalewords,theft_model)

In [None]:
import csv

punctdict={"\n":"_NEWLINE",";":"_SEMICOLON",":":"_COLON","\"":"_QUOTE","'s":"_GEN","-":"_HYPHEN","(":"_LEFTBRACKET",")":"_RIGHTBRACKET",",":"_COMMA",".":"_FULLSTOP","..":"_DOTDOT"}

def clean(term):
    
    #remove punctuation which will confuse Gephi
    cleanterm=punctdict.get(term,term)
    return cleanterm

def make_csv(wordset,model,filename):
    matrix=make_matrix(wordset,model)
    terms=[clean(term) for (term,score) in wordset]
    
    #with open(filename,'w') as csvfile:
    #    csvwriter=csv.writer(csvfile,dialect='excel')
    #    headings=['']+terms
        #print(headings)
    #    csvwriter.writerow(headings)
    #    for term,row in zip(terms,matrix):
    #        csvwriter.writerow([term]+row)
     
    with open(filename,'w') as csvfile:
        line=""
        for term in terms:
            line+=';'+term
        line+='\n'
        
        csvfile.write(line)
        #print(line)
        for term,row in zip(terms,matrix):
            line=term
            #print(row)
            for item in row:
                line+=';'+str(item)
                
            line+='\n'
            
            csvfile.write(line)
            #print(line) 
    

In [None]:
make_csv(femalewords,theft_model,'matrix_female.csv')

In [None]:
make_csv(malewords,theft_model,'matrix_male.csv')

In [None]:
make_csv(p1words,theft_model,'matrix_p1.csv')

In [None]:
make_csv(p2words,theft_model,'matrix_p2.csv')

In [None]:
make_csv(p3words,theft_model,'matrix_p3.csv')

In [None]:
print(malewords)

In [None]:
theft_lexicon=makedict(hfperiods_theft[1])
p1theft_lexicon=makedict(hfp1_theft[1])

In [None]:
def lookup(term,d1,d2):
    lex1=makedict(d1[1])
    lex2=makedict(d2[1])
    print('{}:{},{}'.format(term,lex1[term],lex2[term]))

In [None]:
lookup('fork',hfperiods_theft,hfp1_theft)

In [None]:
lookup('drawer',hfw_theft,hff_theft)