# Create 2D representation of trees based on a given distance

## Database parameters

In [1]:
# Name of the database holding the experiment data, and the user to connect as.
# Both are plugged into Django's DATABASES setting (MySQL backend) in the setup cell.
DB_NAME = 'spreadr_exp_1'
DB_USER = 'spreadr_analysis'

Boilerplate database setup

In [2]:
import os, sys

# Make the `spreadr` project directory importable, without stacking up duplicate
# sys.path entries when this cell is re-run in the same kernel.
spreadr_path = os.path.join(os.path.abspath(os.curdir), 'spreadr')
if spreadr_path not in sys.path:
    sys.path.insert(1, spreadr_path)

import django
from django.conf import settings
from spreadr import settings as base_spreadr_settings

# Start from the project's own settings, keeping only real settings (upper-case
# names). The module namespace also holds dunders and imported modules, and recent
# Django versions refuse non-upper-case names in settings.configure().
spreadr_settings = {name: value
                    for name, value in vars(base_spreadr_settings).items()
                    if name.isupper()}

# Point the ORM at the analysis database instead of the project's default one.
spreadr_settings['DATABASES'] = {
    'default': {
        'ENGINE': 'django.db.backends.mysql',
        'NAME': DB_NAME,
        'USER': DB_USER
    }
}

# settings.configure() raises RuntimeError if settings are already configured, so
# guard it to keep this cell re-runnable without restarting the kernel.
if not settings.configured:
    settings.configure(**spreadr_settings)
django.setup()

Imports for the rest of the analysis

In [3]:
from nltk.corpus import stopwords as nltk_stopwords
from nltk.metrics import jaccard_distance, edit_distance
from nltk.stem.snowball import EnglishStemmer as SnowballStemmer
from nltk.tokenize import word_tokenize as nltk_word_tokenize

from gists.models import Sentence, Tree

## 1 Data preprocessing tools

Make it easy to select training/experiment/game sentences and trees

In [4]:
# For sentences: `Sentence.objects.<bucket>` is the queryset restricted to that bucket.
# NOTE(review): the properties are attached to the manager's *class*, so every other
# manager of that same class gets them too -- confirm this is intended.
sentence_manager_class = type(Sentence.objects)
for _bucket in ('training', 'experiment', 'game'):
    setattr(sentence_manager_class, _bucket,
            property(lambda self, _bucket=_bucket:
                     self.get_queryset().filter(bucket__exact=_bucket)))

# Test: 6 training sentences, and the three bucket counts add up to the total
assert Sentence.objects.training.count() == 6
assert (Sentence.objects.experiment.count() + Sentence.objects.game.count() + 6
        == Sentence.objects.count())

# For trees: `Tree.objects.<bucket>` is the queryset of trees whose root sentence
# is in that bucket.
# NOTE(review): the properties are attached to the manager's *class*; if another
# model's manager is of that same class, its same-named shortcuts are replaced by
# these -- confirm against the model definitions.
tree_manager_class = type(Tree.objects)
for _bucket in ('training', 'experiment', 'game'):
    setattr(tree_manager_class, _bucket,
            property(lambda self, _bucket=_bucket:
                     self.get_queryset().filter(root__bucket__exact=_bucket)))

# Test: 6 training trees, and the three bucket counts add up to the total
assert Tree.objects.training.count() == 6
assert (Tree.objects.experiment.count() + Tree.objects.game.count() + 6
        == Tree.objects.count())

For each sentence text, we want the content words. So:
* tokenize
* set to lowercase
* remove punctuation
* remove words $\leq$ 2 characters
* remove stopwords
* stem

and set the result as `content_words` on each `Sentence`.

In [5]:
def _filter(words, exclude_list):
    return filter(lambda w: w not in exclude_list, words)

def filter_punctuation(words):
    """Drop the tokens that are a lone punctuation mark (one of , . ; ! ?)."""
    punctuation_tokens = [',', '.', ';', '!', '?']
    return _filter(words, punctuation_tokens)

# English stopwords from the NLTK corpus, plus "n't": it is missing from the
# corpus, yet it appears with tokenization
stopwords = set(nltk_stopwords.words('english')) | {"n't"}

def filter_stopwords(words):
    """Drop the tokens that appear in the module-level `stopwords` set."""
    return _filter(words, exclude_list=stopwords)

def filter_lowercase(words):
    """Lazily yield each token of `words` converted to lowercase."""
    return (word.lower() for word in words)

def filter_length(words):
    """Lazily yield the tokens of `words` that are longer than 2 characters."""
    return (word for word in words if len(word) > 2)

# Single shared Snowball stemmer for English, configured to ignore stopwords
stemmer = SnowballStemmer(ignore_stopwords=True)

def filter_stem(words):
    """Lazily yield the stem of each token of `words`, using the shared `stemmer`."""
    return (stemmer.stem(word) for word in words)

# Content-word pipeline, applied in this order by get_content_words(): the first
# step receives the raw sentence text, each later step receives the previous
# step's output.
filters = [nltk_word_tokenize,
           filter_lowercase,
           filter_punctuation,
           filter_length,
           filter_stopwords,
           filter_stem]

def get_content_words(self):
    """Return the content words of this sentence as a list.

    Feeds `self.text` through every step of the module-level `filters` list, in
    order, then materialises the result. Nothing is cached: each call recomputes.
    """
    words = self.text
    for step in filters:
        words = step(words)
    return list(words)

# Expose the pipeline as a read-only `content_words` attribute on every Sentence
Sentence.content_words = property(get_content_words)

# Test against the known content words of the first two sentences
expected_content_words = (
    (1, ['young', 'boy', 'sudden', 'hit', 'littl', 'girl']),
    (2, ['forget', 'leav', 'door', 'open', 'leav', 'offic']),
)
for sentence_id, expected in expected_content_words:
    assert Sentence.objects.get(id=sentence_id).content_words == expected

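Measure distances between sentences: an *ordered* distance (edit distance between the two content-word lists, divided by the length of the longer list) and an *unordered* distance (Jaccard distance between the two sets of content words).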

In [6]:
def ordered_distance(self, sentence):
    """Order-sensitive distance between the content words of two sentences.

    This is the edit distance between the two content-word lists (whole words
    are the edited units), divided by the length of the longer list.

    Returns 0.0 when neither sentence has any content word: two empty lists are
    identical, and the normalisation would otherwise divide by zero.
    """
    self_content_words = self.content_words
    sentence_content_words = sentence.content_words
    longest = max(len(self_content_words), len(sentence_content_words))
    if longest == 0:
        # Nothing to compare on either side: identical by convention
        return 0.0
    return edit_distance(self_content_words, sentence_content_words) / longest

def unordered_distance(self, sentence):
    """Order-insensitive distance between the content words of two sentences.

    This is the Jaccard distance between the two sets of content words, so
    word order and repetitions are ignored.

    Returns 0.0 when neither sentence has any content word: the two sets are
    identical, and their union is empty, which the Jaccard ratio cannot
    normalise by.
    """
    self_words = set(self.content_words)
    sentence_words = set(sentence.content_words)
    if not self_words and not sentence_words:
        # Both sets empty: identical by convention, and avoids a division by zero
        return 0.0
    return jaccard_distance(self_words, sentence_words)

# Make both distances available as methods on every Sentence
Sentence.ordered_distance = ordered_distance
Sentence.unordered_distance = unordered_distance

# Testing this is hard (we don't have predictable data for it), so we test values for 0 and 1 only:
# a sentence compared with itself, and sentences 1 and 2 compared with each other
sentence_1 = Sentence.objects.get(id=1)
sentence_2 = Sentence.objects.get(id=2)
assert sentence_1.ordered_distance(sentence_1) == 0.0
assert sentence_1.unordered_distance(sentence_1) == 0.0
assert sentence_1.ordered_distance(sentence_2) == 1.0
assert sentence_1.unordered_distance(sentence_2) == 1.0