summarize: Introduce Sentence and Word
These structs replace the old maps, while preserving more information:
we used to keep a *set* of words; now we're tracking every word.
jdkato committed Apr 8, 2017
1 parent ffb5133 commit 8cbd9a0
Showing 2 changed files with 33 additions and 18 deletions.
47 changes: 31 additions & 16 deletions summarize/summarize.go
@@ -10,6 +10,19 @@ import (
"github.com/jdkato/prose/tokenize"
)

// A Word represents a single word in a Document.
type Word struct {
Text string // the actual text
Syllables int // the number of syllables
}

// A Sentence represents a single sentence in a Document.
type Sentence struct {
Text string // the actual text
Length int // the number of words
Words []Word // the words in this sentence
}

// A Document represents a collection of text to be analyzed.
//
// A Document's calculations depend on its word and sentence tokenizers. You
@@ -24,15 +37,15 @@ import (
 // content of a Document (e.g., we should be able to build it incrementally).
 // Perhaps we should look into using a rope as our underlying data structure?
 type Document struct {
-	Content         string           // Actual text
-	NumCharacters   float64          // Number of Characters
-	NumComplexWords float64          // PolysylWords without common suffixes
-	NumPolysylWords float64          // Number of words with > 2 syllables
-	NumSentences    float64          // Number of sentences
-	NumSyllables    float64          // Number of syllables
-	NumWords        float64          // Number of words
-	Sentences       map[string]int   // {sentence: length}
-	Words           map[string][]int // {word: [frequency, syllables]}
+	Content         string         // Actual text
+	NumCharacters   float64        // Number of Characters
+	NumComplexWords float64        // PolysylWords without common suffixes
+	NumPolysylWords float64        // Number of words with > 2 syllables
+	NumSentences    float64        // Number of sentences
+	NumSyllables    float64        // Number of syllables
+	NumWords        float64        // Number of words
+	Sentences       []Sentence     // the Document's sentences
+	WordFrequency   map[string]int // [word]frequency
 
 	SentenceTokenizer tokenize.ProseTokenizer
 	WordTokenizer     tokenize.ProseTokenizer
@@ -65,19 +78,20 @@ func NewDocument(text string) *Document {
 // Initialize calculates the data necessary for computing readability and usage
 // statistics.
 func (d *Document) Initialize() {
-	d.Words = make(map[string][]int)
-	d.Sentences = make(map[string]int)
+	d.WordFrequency = make(map[string]int)
 	for _, s := range d.SentenceTokenizer.Tokenize(d.Content) {
 		wordCount := d.NumWords
 		d.NumSentences++
+		words := []Word{}
 		for _, word := range d.WordTokenizer.Tokenize(s) {
 			d.NumCharacters += countChars(word)
-			syllables := Syllables(word)
-			if _, found := d.Words[word]; found {
-				d.Words[word][0]++
+			if _, found := d.WordFrequency[word]; found {
+				d.WordFrequency[word]++
 			} else {
-				d.Words[word] = []int{1, syllables}
+				d.WordFrequency[word] = 1
 			}
+			syllables := Syllables(word)
+			words = append(words, Word{Text: word, Syllables: syllables})
 			d.NumSyllables += float64(syllables)
 			if syllables > 2 {
 				d.NumPolysylWords++
@@ -87,7 +101,8 @@ func (d *Document) Initialize() {
 		}
 		d.NumWords++
 	}
-	d.Sentences[s] = int(d.NumWords - wordCount)
+	d.Sentences = append(d.Sentences, Sentence{
+		Text: s, Length: int(d.NumWords - wordCount), Words: words})
 }
 }

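To make the commit message concrete, here is a minimal sketch of walking the new structures. It assumes NewDocument wires up the default tokenizers and calls Initialize, which happens outside this diff:

package main

import (
	"fmt"

	"github.com/jdkato/prose/summarize"
)

func main() {
	// Assumption: NewDocument sets up the default tokenizers and calls
	// Initialize; that code is not part of this diff.
	doc := summarize.NewDocument("It was a dark day. It was cold, too.")
	for _, s := range doc.Sentences {
		fmt.Printf("%q: %d words\n", s.Text, s.Length)
		for _, w := range s.Words {
			// Every occurrence is kept: a repeated word like "It" shows
			// up once per use, not once per unique spelling.
			fmt.Printf("  %s (%d syllables)\n", w.Text, w.Syllables)
		}
	}
	fmt.Println(doc.WordFrequency["It"]) // aggregate counts are still available
}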
4 changes: 2 additions & 2 deletions summarize/usage.go
@@ -3,8 +3,8 @@ package summarize
 // WordDensity returns a map of each word and its density.
 func (d *Document) WordDensity() map[string]float64 {
 	density := make(map[string]float64)
-	for word, stats := range d.Words {
-		density[word] = float64(stats[0]) / d.NumWords
+	for word, freq := range d.WordFrequency {
+		density[word] = float64(freq) / d.NumWords
 	}
 	return density
 }
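For completeness, a quick sketch of the updated WordDensity in use; the exact keys and counts depend on the word tokenizer, assumed here to yield five word tokens:

doc := summarize.NewDocument("the cat saw the dog")
for word, density := range doc.WordDensity() {
	fmt.Printf("%s: %.2f\n", word, density) // e.g. "the": 2 of 5 words = 0.40
}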
