Skip to content
This repository has been archived by the owner on Jan 11, 2023. It is now read-only.

Commit

Permalink
Add "htmlclean" using Gumbo parser
Browse files Browse the repository at this point in the history
  • Loading branch information
brendanlong committed Aug 29, 2018
1 parent a311deb commit 17e1813
Show file tree
Hide file tree
Showing 10 changed files with 137 additions and 2 deletions.
1 change: 1 addition & 0 deletions README.md
Expand Up @@ -21,6 +21,7 @@ For translators : https://hosted.weblate.org/projects/feedreader/
- `pkg-config`
- `libgirepository1.0-dev`
- `libgtk-3-dev (>= 3.22)`
- `libgumbo-dev`
- `libsoup2.4-dev`
- `libjson-glib-dev`
- `libwebkit2gtk-4.0-dev (>=2.20)`
Expand Down
1 change: 1 addition & 0 deletions debian/control
Expand Up @@ -20,6 +20,7 @@ Build-Depends: cmake (>= 2.8),
libgstreamer-plugins-base1.0-dev,
libgoa-1.0-dev,
libcurl-dev,
libgumbo-dev,
libpeas-dev
Standards-Version: 3.9.3

Expand Down
1 change: 1 addition & 0 deletions docker/Dockerfile
Expand Up @@ -8,6 +8,7 @@ RUN dnf -y install \
gstreamer1-devel \
gstreamer1-plugins-base-devel \
gtk3-devel \
gumbo-parser-devel \
json-glib-devel \
libcurl-devel \
libgee-devel \
Expand Down
90 changes: 90 additions & 0 deletions libraries/htmlclean/htmlclean.c
@@ -0,0 +1,90 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: jdtang@google.com (Jonathan Tang)
// Minimal changes to turn this into a C library by Brendan Long <self@brendanlong.com>
//
// Gets the cleantext of a page.
// See https://github.com/google/gumbo-parser/blob/master/examples/clean_text.cc
#include <unistd.h>
#include <string.h>

#include <stdio.h>
#include <stdlib.h>

#include "glib.h"
#include "gumbo.h"

/**
 * Recursively collects the visible text beneath a Gumbo node.
 *
 * Text nodes yield their content with leading and trailing whitespace
 * removed. Element nodes other than <script> and <style> yield the text of
 * their children joined with single spaces. Every other node type is ignored.
 *
 * @param node  Node to extract text from; must not be NULL.
 * @return A newly allocated string that the caller releases with g_free(),
 *         or NULL when the node contributes no text at all.
 */
char *cleantext(GumboNode *node)
{
	if (node->type == GUMBO_NODE_TEXT)
	{
		char *text = g_strdup(node->v.text.text);
		if (text == NULL)
		{
			return NULL;
		}
		text = g_strstrip(text);
		if (*text == '\0')
		{
			g_free(text);
			return NULL;
		}
		return text;
	}
	else if (node->type == GUMBO_NODE_ELEMENT &&
	         node->v.element.tag != GUMBO_TAG_SCRIPT &&
	         node->v.element.tag != GUMBO_TAG_STYLE)
	{
		GumboVector *children = &node->v.element.children;

		// g_new() aborts on allocation failure and guards the size
		// computation against overflow, so the result is never NULL here.
		// One extra slot is reserved for the NULL terminator.
		char **strs = g_new(char *, (gsize)children->length + 1);
		size_t num_nonempty = 0;
		for (unsigned int i = 0; i < children->length; ++i)
		{
			char *text = cleantext((GumboNode *)children->data[i]);
			if (text != NULL)
			{
				strs[num_nonempty] = text;
				++num_nonempty;
			}
		}

		// An element without any text must report NULL, not "": otherwise
		// the parent would treat it as non-empty and join it in, producing
		// stray leading/trailing or doubled separators.
		if (num_nonempty == 0)
		{
			g_free(strs);
			return NULL;
		}
		strs[num_nonempty] = NULL;

		char *output = g_strjoinv(" ", strs);
		// Releases every collected string as well as the array itself.
		g_strfreev(strs);
		return output;
	}
	else
	{
		return NULL;
	}
}

/**
 * Parses `input` as HTML and returns only its text content.
 *
 * A NULL input, or a document that contains no text, produces an empty
 * string. The returned string is newly allocated and never NULL; the caller
 * releases it with g_free().
 */
char *htmlclean_strip_html(char *input)
{
	// Nothing to parse: hand back an empty string straight away.
	if (input == NULL)
	{
		return g_strdup("");
	}

	GumboOutput *parsed = gumbo_parse(input);
	char *text = cleantext(parsed->root);
	// The extracted text is an independent copy, so the parse tree can go.
	gumbo_destroy_output(&kGumboDefaultOptions, parsed);

	return (text != NULL) ? text : g_strdup("");
}
7 changes: 7 additions & 0 deletions libraries/htmlclean/htmlclean.h
@@ -0,0 +1,7 @@
#pragma once

/**
 * Strips HTML from the input string and returns just the text.
 *
 * The argument is the HTML markup as a NUL-terminated string. Passing NULL
 * yields an empty string rather than NULL.
 *
 * The result is always a newly allocated, non-NULL string. It is allocated
 * through GLib, so it must be released with g_free() when you're done with
 * it.
 */
char *htmlclean_strip_html(char *);
8 changes: 8 additions & 0 deletions libraries/htmlclean/meson.build
@@ -0,0 +1,8 @@
# Builds the htmlclean helper (htmlclean.c) as a static library on top of
# GLib and Gumbo. `htmlclean_inc` exposes this directory as an include path
# for code that uses the library.
htmlclean_inc = include_directories('.')
htmlclean_lib = static_library(
'htmlclean',
[
'htmlclean.c'
],
dependencies: [ glib, gumbo ]
)
12 changes: 12 additions & 0 deletions meson.build
Expand Up @@ -44,6 +44,7 @@ goa = dependency('goa-1.0')
gstreamer = dependency('gstreamer-1.0')
gstreamer_pbutils = dependency('gstreamer-pbutils-1.0')
gtk = dependency('gtk+-3.0', version: '>=3.22')
gumbo = dependency('gumbo')
json_glib = dependency('json-glib-1.0')
libcurl = c_compiler.find_library('libcurl')
libnotify = dependency('libnotify')
Expand Down Expand Up @@ -85,13 +86,22 @@ constants = vcs_tag(
subdir('libraries/libgd')
subdir('libraries/libgtkimageview')
subdir('libraries/libIvy')
subdir('libraries/htmlclean')
subdir('libraries/WebExtension')
subdir('data')


# VAPI's
libgd_vapi = vala_compiler.find_library('gd-1.0', dirs: VAPI_DIR)
gtkimageview_vapi = vala_compiler.find_library('gtkimageview', dirs: VAPI_DIR)
htmlclean_vapi = vala_compiler.find_library('htmlclean', dirs: VAPI_DIR)

# htmlclean: bundles the static library, its include directory and the Vala
# binding into one dependency object that targets can list as `htmlclean`.
htmlclean = declare_dependency(
link_with: htmlclean_lib,
include_directories: htmlclean_inc,
dependencies: htmlclean_vapi
)

# libgd
libgd = declare_dependency(
Expand Down Expand Up @@ -239,6 +249,7 @@ feedreader_deps = [
posix,
libivy,
libgtkimageview,
htmlclean,
libgd
]

Expand Down Expand Up @@ -279,6 +290,7 @@ executable(
],
dependencies: [
gtkimageview_vapi,
htmlclean_vapi,
webkit2gtk
],
install: true
Expand Down
8 changes: 8 additions & 0 deletions org.gnome.FeedReader.json
Expand Up @@ -68,6 +68,14 @@
"sha256": "87bc4ef307604f1ce4f09f6e5c9996ef8d37ca5e0a3bf76f6b27d71844adb40c"
}]
},
{
"name": "gumbo",
"sources": [{
"type": "archive",
"url": "https://github.com/google/gumbo-parser/archive/v0.10.1.tar.gz",
"sha256": "28463053d44a5dfbc4b77bcf49c8cee119338ffa636cc17fc3378421d714efad"
}]
},
{
"name": "feedreader",
"buildsystem": "meson",
Expand Down
7 changes: 5 additions & 2 deletions src/Utils.vala
Expand Up @@ -107,11 +107,14 @@ public class FeedReader.Utils : GLib.Object {
return "NULL";
}

string output = old_string.make_valid().replace("\n"," ").strip();
string output = old_string;
if (remove_html)
{

output = Htmlclean.strip_html(output);
}
// Strip and replace chars after HTML cleaning because the HTML cleaner
// can potentially insert newlines, whitespace or invalid chars
output = output.make_valid().replace("\n"," ").strip();
return output;
}

Expand Down
4 changes: 4 additions & 0 deletions vapi/htmlclean.vapi
@@ -0,0 +1,4 @@
// Vala binding for the htmlclean C library; declarations come from htmlclean.h.
[CCode(cheader_filename = "htmlclean.h")]
namespace Htmlclean {
// Strips the HTML markup from `text` and returns only its text content.
// NOTE(review): presumably resolves to the C symbol htmlclean_strip_html via
// Vala's default namespace-based naming; confirm against the generated C.
public static string strip_html(string text);
}

0 comments on commit 17e1813

Please sign in to comment.