Skip to content

Commit

Permalink
Merge branch 'b-tidyverse#2441-mojibake-error' into master-nosquash
Browse files Browse the repository at this point in the history
  • Loading branch information
krlmlr committed Mar 21, 2017
2 parents b9156e3 + 0663ce7 commit dfe5add
Show file tree
Hide file tree
Showing 32 changed files with 124 additions and 97 deletions.
3 changes: 3 additions & 0 deletions NEWS.md
@@ -1,5 +1,8 @@
# dplyr 0.5.0.9000

* Error messages and explanations of data frame inequality are now encoded in
UTF-8, also on Windows (#2441).

* Breaking change: `xxx_join.tbl_df()` by default treats all `NA` values as
different from each other (and from any other value), so that they never
match. This corresponds to the behavior of joins for database sources,
Expand Down
6 changes: 3 additions & 3 deletions inst/include/dplyr/Collecter.h
Expand Up @@ -293,7 +293,7 @@ namespace dplyr {
}

std::string describe() const {
return collapse<STRSXP>(types);
return collapse_utf8<STRSXP>(types);
}

private:
Expand Down Expand Up @@ -333,7 +333,7 @@ namespace dplyr {
}

std::string describe() const {
return collapse<STRSXP>(get_time_classes());
return collapse_utf8<STRSXP>(get_time_classes());
}

private:
Expand Down Expand Up @@ -388,7 +388,7 @@ namespace dplyr {
}

std::string describe() const {
return collapse<STRSXP>(types);
return collapse_utf8<STRSXP>(types);
}

private:
Expand Down
2 changes: 1 addition & 1 deletion inst/include/dplyr/DataFrameColumnSubsetVisitor.h
Expand Up @@ -41,7 +41,7 @@ namespace dplyr {
return "data.frame";
}

inline bool is_compatible(SubsetVectorVisitor* other, std::stringstream&, const std::string&) const {
inline bool is_compatible(SubsetVectorVisitor* other, std::stringstream&, const SymbolString&) const {
return is_same_typeid(other);
}

Expand Down
20 changes: 10 additions & 10 deletions inst/include/dplyr/DataFrameJoinVisitors.h
Expand Up @@ -18,22 +18,22 @@ namespace dplyr {
typedef JoinVisitor visitor_type;

DataFrameJoinVisitors(
const Rcpp::DataFrame& left_,
const Rcpp::DataFrame& right_,
Rcpp::CharacterVector names_left,
Rcpp::CharacterVector names_right,
const DataFrame& left_,
const DataFrame& right_,
const SymbolVector& names_left,
const SymbolVector& names_right,
bool warn_,
bool na_match
);

inline JoinVisitor* get(int k) const {
return visitors[k];
}
inline JoinVisitor* get(String name) const {
inline JoinVisitor* get(const SymbolString& name) const {
for (int i=0; i<nvisitors; i++) {
if (name == visitor_names_left[i]) return get(i);
}
stop("visitor not found for name '%s' ", name.get_cstring());
stop("visitor not found for name '%s' ", name.get_utf8_cstring());
}
inline int size() const {
return nvisitors;
Expand All @@ -53,18 +53,18 @@ namespace dplyr {
return (SEXP)out;
}

const CharacterVector& left_names() const {
const SymbolVector& left_names() const {
return visitor_names_left;
}
const CharacterVector& right_names() const {
const SymbolVector& right_names() const {
return visitor_names_right;
}

private:
const DataFrame& left;
const DataFrame& right;
CharacterVector visitor_names_left;
CharacterVector visitor_names_right;
SymbolVector visitor_names_left;
SymbolVector visitor_names_right;

int nvisitors;
pointer_vector<JoinVisitor> visitors;
Expand Down
2 changes: 1 addition & 1 deletion inst/include/dplyr/DataFrameSubsetVisitors.h
Expand Up @@ -48,7 +48,7 @@ namespace dplyr {

int pos = indx[i];
if (pos == NA_INTEGER) {
stop("unknown column '%s' ", names[i].get_cstring());
stop("unknown column '%s' ", names[i].get_utf8_cstring());
}

SubsetVectorVisitor* v = subset_visitor(data[pos - 1]);
Expand Down
2 changes: 1 addition & 1 deletion inst/include/dplyr/Gatherer.h
Expand Up @@ -97,7 +97,7 @@ namespace dplyr {
} else {
stop(
"Can not automatically convert from %s to %s in column \"%s\".",
coll->describe(), get_single_class(subset), name.get_cstring()
coll->describe(), get_single_class(subset), name.get_utf8_cstring()
);
}
}
Expand Down
2 changes: 1 addition & 1 deletion inst/include/dplyr/JoinVisitor.h
Expand Up @@ -19,7 +19,7 @@ namespace dplyr {

};

JoinVisitor* join_visitor(SEXP left, SEXP right, const std::string& left_name, const std::string& right_name, bool warn, bool accept_na_match = true);
JoinVisitor* join_visitor(SEXP left, SEXP right, const SymbolString& left_name, const SymbolString& right_name, bool warn, bool accept_na_match = true);

}

Expand Down
2 changes: 1 addition & 1 deletion inst/include/dplyr/MatrixColumnSubsetVectorVisitor.h
Expand Up @@ -68,7 +68,7 @@ namespace dplyr {
return "matrix";
}

inline bool is_compatible(SubsetVectorVisitor* other, std::stringstream&, const std::string&) const {
inline bool is_compatible(SubsetVectorVisitor* other, std::stringstream&, const SymbolString&) const {
return is_same_typeid(other);
}

Expand Down
4 changes: 2 additions & 2 deletions inst/include/dplyr/SubsetVectorVisitor.h
Expand Up @@ -40,11 +40,11 @@ namespace dplyr {
return typeid(*other) == typeid(*this);
}

virtual bool is_same_type(SubsetVectorVisitor* other, std::stringstream&, const std::string&) const {
virtual bool is_same_type(SubsetVectorVisitor* other, std::stringstream&, const SymbolString&) const {
return is_same_typeid(other);
}

virtual bool is_compatible(SubsetVectorVisitor* other, std::stringstream&, const std::string&) const = 0;
virtual bool is_compatible(SubsetVectorVisitor* other, std::stringstream&, const SymbolString&) const = 0;

};

Expand Down
23 changes: 13 additions & 10 deletions inst/include/dplyr/SubsetVectorVisitorImpl.h
Expand Up @@ -71,7 +71,7 @@ namespace dplyr {
return vec.size();
}

inline bool is_compatible(SubsetVectorVisitor* other, std::stringstream&, const std::string&) const {
inline bool is_compatible(SubsetVectorVisitor* other, std::stringstream&, const SymbolString&) const {
return is_same_typeid(other);
}

Expand Down Expand Up @@ -141,23 +141,23 @@ namespace dplyr {

inline std::string get_r_type() const {
CharacterVector classes = get_class(Parent::vec);
return collapse(classes);
return collapse_utf8(classes);
}

inline bool is_same_type(SubsetVectorVisitor* other, std::stringstream& ss, const std::string& name) const {
inline bool is_same_type(SubsetVectorVisitor* other, std::stringstream& ss, const SymbolString& name) const {
return is_same_typeid(other) && same_levels(dynamic_cast<SubsetFactorVisitor*>(other), ss, name);
}

inline bool is_compatible(SubsetVectorVisitor* other, std::stringstream& ss, const std::string& name) const {
inline bool is_compatible(SubsetVectorVisitor* other, std::stringstream&, const SymbolString&) const {
return is_same_typeid(other) || (typeid(*other) == typeid(SubsetVectorVisitorImpl<STRSXP>));
}

private:
inline bool same_levels(SubsetFactorVisitor* other, std::stringstream& ss, const std::string& name) const {
inline bool same_levels(SubsetFactorVisitor* other, std::stringstream& ss, const SymbolString& name) const {
CharacterVector levels_other = other->levels;

if (!character_vector_equal(levels, levels_other)) {
ss << "Factor levels not equal for column '" << name << "'";
ss << "Factor levels not equal for column '" << name.get_utf8_cstring() << "'";
return false;
}
return true;
Expand Down Expand Up @@ -222,7 +222,7 @@ namespace dplyr {
return impl->get_r_type();
}

bool is_compatible(SubsetVectorVisitor* other, std::stringstream&, const std::string&) const {
bool is_compatible(SubsetVectorVisitor* other, std::stringstream&, const SymbolString&) const {
return is_same_typeid(other);
}

Expand All @@ -233,17 +233,20 @@ namespace dplyr {
};

template <>
inline bool SubsetVectorVisitorImpl<INTSXP>::is_compatible(SubsetVectorVisitor* other, std::stringstream&, const std::string&) const {
inline bool SubsetVectorVisitorImpl<INTSXP>::is_compatible(SubsetVectorVisitor* other, std::stringstream&,
const SymbolString&) const {
return is_same_typeid(other) || typeid(*other) == typeid(SubsetVectorVisitorImpl<REALSXP>);
}

template <>
inline bool SubsetVectorVisitorImpl<REALSXP>::is_compatible(SubsetVectorVisitor* other, std::stringstream&, const std::string&) const {
inline bool SubsetVectorVisitorImpl<REALSXP>::is_compatible(SubsetVectorVisitor* other, std::stringstream&,
const SymbolString&) const {
return is_same_typeid(other) || typeid(*other) == typeid(SubsetVectorVisitorImpl<INTSXP>);
}

template <>
inline bool SubsetVectorVisitorImpl<STRSXP>::is_compatible(SubsetVectorVisitor* other, std::stringstream&, const std::string&) const {
inline bool SubsetVectorVisitorImpl<STRSXP>::is_compatible(SubsetVectorVisitor* other, std::stringstream&,
const SymbolString&) const {
return is_same_typeid(other) || typeid(*other) == typeid(SubsetFactorVisitor);
}

Expand Down
2 changes: 1 addition & 1 deletion inst/include/dplyr/VectorVisitorImpl.h
Expand Up @@ -135,7 +135,7 @@ namespace dplyr {

inline std::string get_r_type() const {
CharacterVector classes = get_class(Parent::vec);
return collapse(classes);
return collapse_utf8(classes);
}

private:
Expand Down
2 changes: 1 addition & 1 deletion inst/include/dplyr/checks.h
Expand Up @@ -72,7 +72,7 @@ namespace dplyr {
if (name.is_empty()) {
Rcpp::stop("Unsupported type %s", type_name(x));
} else {
Rcpp::stop("Column `%s` must be a vector, not a %s", name.get_cstring(), type_name(x));
Rcpp::stop("Column `%s` must be a vector, not a %s", name.get_utf8_cstring(), type_name(x));
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion inst/include/tools/SymbolMap.h
Expand Up @@ -81,7 +81,7 @@ namespace dplyr {
int get(const SymbolString& name) const {
SymbolMapIndex index = get_index(name);
if (index.origin == NEW) {
stop("variable '%s' not found", name.get_cstring());
stop("variable '%s' not found", name.get_utf8_cstring());
}
return index.pos;
}
Expand Down
11 changes: 9 additions & 2 deletions inst/include/tools/SymbolString.h
Expand Up @@ -27,8 +27,11 @@ namespace dplyr {
return Symbol(Rf_translateChar(s.get_sexp()));
}

const std::string get_cstring() const {
return s.get_cstring();
const std::string get_utf8_cstring() const {
static Environment rlang = Environment::namespace_env("rlang");
static Function as_string = Function("as_string", rlang);
SEXP utf8_string = as_string(Rf_lang2(R_QuoteSymbol, get_symbol()));
return CHAR(STRING_ELT(utf8_string, 0));
}

const bool is_empty() const {
Expand All @@ -39,6 +42,10 @@ namespace dplyr {
return s.get_sexp();
}

bool operator==(const SymbolString& other) const {
return Rf_NonNullStringMatch(get_sexp(), other.get_sexp());
}

private:
String s;
};
Expand Down
14 changes: 7 additions & 7 deletions inst/include/tools/collapse.h
@@ -1,22 +1,22 @@
#ifndef dplyr_collapse_H
#define dplyr_collapse_H

namespace Rcpp {
namespace dplyr {

template <int RTYPE>
const char* toString(typename ::Rcpp::traits::storage_type<RTYPE>::type from) {
SEXP s = internal::r_coerce<RTYPE,STRSXP>(from);
return CHAR(s);
const char* to_string_utf8(typename Rcpp::traits::storage_type<RTYPE>::type from) {
SEXP s = Rcpp::internal::r_coerce<RTYPE,STRSXP>(from);
return Rf_translateCharUTF8(s);
}

template <int RTYPE>
std::string collapse(const Vector<RTYPE>& x, const char* sep = ", ") {
std::string collapse_utf8(const Vector<RTYPE>& x, const char* sep = ", ") {
std::stringstream ss;
int n = x.size();
if (n > 0) {
ss << toString<RTYPE>(x[0]);
ss << to_string_utf8<RTYPE>(x[0]);
for (int i=1; i<n; i++) {
const char* st = toString<RTYPE>(x[i]);
const char* st = to_string_utf8<RTYPE>(x[i]);
ss << sep << st;
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/Makevars
@@ -1,5 +1,5 @@
# Disable long types from C99 or CPP11 extensions
PKG_CPPFLAGS = -I../inst/include -DCOMPILING_DPLYR -DBOOST_NO_INT64_T -DBOOST_NO_INTEGRAL_INT64_T -DBOOST_NO_LONG_LONG
PKG_CPPFLAGS = -I../inst/include -DCOMPILING_DPLYR -DBOOST_NO_INT64_T -DBOOST_NO_INTEGRAL_INT64_T -DBOOST_NO_LONG_LONG -DRCPP_USING_UTF8_ERROR_STRING

all: $(SHLIB)

Expand Down
2 changes: 1 addition & 1 deletion src/Makevars.win
@@ -1 +1 @@
PKG_CPPFLAGS = -I../inst/include -DCOMPILING_DPLYR
PKG_CPPFLAGS = -I../inst/include -DCOMPILING_DPLYR -DRCPP_USING_UTF8_ERROR_STRING
18 changes: 8 additions & 10 deletions src/api.cpp
Expand Up @@ -43,7 +43,7 @@ namespace dplyr {

for (int i=0; i<n; i++) {
if (indices[i] == NA_INTEGER) {
stop("unknown column '%s' ", names[i].get_cstring());
stop("unknown column '%s' ", names[i].get_utf8_cstring());
}
SEXP column = data[indices[i]-1];
visitors.push_back(visitor(column));
Expand All @@ -58,28 +58,26 @@ namespace dplyr {
copy_vars(x, data);
}

DataFrameJoinVisitors::DataFrameJoinVisitors(const Rcpp::DataFrame& left_, const Rcpp::DataFrame& right_, Rcpp::CharacterVector names_left, Rcpp::CharacterVector names_right, bool warn_, bool na_match) :
DataFrameJoinVisitors::DataFrameJoinVisitors(const DataFrame& left_, const DataFrame& right_, const SymbolVector& names_left, const SymbolVector& names_right, bool warn_, bool na_match) :
left(left_), right(right_),
visitor_names_left(names_left),
visitor_names_right(names_right),
nvisitors(names_left.size()),
visitors(nvisitors),
warn(warn_)
{
std::string name_left, name_right;

IntegerVector indices_left = r_match(names_left, RCPP_GET_NAMES(left));
IntegerVector indices_right = r_match(names_right, RCPP_GET_NAMES(right));
IntegerVector indices_left = names_left.match_in_table(RCPP_GET_NAMES(left));
IntegerVector indices_right = names_right.match_in_table(RCPP_GET_NAMES(right));

for (int i=0; i<nvisitors; i++) {
name_left = names_left[i];
name_right = names_right[i];
const SymbolString& name_left = names_left[i];
const SymbolString& name_right = names_right[i];

if (indices_left[i] == NA_INTEGER) {
stop("'%s' column not found in lhs, cannot join", name_left);
stop("'%s' column not found in lhs, cannot join", name_left.get_utf8_cstring());
}
if (indices_right[i] == NA_INTEGER) {
stop("'%s' column not found in rhs, cannot join", name_right);
stop("'%s' column not found in rhs, cannot join", name_right.get_utf8_cstring());
}

visitors[i] = join_visitor(left[indices_left[i]-1], right[indices_right[i]-1], name_left, name_right, warn, na_match);
Expand Down
3 changes: 2 additions & 1 deletion src/filter.cpp
Expand Up @@ -3,6 +3,7 @@
#include <tools/hash.h>
#include <tools/Quosure.h>
#include <tools/utils.h>
#include <tools/SymbolString.h>

#include <dplyr/GroupedDataFrame.h>

Expand Down Expand Up @@ -48,7 +49,7 @@ SEXP assert_correct_filter_subcall(SEXP x, const SymbolSet& set, const Environme
} else if (x == Rf_install("F")) {
return Rf_ScalarLogical(FALSE);
}
stop("unknown column : %s", CHAR(PRINTNAME(x)));
stop("unknown column : %s", SymbolString(Symbol(x)).get_utf8_cstring());
}
return res;
}
Expand Down
4 changes: 2 additions & 2 deletions src/group_indices.cpp
Expand Up @@ -46,15 +46,15 @@ DataFrame build_index_cpp(DataFrame data) {
for (int i = 0; i < nvars; ++i) {
int pos = indx[i];
if (pos == NA_INTEGER) {
stop("unknown column '%s' ", vars[i].get_cstring());
stop("unknown column '%s' ", vars[i].get_utf8_cstring());
}

SEXP v = data[pos-1];

if (!white_list(v) || TYPEOF(v) == VECSXP) {
stop(
"cannot group column %s, of class '%s'",
vars[i].get_cstring(),
vars[i].get_utf8_cstring(),
get_single_class(v));
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/hybrid.cpp
Expand Up @@ -157,7 +157,7 @@ namespace dplyr {
} else if (TYPEOF(call) == SYMSXP) {
SymbolString name = SymbolString(Symbol(call));

LOG_VERBOSE << "Searching hybrid handler for symbol " << name.get_cstring();
LOG_VERBOSE << "Searching hybrid handler for symbol " << name.get_utf8_cstring();

if (subsets.count(name)) {
LOG_VERBOSE << "Using hybrid variable handler";
Expand Down

0 comments on commit dfe5add

Please sign in to comment.