Skip to content

Commit

Permalink
feat(bigquery/storage/managedwriter): enable field name indirection (g…
Browse files Browse the repository at this point in the history
…oogleapis#6247)

This PR wires in validation testing for extension-based naming indirection, 
and adds support for rich names to the adapt subpackage.  

This feature is still in preview, but this will enable use for customers enrolled in the preview.

Because this change effectively allows special characters in field names, this PR also
augments testing utilities that emit SQL to escape said identifiers.
  • Loading branch information
shollyman committed Oct 4, 2022
1 parent 373d2fc commit 1969273
Show file tree
Hide file tree
Showing 7 changed files with 498 additions and 289 deletions.
78 changes: 50 additions & 28 deletions bigquery/storage/managedwriter/adapt/protoconversion.go
Expand Up @@ -286,46 +286,68 @@ func storageSchemaToDescriptorInternal(inSchema *storagepb.TableSchema, scope st
//
// Messages are always nullable, and repeated fields are as well.
func tableFieldSchemaToFieldDescriptorProto(field *storagepb.TableFieldSchema, idx int32, scope string, useProto3 bool) (*descriptorpb.FieldDescriptorProto, error) {
// NOTE(review): this span is a rendered diff hunk. Removed and added lines are
// interleaved without +/- markers, so some statements below appear twice (once
// in the early-return form, once in the form that assigns to fdp).

// The column name is lowercased once; the lowercased value is what is used both
// as the proto field name and as the value of the column-name annotation below.
name := strings.ToLower(field.GetName())
var fdp *descriptorpb.FieldDescriptorProto

// STRUCT fields refer to a message type by name (scope) instead of carrying a
// scalar Type.
if field.GetType() == storagepb.TableFieldSchema_STRUCT {
return &descriptorpb.FieldDescriptorProto{
fdp = &descriptorpb.FieldDescriptorProto{
Name: proto.String(name),
Number: proto.Int32(idx),
TypeName: proto.String(scope),
Label: convertModeToLabel(field.GetMode(), useProto3),
}, nil
}

// For (REQUIRED||REPEATED) fields for proto3, or all cases for proto2, we can use the expected scalar types.
if field.GetMode() != storagepb.TableFieldSchema_NULLABLE || !useProto3 {
outType := bqTypeToFieldTypeMap[field.GetType()]
fdp := &descriptorpb.FieldDescriptorProto{
Name: proto.String(name),
Number: proto.Int32(idx),
Type: outType.Enum(),
Label: convertModeToLabel(field.GetMode(), useProto3),
}
// Special case: proto2 repeated fields may benefit from using packed annotation.
if field.GetMode() == storagepb.TableFieldSchema_REPEATED && !useProto3 {
for _, v := range packedTypes {
if outType == v {
fdp.Options = &descriptorpb.FieldOptions{
Packed: proto.Bool(true),
} else {
// For (REQUIRED||REPEATED) fields for proto3, or all cases for proto2, we can use the expected scalar types.
if field.GetMode() != storagepb.TableFieldSchema_NULLABLE || !useProto3 {
outType := bqTypeToFieldTypeMap[field.GetType()]
fdp = &descriptorpb.FieldDescriptorProto{
Name: proto.String(name),
Number: proto.Int32(idx),
Type: outType.Enum(),
Label: convertModeToLabel(field.GetMode(), useProto3),
}

// Special case: proto2 repeated fields may benefit from using packed annotation.
if field.GetMode() == storagepb.TableFieldSchema_REPEATED && !useProto3 {
for _, v := range packedTypes {
if outType == v {
fdp.Options = &descriptorpb.FieldOptions{
Packed: proto.Bool(true),
}
break
}
break
}
}
} else {
// For NULLABLE proto3 fields, use a wrapper type.
fdp = &descriptorpb.FieldDescriptorProto{
Name: proto.String(name),
Number: proto.Int32(idx),
Type: descriptorpb.FieldDescriptorProto_TYPE_MESSAGE.Enum(),
TypeName: proto.String(bqTypeToWrapperMap[field.GetType()]),
Label: descriptorpb.FieldDescriptorProto_LABEL_OPTIONAL.Enum(),
}
}
}
// Names that are not valid proto identifiers get a generated placeholder field
// name; the (lowercased) column name is attached via the ColumnName extension.
if nameRequiresAnnotation(name) {
// Use a prefix + base64 encoded name when annotations bear the actual name.
// Base 64 standard encoding may also contain certain characters (+,/,=) which
// we remove from the generated name.
// NOTE(review): strings.Trim strips the cutset only from the two ends of the
// string, so a '+' or '/' in the interior of the encoded text is kept and would
// end up in the generated field name — confirm whether full removal was intended.
encoded := strings.Trim(base64.StdEncoding.EncodeToString([]byte(name)), "+/=")
fdp.Name = proto.String(fmt.Sprintf("col_%s", encoded))
// Options may already be set (packed repeated fields); only allocate when it
// is nil so the extension below has a message to attach to.
opts := fdp.GetOptions()
if opts == nil {
fdp.Options = &descriptorpb.FieldOptions{}
}
return fdp, nil
proto.SetExtension(fdp.Options, storagepb.E_ColumnName, name)
}
// For NULLABLE proto3 fields, use a wrapper type.
return &descriptorpb.FieldDescriptorProto{
Name: proto.String(name),
Number: proto.Int32(idx),
Type: descriptorpb.FieldDescriptorProto_TYPE_MESSAGE.Enum(),
TypeName: proto.String(bqTypeToWrapperMap[field.GetType()]),
Label: descriptorpb.FieldDescriptorProto_LABEL_OPTIONAL.Enum(),
}, nil
return fdp, nil
}

// nameRequiresAnnotation reports whether in fails protoreflect.Name validation,
// meaning it cannot be used as-is for a proto field name and the column name has
// to be conveyed through an annotation instead.
func nameRequiresAnnotation(in string) bool {
	valid := protoreflect.Name(in).IsValid()
	return !valid
}

// NormalizeDescriptor builds a self-contained DescriptorProto suitable for communicating schema
Expand Down
53 changes: 53 additions & 0 deletions bigquery/storage/managedwriter/adapt/protoconversion_test.go
Expand Up @@ -413,6 +413,59 @@ func TestSchemaToProtoConversion(t *testing.T) {
},
},
},
{
description: "indirect names",
bq: &storagepb.TableSchema{
Fields: []*storagepb.TableFieldSchema{
{Name: "foo", Type: storagepb.TableFieldSchema_STRING, Mode: storagepb.TableFieldSchema_NULLABLE},
{Name: "火", Type: storagepb.TableFieldSchema_INT64, Mode: storagepb.TableFieldSchema_REQUIRED},
{Name: "水_addict", Type: storagepb.TableFieldSchema_BYTES, Mode: storagepb.TableFieldSchema_REPEATED},
{Name: "0col", Type: storagepb.TableFieldSchema_INT64, Mode: storagepb.TableFieldSchema_NULLABLE},
{Name: "funny-name", Type: storagepb.TableFieldSchema_INT64, Mode: storagepb.TableFieldSchema_NULLABLE},
}},
wantProto2: func() *descriptorpb.DescriptorProto {
dp := &descriptorpb.DescriptorProto{
Name: proto.String("root"),
Field: []*descriptorpb.FieldDescriptorProto{
{
Name: proto.String("foo"),
Number: proto.Int32(1),
Type: descriptorpb.FieldDescriptorProto_TYPE_STRING.Enum(),
Label: descriptorpb.FieldDescriptorProto_LABEL_OPTIONAL.Enum()},
{
Name: proto.String("col_54Gr"),
Number: proto.Int32(2),
Type: descriptorpb.FieldDescriptorProto_TYPE_INT64.Enum(),
Options: &descriptorpb.FieldOptions{},
Label: descriptorpb.FieldDescriptorProto_LABEL_REQUIRED.Enum()},
{
Name: proto.String("col_5rC0X2FkZGljdA"),
Number: proto.Int32(3),
Type: descriptorpb.FieldDescriptorProto_TYPE_BYTES.Enum(),
Options: &descriptorpb.FieldOptions{},
Label: descriptorpb.FieldDescriptorProto_LABEL_REPEATED.Enum(),
},
{
Name: proto.String("col_MGNvbA"),
Number: proto.Int32(4),
Type: descriptorpb.FieldDescriptorProto_TYPE_INT64.Enum(),
Options: &descriptorpb.FieldOptions{},
Label: descriptorpb.FieldDescriptorProto_LABEL_OPTIONAL.Enum()},
{
Name: proto.String("col_ZnVubnktbmFtZQ"),
Number: proto.Int32(5),
Type: descriptorpb.FieldDescriptorProto_TYPE_INT64.Enum(),
Options: &descriptorpb.FieldOptions{},
Label: descriptorpb.FieldDescriptorProto_LABEL_OPTIONAL.Enum()},
},
}
proto.SetExtension(dp.Field[1].Options, storagepb.E_ColumnName, "火")
proto.SetExtension(dp.Field[2].Options, storagepb.E_ColumnName, "水_addict")
proto.SetExtension(dp.Field[3].Options, storagepb.E_ColumnName, "0col")
proto.SetExtension(dp.Field[4].Options, storagepb.E_ColumnName, "funny-name")
return dp
}(),
},
}
for _, tc := range testCases {
// Proto2
Expand Down
15 changes: 15 additions & 0 deletions bigquery/storage/managedwriter/testdata/schemas.go
Expand Up @@ -257,4 +257,19 @@ var (
Repeated: true,
},
}

// ValidationColumnAnnotations is a schema of three string columns: "first",
// "second", and one whose name consists of non-ASCII characters ("特別コラム").
// NOTE(review): presumably the non-ASCII column exists to exercise the
// column-name annotation path in validation tests — confirm against the callers.
ValidationColumnAnnotations bigquery.Schema = bigquery.Schema{
{
Name: "first",
Type: bigquery.StringFieldType,
},
{
Name: "second",
Type: bigquery.StringFieldType,
},
{
Name: "特別コラム",
Type: bigquery.StringFieldType,
},
}
)

0 comments on commit 1969273

Please sign in to comment.