diff --git a/backend/src/services/aws.ts b/backend/src/services/aws.ts index 67631e1fb2..ab83beb486 100644 --- a/backend/src/services/aws.ts +++ b/backend/src/services/aws.ts @@ -1,4 +1,5 @@ import AWS, { SQS } from 'aws-sdk' +import { trimUtf8ToMaxByteLength } from '@crowd/common' import { COMPREHEND_CONFIG, IS_DEV_ENV, KUBE_MODE, S3_CONFIG, SQS_CONFIG } from '../conf' let sqsInstance @@ -100,22 +101,6 @@ if (KUBE_MODE) { : undefined } -const trimUtf8ToMaxByteLength = (utf8Str: string, maxByteLength: number): string => { - if (Buffer.byteLength(utf8Str, 'utf8') > maxByteLength) { - // this will get us close but some characters could be multibyte encoded so we might need to trim a bit more - utf8Str = utf8Str.slice(0, maxByteLength) - } - - // trim till we get to the requested byte length or lower (if we cut multibyte character) - let byteLength = Buffer.byteLength(utf8Str, 'utf8') - while (byteLength > maxByteLength) { - utf8Str = utf8Str.slice(0, -1) - byteLength = Buffer.byteLength(utf8Str, 'utf8') - } - - return utf8Str -} - const ALLOWED_MAX_BYTE_LENGTH = 5000 /** diff --git a/services/apps/search_sync_worker/src/service/member.sync.service.ts b/services/apps/search_sync_worker/src/service/member.sync.service.ts index 88ba22809f..a0155ec115 100644 --- a/services/apps/search_sync_worker/src/service/member.sync.service.ts +++ b/services/apps/search_sync_worker/src/service/member.sync.service.ts @@ -2,7 +2,7 @@ import { SERVICE_CONFIG } from '@/conf' import { IDbMemberSyncData } from '@/repo/member.data' import { MemberRepository } from '@/repo/member.repo' import { OpenSearchIndex } from '@/types' -import { distinct, distinctBy, groupBy } from '@crowd/common' +import { distinct, distinctBy, groupBy, trimUtf8ToMaxByteLength } from '@crowd/common' import { DbStore } from '@crowd/database' import { Logger, LoggerBase, logExecutionTime } from '@crowd/logging' import { RedisClient } from '@crowd/redis' @@ -433,13 +433,17 @@ export class MemberSyncService extends 
/**
 * Truncates a string so that its UTF-8 encoding fits into `maxByteLength` bytes.
 *
 * The cut is always made between whole Unicode code points, so the result never
 * ends with one half of a surrogate pair (e.g. a partially cut emoji). The string
 * is scanned once, which keeps the cost linear even for large inputs and limits.
 *
 * @param utf8Str - text to truncate; a non-string value is returned unchanged
 * @param maxByteLength - maximum size of the result, in UTF-8 encoded bytes
 * @returns the longest prefix of `utf8Str` made of whole code points whose UTF-8
 *          encoding is at most `maxByteLength` bytes; `''` when the limit is zero
 *          or negative
 */
export const trimUtf8ToMaxByteLength = (utf8Str: string, maxByteLength: number): string => {
  // some callers pass loosely typed data (values read from `any` objects), so
  // tolerate null/undefined/non-string input instead of throwing
  if (typeof utf8Str !== 'string') {
    return utf8Str
  }

  // nothing can fit into a zero or negative budget
  if (maxByteLength <= 0) {
    return ''
  }

  let byteLength = 0
  let index = 0

  while (index < utf8Str.length) {
    const codeUnit = utf8Str.charCodeAt(index)

    // defaults cover the rest of the BMP and unpaired surrogates
    // (an unpaired surrogate is encoded as U+FFFD, which takes 3 bytes)
    let charBytes = 3
    let charUnits = 1

    if (codeUnit < 0x80) {
      charBytes = 1
    } else if (codeUnit < 0x800) {
      charBytes = 2
    } else if (codeUnit >= 0xd800 && codeUnit <= 0xdbff && index + 1 < utf8Str.length) {
      const nextUnit = utf8Str.charCodeAt(index + 1)
      if (nextUnit >= 0xdc00 && nextUnit <= 0xdfff) {
        // valid surrogate pair: one code point, 4 bytes, 2 UTF-16 code units
        charBytes = 4
        charUnits = 2
      }
    }

    // stop before the first code point that would exceed the budget
    if (byteLength + charBytes > maxByteLength) {
      break
    }

    byteLength += charBytes
    index += charUnits
  }

  return index === utf8Str.length ? utf8Str : utf8Str.slice(0, index)
}
a/services/libs/common/src/index.ts b/services/libs/common/src/index.ts index a1f5a4a22a..e60b3340cb 100644 --- a/services/libs/common/src/index.ts +++ b/services/libs/common/src/index.ts @@ -9,3 +9,4 @@ export * from './strings' export * from './types' export * from './requestThrottler' export * from './rawQueryParser' +export * from './byteLength' diff --git a/services/libs/sentiment/src/sentiment.ts b/services/libs/sentiment/src/sentiment.ts index 980dc7eacb..56b95a78c7 100644 --- a/services/libs/sentiment/src/sentiment.ts +++ b/services/libs/sentiment/src/sentiment.ts @@ -8,6 +8,7 @@ import { IS_DEV_ENV } from '@crowd/common' import { getServiceChildLogger } from '@crowd/logging' import { getComprehendClient } from './client' import { ISentimentAnalysisResult, ISentimentClientConfig } from './types' +import { trimUtf8ToMaxByteLength } from '@crowd/common' const log = getServiceChildLogger('sentiment') @@ -134,22 +135,6 @@ const mapResult = (result: DetectSentimentResponse): ISentimentAnalysisResult => } } -const trimUtf8ToMaxByteLength = (utf8Str: string, maxByteLength: number): string => { - if (Buffer.byteLength(utf8Str, 'utf8') > maxByteLength) { - // this will get us close but some characters could be multibyte encoded so we might need to trim a bit more - utf8Str = utf8Str.slice(0, maxByteLength) - } - - // trim till we get to the requested byte length or lower (if we cut multibyte character) - let byteLength = Buffer.byteLength(utf8Str, 'utf8') - while (byteLength > maxByteLength) { - utf8Str = utf8Str.slice(0, -1) - byteLength = Buffer.byteLength(utf8Str, 'utf8') - } - - return utf8Str -} - const ALLOWED_MAX_BYTE_LENGTH = 5000 const prepareText = (text: string): string => {