Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add initial support for UTF-8 text input #8098

Draft
wants to merge 8 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions VisualStudio/fheroes2/sources.props
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
<ItemGroup>
<ClCompile Include="src\engine\agg_file.cpp" />
<ClCompile Include="src\engine\audio.cpp" />
<ClCompile Include="src\engine\char_encoding.cpp" />
<ClCompile Include="src\engine\core.cpp" />
<ClCompile Include="src\engine\dir.cpp" />
<ClCompile Include="src\engine\h2d_file.cpp" />
Expand Down Expand Up @@ -238,6 +239,7 @@
<ItemGroup>
<ClInclude Include="src\engine\agg_file.h" />
<ClInclude Include="src\engine\audio.h" />
<ClInclude Include="src\engine\char_encoding.h" />
<ClInclude Include="src\engine\core.h" />
<ClInclude Include="src\engine\dir.h" />
<ClInclude Include="src\engine\endian_h2.h" />
Expand Down
135 changes: 135 additions & 0 deletions src/engine/char_encoding.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
/***************************************************************************
* fheroes2: https://github.com/ihhub/fheroes2 *
* Copyright (C) 2023 *
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program; if not, write to the *
* Free Software Foundation, Inc., *
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
***************************************************************************/

#include "char_encoding.h"

#include <array>
#include <cassert>

namespace
{
    // Unicode code points of the upper half (bytes 128..255) of the CP1251 code page.
    // All code page related array indexes are equal to index + 128 value in their code pages.
    // For example, the first element of the array is value 128 in the code page.
    // The 0x0000 entry (index 24, byte 0x98) is a placeholder: it can never be matched by
    // codePointToCP1251() because code point 0 is handled by the ASCII branch first.
    const std::array<uint32_t, 128> cp1251CodePoints{ 0x0402, 0x0403, 0x201A, 0x0453, 0x201E, 0x2026, 0x2020, 0x2021, 0x20AC, 0x2030, 0x0409, 0x2039, 0x040A,
                                                      0x040C, 0x040B, 0x040F, 0x0452, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0x0000, 0x2122,
                                                      0x0459, 0x203A, 0x045A, 0x045C, 0x045B, 0x045F, 0x00A0, 0x040E, 0x045E, 0x0408, 0x00A4, 0x0490, 0x00A6,
                                                      0x00A7, 0x0401, 0x00A9, 0x0404, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x0407, 0x00B0, 0x00B1, 0x0406, 0x0456,
                                                      0x0491, 0x00B5, 0x00B6, 0x00B7, 0x0451, 0x2116, 0x0454, 0x00BB, 0x0458, 0x0405, 0x0455, 0x0457, 0x0410,
                                                      0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, 0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D,
                                                      0x041E, 0x041F, 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, 0x0428, 0x0429, 0x042A,
                                                      0x042B, 0x042C, 0x042D, 0x042E, 0x042F, 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,
                                                      0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F, 0x0440, 0x0441, 0x0442, 0x0443, 0x0444,
                                                      0x0445, 0x0446, 0x0447, 0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F };

    // Converts a Unicode code point into the corresponding CP1251 byte.
    // Values below 0x80 are returned unchanged. Any other value is searched for in
    // cp1251CodePoints and translated to (index + 128).
    // Returns 0 when the code point has no entry in the table.
    uint8_t codePointToCP1251( const uint32_t value )
    {
        if ( value < 0x80 ) {
            // This is an ASCII character that is a part of this code page.
            return static_cast<uint8_t>( value );
        }

        // TODO: optimize the code to avoid looping.
        for ( size_t i = 0; i < cp1251CodePoints.size(); ++i ) {
            if ( cp1251CodePoints[i] == value ) {
                return static_cast<uint8_t>( i + 128 );
            }
        }

        // This is an invalid character.
        return 0;
    }
}

namespace Encoding
{
// Decodes the single UTF-8 encoded character that starts at data[0].
//
// data      - pointer to the first byte of the sequence. Must not be null.
// length    - number of readable bytes at data. Must not be 0. No more than 4 bytes are ever read.
// codePoint - receives the decoded value on success and is left untouched on failure.
//
// Returns true if a well-formed sequence was decoded. Returns false for truncated sequences,
// stray continuation bytes, invalid lead bytes, overlong encodings, UTF-16 surrogates
// (U+D800..U+DFFF) and values above U+10FFFF.
bool utf8ToCodePoint( const uint8_t * data, size_t length, uint32_t & codePoint )
{
    if ( data == nullptr || length == 0 ) {
        // Why are you trying to decode empty data?
        assert( 0 );
        return false;
    }

    const uint8_t leadByte = data[0];

    if ( leadByte < 0x80 ) {
        // This is an ASCII character. No need further processing.
        codePoint = leadByte;
        return true;
    }

    // The lead byte defines the sequence length, the payload bits it carries and
    // the smallest code point that is allowed to use a sequence of this length.
    size_t sequenceLength = 0;
    uint32_t decoded = 0;
    uint32_t smallestAllowed = 0;

    if ( ( leadByte >> 5 ) == 0x06 ) {
        // 110xxxxx: this is a 2 byte character.
        sequenceLength = 2;
        decoded = leadByte & 0x1FU;
        smallestAllowed = 0x80;
    }
    else if ( ( leadByte >> 4 ) == 0x0E ) {
        // 1110xxxx: this is a 3 byte character.
        sequenceLength = 3;
        decoded = leadByte & 0x0FU;
        smallestAllowed = 0x800;
    }
    else if ( ( leadByte >> 3 ) == 0x1E ) {
        // 11110xxx: this is a 4 byte character.
        sequenceLength = 4;
        decoded = leadByte & 0x07U;
        smallestAllowed = 0x10000;
    }
    else {
        // This is either a stray continuation byte or an invalid lead byte.
        return false;
    }

    if ( length < sequenceLength ) {
        // The sequence is truncated.
        return false;
    }

    for ( size_t i = 1; i < sequenceLength; ++i ) {
        if ( ( data[i] >> 6 ) != 0x02 ) {
            // Every trailing byte must look like 10xxxxxx.
            return false;
        }

        decoded = ( decoded << 6 ) | ( data[i] & 0x3FU );
    }

    if ( decoded < smallestAllowed ) {
        // Overlong encoding: the value must have been encoded with fewer bytes.
        return false;
    }

    if ( decoded >= 0xD800 && decoded <= 0xDFFF ) {
        // UTF-16 surrogates are not valid Unicode scalar values.
        return false;
    }

    if ( decoded > 0x10FFFF ) {
        // The value is beyond the Unicode range.
        return false;
    }

    codePoint = decoded;
    return true;
}

// Translates a Unicode code point into its single-byte value in the requested code page.
// CP1251 is resolved through codePointToCP1251(). For every other code page only the
// values below 0x80 are passed through unchanged.
// Returns 0 when the code point cannot be represented.
uint8_t getCodePageCharacter( const uint32_t value, const CodePage codePage )
{
    const bool useCP1251Table = ( codePage == CodePage::CP1251 );
    if ( useCP1251Table ) {
        return codePointToCP1251( value );
    }

    // No conversion table exists for the remaining code pages, so only the ASCII range is supported.
    const uint8_t unsupported = 0;
    if ( value >= 0x80 ) {
        return unsupported;
    }

    return static_cast<uint8_t>( value );
}

// Tells whether the given code point lies within the 7-bit ASCII range [0, 0x7F].
bool isASCIICharacter( const uint32_t value )
{
    const uint32_t lastASCIIValue = 0x7F;

    return value <= lastASCIIValue;
}
}
43 changes: 43 additions & 0 deletions src/engine/char_encoding.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/***************************************************************************
* fheroes2: https://github.com/ihhub/fheroes2 *
* Copyright (C) 2023 *
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program; if not, write to the *
* Free Software Foundation, Inc., *
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
***************************************************************************/

#pragma once

#include <cstddef>
#include <cstdint>

namespace Encoding
{
    // Code pages a Unicode code point can be converted to by getCodePageCharacter().
    // The underlying type is kept to a single byte.
    enum class CodePage : uint8_t
    {
        ASCII,
        CP1250,
        CP1251,
        CP1252,
        CP1254,
        CP1258,
        ISO8859_16
    };

    // Decodes the single UTF-8 encoded character that starts at data[0].
    // data must not be null and length must not be 0: the implementation asserts on such input
    // and returns false. At most the first 4 bytes are examined.
    // On success returns true and stores the decoded value in codePoint.
    // On failure returns false and leaves codePoint unchanged.
    bool utf8ToCodePoint( const uint8_t * data, size_t length, uint32_t & codePoint );

    // Returns the single-byte value of the given Unicode code point in the given code page,
    // or 0 if the code point cannot be represented in it.
    uint8_t getCodePageCharacter( const uint32_t value, const CodePage codePage );

    // Returns true if the given code point is below 0x80.
    bool isASCIICharacter( const uint32_t value );
}
Loading
Loading