## Language Detection for Filtering Non-English Descriptions

Use the Python 3.11 environment locally.

In [None]:
from mediapipe.tasks import python
from mediapipe.tasks.python import text

import pandas as pd  # Locally, pandas must be loaded after mediapipe, at least in 3.11

We have a small sample dataset to experiment with before we move on to the full dataset in EDA.

In [2]:
# Load the pickled sample of ABO listings and display it for a quick visual check.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
pdf_samples = pd.read_pickle("../../ABO_dataset/abo-listings-sample.pkl")
pdf_samples

Unnamed: 0_level_0,brand,bullet_point,color,fabric_type,item_name,item_weight,model_name,product_type,style,main_image_id,other_image_id,country,marketplace,domain_name,material,item_keywords,pattern,model_year,product_description,finish_type
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
B083VHXTQ1,[],"[With this firming eye mask, your skin will lo...",,,Amazon Brand:Belei Moisturizing Lifting Eye Ma...,,,ABIS_BEAUTY,[],51ZvgucquXL,"[51PSBY8h8LL, 61ZcL8JqMVL, 61ZomqVpklL, 61ykRD...",DE,Amazon,amazon.de,,[],,,,
B083VJ7P68,BELEI,[Rediscover youthful looking skin with this fi...,,,Amazon Brand - Belei - Lifting Sheet Mask with...,,,ABIS_BEAUTY,Lifting,51G7HvO94pL,"[41qhq+Q55zL, 616anjMkgcL, 51cxHmZQZuL, 61NCNb...",GB,Amazon,amazon.co.uk,,"[peel off face masks, serum, other, multimask,...",,,,
B083VJ54TY,BELEI,[Rediscover youthful looking skin with this fi...,,,Amazon Brand - Belei - Full face treatment she...,,,ABIS_BEAUTY,Full face treatment,51eC8v7TQVL,"[51195iJBIiL, 71UsrlwqWXL, 516GHxy3KEL, 61zImT...",GB,Amazon,amazon.co.uk,,"[peel off face masks, serum, other, multimask,...",,,,
B083VJ435H,Belei,[Rediscover youthful looking skin with this fi...,,,Amazon Brand - Belei - Brightening Sheet Mask ...,,,ABIS_BEAUTY,Brightening,51LoGumNTWL,"[61V1PqjdRQL, 61AbctEZQJL, 41-V96Oy40L, 611MS9...",AU,Amazon,amazon.com.au,,face wash women laneige water sleeping mask ma...,,,,
B07SDFXTS8,find.,"[Pack contains: 4 bottles x 150ml, Gently and ...",,,FIND - Gentle Eye Make-Up Remover Oil Free (4 ...,0.82,,ABIS_BEAUTY,Non-Waterproof Make Up,617AfCdQOSL,"[614O9YXh3OL, 513RvEZH2IL, 61WjBQ+zCdL, 61VZmD...",GB,Amazon,amazon.co.uk,,[kaeso eye make up remover makeup clarins clin...,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B07PQQHW9V,Amazon Brand - Solimo,"[A set of 20 White Board Markers in black(7), ...",multicoloured,,Amazon Brand - Solimo White Board Markers Set ...,299,,WRITING_INSTRUMENT,,81C+fwlZC4L,"[812KYgE73DL, 21-RCMhs5TL, 71v-G80nECL]",IN,Amazon,amazon.in,plastic,"[whiteboard markers, office markers, erasable ...",,,,
B0756H8HMV,AmazonBasics,[12-pack of gel ink pens pre-filled with black...,Black,,AmazonBasics Retractable Gel Ink Pens - Fine P...,0.022443,,WRITING_INSTRUMENT,Fine Point,61gZ4qoUGDL,"[61axV27hGjL, 81C9TttvC2L, 2113uM7xhBL, 41s1cA...",GB,Amazon,amazon.co.uk,Rubber,sakura gelly roll uniball gel impact 1.0 black...,,,,
B07PNFZ9DB,Amazon Brand - Solimo,"[A set of 20 Permanent Markers in black(5), bl...",multicoloured,,Amazon Brand - Solimo Permanent Marker Set (20...,299,,WRITING_INSTRUMENT,,81NlkVGhRzL,"[710fbJVWu8L, 91OTeaNxaSL, 21btX+LZTrL]",IN,Amazon,amazon.in,plastic,"[whiteboard markers, office markers, erasable ...",,,,
B07VX9RBXC,AmazonBasics,,Black,,AmazonBasics Low-Odor Dry Erase White Board Ma...,15.4,,WRITING_INSTRUMENT,,81bcEq1Wa+L,"[81AI0Zq01QL, 51n2DPGs2lL, 51LmU74LN3L, 61dGDW...",IN,Amazon,amazon.in,,,,,,


Language detectors take text, not a Pandas Series, so we need to convert the relevant data in each row to text.

In [3]:
def row_to_text(row):
    """Flatten the descriptive fields of one listing row into a single line of text.

    Identifier/metadata columns (brand, weights, image ids, marketplace info, ...)
    are dropped, missing values are removed, and list-valued cells are expanded
    so that each element becomes its own token.

    Args:
        row: A pandas Series holding one listing, indexed by column name.

    Returns:
        A single string with the remaining values joined by spaces and any
        newlines replaced by spaces.
    """
    excluded_columns = [
        'brand', 'item_weight', 'model_name', 'product_type', 'main_image_id',
        'other_image_id', 'country', 'marketplace', 'domain_name', 'model_year',
    ]
    # errors='ignore' keeps this usable on rows that lack some of the excluded columns.
    row_filtered = row.drop(labels=excluded_columns, errors='ignore').dropna()

    # Named `parts` (not `text`) to avoid shadowing the mediapipe `text` module imported above.
    parts = []
    for item in row_filtered:
        if isinstance(item, list):
            # Coerce to str so numeric list entries cannot break the join; skip None entries.
            parts.extend(str(piece) for piece in item if piece is not None)
        else:
            # Non-string scalars (e.g. numbers) would otherwise make ' '.join raise TypeError.
            parts.append(str(item))

    return ' '.join(parts).replace('\n', ' ')

# Iterate over the rows directly instead of looking each one up by label:
# `.loc[item_id]` returns a DataFrame (not a Series) whenever an item_id occurs
# more than once in the index, which row_to_text cannot process.
text_for_detection = [row_to_text(row) for _, row in pdf_samples.iterrows()]

#### Google's MediaPipe Language Detector for Edge AI

In [4]:
# Collects one list per detector output, keyed by the column name it will have in the
# comparison DataFrame. Re-running this cell discards any results gathered so far.
detection_results = {}

In [69]:
# Configure and create the MediaPipe language detector from the local TFLite model file.
base_options = python.BaseOptions(
    model_asset_path="../../assets/language_detector.tflite",
)
options = text.LanguageDetectorOptions(
    base_options=base_options,
)
mediapipe_detector = text.LanguageDetector.create_from_options(options)

# Smoke test on a short phrase: print each returned detection with its probability.
mediapipe_result = mediapipe_detector.detect('Hello, world!')

for candidate in mediapipe_result.detections:
    print(f'{candidate.language_code}: ({candidate.probability:.2f})')

en: (0.99)


In [6]:
# For every text, record the first detection MediaPipe returns (language code and
# probability); texts with no detections at all are recorded as None in both lists.
mediapipe_languages = detection_results['mediapipe languages'] = []
mediapipe_confidences = detection_results['mediapipe confidences'] = []

for item in text_for_detection:
    mediapipe_result = mediapipe_detector.detect(item).detections
    if not mediapipe_result:
        language_code, probability = None, None
    else:
        first_detection = mediapipe_result[0]
        language_code = first_detection.language_code
        probability = first_detection.probability
    mediapipe_languages.append(language_code)
    mediapipe_confidences.append(probability)

detection_results

{'mediapipe languages': ['en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  None,
  'en',
  None,
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
  'en',
 

In [7]:
# Number of texts for which MediaPipe returned no detection (stored as None).
detection_results['mediapipe languages'].count(None)

7

#### Google Cloud Translate for Verification

We treat Google Cloud Translate's language detection as the ground truth for this comparison.

In [8]:
def _google_translate_client():
    """Return a Google Cloud Translate client, created on first use and then reused."""
    client = getattr(_google_translate_client, "_cached", None)
    if client is None:
        # Imported here, as in the original cell, so the dependency is only
        # needed when this function is actually used.
        from google.cloud import translate_v2 as translate

        client = translate.Client()
        _google_translate_client._cached = client
    return client


def detect_language_google_cloud(text: "str | list[str]", client=None) -> "dict | list[dict]":
    """Detect the language of one text or of several texts.

    Args:
        text: A single string, or a sequence of strings.
        client: Optional object exposing ``detect_language``. When omitted, a
            shared Translate client is created once and reused, instead of
            constructing a new client on every call.

    Returns:
        The result of ``client.detect_language(text)``: one result dict for a
        single string, or a sequence of result dicts (one per input) when a
        sequence of strings is passed.
    """
    if client is None:
        client = _google_translate_client()

    # Text can also be a sequence of strings, in which case this method
    # will return a sequence of results for each text.
    return client.detect_language(text)

# Smoke test with a list of two strings, which exercises the batch form of the call.
detect_language_google_cloud(['hello, world!', 'this is a string'])

[{'confidence': 0.7634849548339844,
  'language': 'en',
  'input': 'hello, world!'},
 {'language': 'en', 'confidence': 1, 'input': 'this is a string'}]

In [22]:
# Query Google Cloud once per text, then split the responses into two parallel
# lists (detected language and its confidence) stored next to the MediaPipe results.
google_cloud_detections = [
    detect_language_google_cloud(text_item) for text_item in text_for_detection
]
detection_results['google cloud languages'] = [
    detection['language'] for detection in google_cloud_detections
]
detection_results['google cloud confidences'] = [
    detection['confidence'] for detection in google_cloud_detections
]

#### Comparison of MediaPipe and Google Cloud Language Detection

In [23]:
# Combine the per-detector lists into one table: one column per list, one row per text.
detections_df = pd.DataFrame(detection_results)

# Persist the detections to disk next to the dataset.
detections_df.to_pickle("../../ABO_dataset/abo-listings-sample-language-detections.pkl")

In [46]:
# Boolean mask of rows where the two detectors disagree; rows where MediaPipe
# produced no language at all also show up here.
mediapipe_codes = detections_df['mediapipe languages']
google_cloud_codes = detections_df['google cloud languages']
mismatches = mediapipe_codes != google_cloud_codes
detections_df.loc[mismatches]

Unnamed: 0,mediapipe languages,mediapipe confidences,google cloud languages,google cloud confidences
19,,,en,0.966677
21,,,en,0.985723
193,,,en,0.857004
273,en,0.886047,om,0.661082
315,ro,0.379533,en,0.29801
414,unknown,0.449822,en,0.284403
641,nl,0.648353,en,0.645524
875,de,0.484172,en,1.0
896,en,0.999258,gu-Latn,0.35835
897,en,0.999696,om,0.382614


In [58]:
# Show the raw text behind each mismatch. The index labels of the mismatching rows are
# passed to .iloc as positions.
# NOTE(review): this relies on detections_df having a default 0..n-1 index that lines up
# with text_for_detection — confirm if detections_df is ever re-indexed or reloaded.
pd.DataFrame(text_for_detection).iloc[detections_df[mismatches].index]

Unnamed: 0,0
19,AmazonBasics 100ml Ultrasonic Aromatherapy Ess...
21,AmazonBasics 100ml Ultrasonic Aromatherapy Ess...
193,AmazonBasics Audio Baby Monitor On Amazon Bran...
273,AmazonBasics Battery Charger for NI-MH AA/AAA ...
315,Movian Cinca
414,Cubic zirconia (CZ) [Find] Amazon Collection p...
641,AmazonBasics Urban Laptop Sleeve
875,"Saltines Crackers, Original 16 oz. Saltines Cr..."
896,100 percent natural buffalo ghee High-quality ...
897,100% natural cow ghee High-quality ghee made f...


In [44]:
# Number of rows on which the two detectors disagree (True entries in the mask).
int(mismatches.sum())

57

In [39]:
# Number of rows MediaPipe did not label 'en' (this includes rows with no detection).
int((detections_df['mediapipe languages'] != 'en').sum())

28

It appears that everything can be treated as English in this sample. However, out of caution, we should double-check the larger dataset. Moving forward, we can use the MediaPipe model, and verify any non-English detections against Google Cloud Language Detection.

### Other Alternatives Explored

#### Try Lingua, a Python library all on its own

A quick example of how Lingua works

In [49]:
from lingua import LanguageDetectorBuilder

# Build a Lingua detector over all of its languages, with the language models preloaded.
lingua_detector = (
    LanguageDetectorBuilder
    .from_all_languages()
    .with_preloaded_language_models()
    .build()
)

# Demo: print every candidate language and its confidence for a short phrase.
confidence_values = lingua_detector.compute_language_confidence_values("Hello, world!")
for candidate in confidence_values:
    print(f"{candidate.language.name}: {candidate.value:.2f}")

ENGLISH: 0.14
TAGALOG: 0.07
WELSH: 0.07
SOTHO: 0.05
ITALIAN: 0.05
YORUBA: 0.04
SPANISH: 0.04
ESPERANTO: 0.03
ALBANIAN: 0.03
BOKMAL: 0.03
NYNORSK: 0.03
SWEDISH: 0.03
DUTCH: 0.03
LATIN: 0.02
TSONGA: 0.02
GERMAN: 0.02
VIETNAMESE: 0.02
SHONA: 0.02
DANISH: 0.02
CATALAN: 0.02
SOMALI: 0.01
HUNGARIAN: 0.01
INDONESIAN: 0.01
FRENCH: 0.01
FINNISH: 0.01
AFRIKAANS: 0.01
POLISH: 0.01
SLOVAK: 0.01
PORTUGUESE: 0.01
ICELANDIC: 0.01
GANDA: 0.01
BASQUE: 0.01
TURKISH: 0.01
CROATIAN: 0.01
CZECH: 0.01
MAORI: 0.01
SLOVENE: 0.01
ROMANIAN: 0.01
MALAY: 0.01
ZULU: 0.01
ESTONIAN: 0.01
TSWANA: 0.01
BOSNIAN: 0.01
XHOSA: 0.01
IRISH: 0.01
LITHUANIAN: 0.01
SWAHILI: 0.00
LATVIAN: 0.00
AZERBAIJANI: 0.00
ARABIC: 0.00
ARMENIAN: 0.00
BELARUSIAN: 0.00
BENGALI: 0.00
BULGARIAN: 0.00
CHINESE: 0.00
GEORGIAN: 0.00
GREEK: 0.00
GUJARATI: 0.00
HEBREW: 0.00
HINDI: 0.00
JAPANESE: 0.00
KAZAKH: 0.00
KOREAN: 0.00
MACEDONIAN: 0.00
MARATHI: 0.00
MONGOLIAN: 0.00
PERSIAN: 0.00
PUNJABI: 0.00
RUSSIAN: 0.00
SERBIAN: 0.00
TAMIL: 0.00
TELUGU: 0.

Check dataset for non-English detections

In [60]:
# Run Lingua over every text and report those whose first-listed language is not
# English, together with that language's confidence value.
non_eng_count = 0
for i, text_item in enumerate(text_for_detection):
    first_guess = lingua_detector.compute_language_confidence_values(text_item)[0]
    if first_guess.language.name == 'ENGLISH':
        continue
    non_eng_count += 1
    print(f"{i}: {first_guess.language.name}: {first_guess.value:.2f}")

print(f'Non-English count = {non_eng_count}')

133: GERMAN: 1.00
143: GERMAN: 1.00
315: INDONESIAN: 0.14
410: YORUBA: 0.48
414: SHONA: 0.95
628: HUNGARIAN: 0.15
800: GERMAN: 0.45
802: GERMAN: 0.45
807: NYNORSK: 0.14
920: TSONGA: 0.08
972: DUTCH: 1.00
1065: DUTCH: 1.00
1210: SWAHILI: 0.16
1223: GERMAN: 0.22
1324: LATIN: 0.48
1378: LATIN: 0.09
1535: FRENCH: 0.30
1545: LATIN: 0.31
1624: TAGALOG: 0.96
2133: TAGALOG: 0.81
2256: ESPERANTO: 0.27
2283: DANISH: 0.74
2689: GERMAN: 0.14
2693: GERMAN: 0.10
2694: GERMAN: 0.10
2797: XHOSA: 0.76
2798: YORUBA: 0.47
2800: LATIN: 0.28
2801: LATIN: 0.29
2802: YORUBA: 0.34
2803: LATIN: 0.75
2804: YORUBA: 0.33
2806: YORUBA: 0.74
2808: GERMAN: 0.52
2809: GERMAN: 0.61
2963: LATIN: 1.00
3102: GERMAN: 0.07
3164: GERMAN: 1.00
3278: GERMAN: 0.19
3288: NYNORSK: 0.15
3582: GERMAN: 0.10
3621: GERMAN: 1.00
3719: LATIN: 0.20
Non-English count = 43


Conclusion: Despite being much larger than the MediaPipe model, Lingua appears to perform worse.

#### Trying FastText from Meta

In [40]:
import fasttext
from huggingface_hub import hf_hub_download

# Fetch the language-identification model file from the Hugging Face Hub; the value
# returned by hf_hub_download is then handed to fastText as the path to load.
model_path = hf_hub_download(repo_id="facebook/fasttext-language-identification", filename="model.bin")
fasttext_model = fasttext.load_model(model_path)

A quick example of how FastText works

In [41]:
# Ask for the three best labels (k=3) for a short phrase; the result pairs the labels
# with their scores.
fasttext_model.predict("Hello, world!", k=3)

(('__label__eng_Latn', '__label__vie_Latn', '__label__nld_Latn'),
 array([0.61224753, 0.21323682, 0.09696738]))

Check dataset for non-English detections

In [67]:
# Predict labels for all texts in one call, then list every text whose first label
# is not English (Latin script), together with that label's score.
fasttext_predictions = fasttext_model.predict(text_for_detection)
predicted_labels, predicted_scores = fasttext_predictions

non_eng_count = 0
for i, (labels, scores) in enumerate(zip(predicted_labels, predicted_scores)):
    first_label, first_score = labels[0], scores[0]
    if first_label != '__label__eng_Latn':
        non_eng_count += 1
        print(f"{i}: {first_label}: {first_score}")

print(f'Non-English count = {non_eng_count}')

6: __label__kor_Hang: 0.5559741258621216
19: __label__kor_Hang: 0.1623351275920868
21: __label__kor_Hang: 0.16395673155784607
24: __label__kor_Hang: 0.3345915973186493
26: __label__kor_Hang: 0.3345915973186493
27: __label__kor_Hang: 0.4519205689430237
29: __label__kor_Hang: 0.3345915973186493
30: __label__kor_Hang: 0.4797893762588501
31: __label__kor_Hang: 0.3345915973186493
32: __label__kor_Hang: 0.3345915973186493
36: __label__deu_Latn: 0.31281569600105286
55: __label__yue_Hant: 0.7973456382751465
57: __label__deu_Latn: 0.2974889278411865
63: __label__kor_Hang: 0.3605898320674896
66: __label__yue_Hant: 0.6135532259941101
68: __label__kor_Hang: 0.3314158320426941
102: __label__kor_Hang: 0.7089266180992126
133: __label__deu_Latn: 0.44995924830436707
143: __label__deu_Latn: 0.8012063503265381
144: __label__yue_Hant: 0.5971240997314453
151: __label__kor_Hang: 0.3811558485031128
158: __label__krc_Cyrl: 0.49816566705703735
159: __label__krc_Cyrl: 0.5081764459609985
160: __label__krc_Cyrl: 

Conclusion: FastText performs worst.