# Language detection of text
- Detect the language of a text with the help of the langdetect library
- Our aim is to distinguish between English and German texts
- Test the methods in that notebook before integrating them into the main code
- URL of Package: [https://pypi.org/project/langdetect/]

## Install and import the necessary packages

In [1]:
!pip install langdetect

You should consider upgrading via the '/Users/leoncena/Python/venvs/ps-research-map-venv-new/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
# new package
import langdetect
# usual packages
import os
import json
import pandas as pd
import deepl



## Load data for testing

In [33]:
publication = pd.read_csv('wi_df_final_clean_keyword.csv',encoding='utf8')

# filter rows with abstracts and keywords
publication_filtered  = publication[publication['cfAbstr'].notna() & publication['keywords'].notna()]
# id column should be string
publication_filtered['id'] = publication_filtered['id'].astype(str)

# quick eda
print(f'Number of publications: {publication_filtered.shape[0]}')
print(f'Number of features in df: {publication_filtered.shape[1]}')


Number of publications: 1578
Number of features in df: 16


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  publication_filtered['id'] = publication_filtered['id'].astype(str)


In [36]:
publication_filtered.head(30)
# german publication IDs
german_ids =['12238508',
             '12238593',
             '12238675',
             '12240680'] # here keywords englisch but abstract german

english_ids = ['10101316',
               '12237848']

# sample ids
sample_ids = german_ids + english_ids
sample_ids


# query sample_ids for column 'id'
sample_df = publication_filtered[publication_filtered['id'].isin(sample_ids)]

id                int64
cfTitle          object
cfUri            object
keywords         object
doi              object
srcAuthors       object
authors          object
cfAbstr          object
publYear          int64
eid              object
data_source      object
log              object
result_flag      object
error            object
error_doi        object
keyword_clean    object
dtype: object

## Detection
### Setup

In [89]:
# set seed to get reproducible results
langdetect.DetectorFactory.seed = 3141

# detect language of first item in sample_df
# print col names of debug msg
msg = f'ID + lang\t\tLang Abstract\tLand kws'
print(msg)
for index,row in sample_df.iterrows():
    # id of row
    id = row['id']
    # if id is in german_ids orange font in msg
    if id in german_ids:
        msg = f'\033[1;33m{id} (de) \033[0m \t'
    else:
        msg = f'{id} (en) \t'

    # detect lang of abstract and print probability at the end in brackets
    lang_abstract = langdetect.detect(row['cfAbstr'])
    prob_abstract = langdetect.detect_langs(row['cfAbstr'])[0].prob
    # round 4 digits after comma
    prob_abstract = round(prob_abstract,6)



    # if lang of abstract is not english print red
    if lang_abstract != 'en':
        msg += f'\033[91mAbstract: {lang_abstract}\t'
    else:
        msg += f'Abstract: {lang_abstract}\t'
    msg += f'({prob_abstract})\t'
    lang_keywords = langdetect.detect(row['keywords'])
    prob_keywords = langdetect.detect_langs(row['keywords'])[0].prob
    prob_keywords = round(prob_keywords,6)

    if lang_keywords != 'en':
        msg += f'\033[91mKeywords: {lang_keywords}\033[0m'
    else:
        msg += f'Keywords: {lang_keywords}'
    msg += f' ({prob_keywords})\t'
    print(msg)

ID + lang		Lang Abstract	Land kws
10101316 (en) 	Abstract: en	(0.999996)	Keywords: en (0.999997)	
12237848 (en) 	Abstract: en	(0.999997)	Keywords: en (0.857139)	
[1;33m12238508 (de) [0m 	[91mAbstract: de	(0.999997)	[91mKeywords: de[0m (0.85714)	
[1;33m12238593 (de) [0m 	[91mAbstract: de	(0.999996)	[91mKeywords: de[0m (0.999997)	
[1;33m12238675 (de) [0m 	[91mAbstract: de	(0.999996)	[91mKeywords: it[0m (0.562428)	
[1;33m12240680 (de) [0m 	Abstract: en	(0.999997)	Keywords: en (0.999994)	


In [None]:
# get row  with 12240680
row = sample_df[sample_df['id'] == '12240680']
# detect langs of title
from langdetect import detect_langs
# detect langs from title
title_text = row['cfAbstr'].values[0]
detect_langs(title_text)
detect_langs('')
