# Language Filtering Script

In [None]:
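# imports (NOTE(review): `detect`, called by safe_detect below, is not imported in this cell — presumably `from langdetect import detect`; confirm and add it here)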

import pandas as pd

In [12]:
# Load the grants table from its CSV file into a DataFrame.
grants_csv_path = 'data/grants.csv'
grants = pd.read_csv(grants_csv_path)

In [13]:
# Flag each abstract that is a real Python str, then keep the rows that are not
# (these are the entries that need cleaning before language detection).
abstract_is_text = grants['Abstract/Summary'].map(lambda value: isinstance(value, str))
non_strings = grants.loc[~abstract_is_text]

print("Non-string rows in 'Abstract/Summary':", non_strings, sep="\n")


Non-string rows in 'Abstract/Summary':
                                          Project title  \
199   Causal risk factors for sexual offending: Evid...   
1623                            Summer Program in Aging   
2337  Population health interventions in chronic dis...   
2618  Connectivity of older adults in rural communit...   
2724                                     not applicable   
2771  Redefining the phenotype in complex diseases t...   
3519  Neonatal pain-related stress, brain microstruc...   
4403  The CONNECT Program: Engaging Community Organi...   
5394  Summer Program in Aging 2023 Application: Long...   
5944  Models for Primary Medical Care, Nursing Care ...   
6272  Infrastructure development and capacity buildi...   

     Principal investigator(s)   Co-investigator(s)  \
199    ['Babchishin, Kelly M']              ['N/A']   
1623    ['Correia, Rebecca H']              ['N/A']   
2337      ['Hammond, David G']              ['N/A']   
2618      ['Keating, Norah C']  

In [None]:
# Replace missing abstracts with empty strings, then coerce every entry to str
# so the language detector always receives text.
abstracts_filled = grants['Abstract/Summary'].fillna('')
grants['Abstract/Summary'] = abstracts_filled.astype(str)

In [15]:
def safe_detect(text):
    """Detect the language of ``text``, returning 'unknown' when detection fails.

    Args:
        text: The value to classify; expected to be a string.

    Returns:
        Whatever ``detect(text)`` returns on success, or the string
        'unknown' when ``text`` is not a non-blank string or ``detect`` raises.

    Raises:
        NameError: If ``detect`` has not been imported into the notebook.
    """
    # Non-string or blank input cannot be classified; skip the detector call.
    if not isinstance(text, str) or not text.strip():
        return 'unknown'

    # Resolve the detector outside the try block so that a missing import
    # raises NameError instead of silently labelling every row 'unknown'.
    detector = detect

    try:
        return detector(text)
    except Exception:
        # Narrower than a bare except: KeyboardInterrupt / SystemExit propagate.
        return 'unknown'  # Mark rows where detection fails

# Tag every grant with the language code detected from its abstract.
# NOTE(review): if `detect` comes from langdetect, results can vary between
# runs unless DetectorFactory.seed is set — confirm and seed if so.
detected_langs = grants['Abstract/Summary'].map(safe_detect)
grants['lang'] = detected_langs

# Tally how many grants fall under each detected language code.
language_counts = grants['lang'].value_counts()

print("Language Counts:", language_counts, sep="\n")

Language Counts:
lang
en         6247
fr          355
unknown      11
hr            6
tl            1
Name: count, dtype: int64


In [None]:
# Pull out the rows tagged with one of the unexpected language codes.
flagged_codes = ['unknown', 'hr', 'tl']
is_flagged = grants['lang'].isin(flagged_codes)
unusual_langs = grants.loc[is_flagged]

# Show each flagged abstract next to the code it was given.
print(unusual_langs.loc[:, ['Abstract/Summary', 'lang']])

     Abstract/Summary     lang
199                    unknown
1411       sans objet       hr
1623                   unknown
1654       sans objet       hr
2337                   unknown
2386       sans objet       hr
2618                   unknown
2724                   unknown
2752              N/a       tl
2771                   unknown
2779       sans objet       hr
2936       sans objet       hr
3519                   unknown
4403                   unknown
5394                   unknown
5944                   unknown
6238       sans objet       hr
6272                   unknown


In [17]:
# Keep only the grants whose abstract was detected as English.
is_english = grants['lang'].eq('en')
english_grants = grants.loc[is_english]

# Write the English-only subset to disk, omitting the DataFrame index.
english_grants.to_csv('data/english_grants.csv', index=False)

print("Filtered dataset saved as 'english_grants.csv'.")

Filtered dataset saved as 'english_grants.csv'.
