In [1]:
!pip install numpy scipy gensim googletrans fitz pypdf pandas pyldavis nltk spacy

Collecting googletrans
  Downloading googletrans-4.0.2-py3-none-any.whl.metadata (10 kB)
Collecting fitz
  Downloading fitz-0.0.1.dev2-py2.py3-none-any.whl.metadata (816 bytes)
Collecting scipy
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting configobj (from fitz)
  Downloading configobj-5.0.9-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting configparser (from fitz)
  Downloading configparser-7.2.0-py3-none-any.whl.metadata (5.5 kB)
Collecting nipype (from fitz)
  Downloading nipype-1.10.0-py3-none-any.whl.metadata (7.1 kB)
Collecting pyxnat (from fitz)
  Downloading pyxnat-1.6.3-py3-none-any.whl.metadata (5.4 kB)
Collecting prov>=1.5.2 (from nipype->fitz)
  Downloading prov-2.0.1-py3-none-any.whl.metadata (3.6 kB)
Collecting rdflib>=5.0.0 (from nipype->fitz)
  Downloading rdflib-7.1.4

In [2]:
!pip install --upgrade googletrans



In [3]:
import os
from pypdf import PdfReader
from gensim import corpora
from gensim.models import LdaModel, CoherenceModel
import random
import numpy as np
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from sklearn.model_selection import train_test_split
import pandas as pd

#setting random seeds for reproducibility
random.seed(42)
np.random.seed(42)

# Attempt to import Googletrans; if unavailable, skip translation
try:
    from googletrans import Translator
    translator = Translator()
    do_translate = True
    print("googletrans imported; will translate non-English texts.")
except ImportError:
    print("googletrans not installed; skipping translation step.")
    translator = None
    do_translate = False

# 1. Extract text from all PDFs under 'downloads'
root_dir = '/kaggle/input/strategies-new/downloads'

googletrans imported; will translate non-English texts.


In [4]:
documents = []

for dirpath, _, filenames in os.walk(root_dir):
    for fname in sorted(filenames): #sorting to ensure same order each time
        if fname.lower().endswith('.pdf'):
            full_path = os.path.join(dirpath, fname)
            # Extract country from directory structure
            rel_path = os.path.relpath(full_path, root_dir)
            parts = rel_path.split(os.sep)
            country = parts[0] if len(parts) > 1 else 'Unknown'
            #file_paths.append(full_path)
            print(country, fname)
            try:
                doc = PdfReader(full_path)
                print(len(doc.pages))
                text = ""
                for page in doc.pages:
                    try:
                        text += page.extract_text()
                    except:
                        print(fname + "failed to fully extract")
                documents.append({
                            'country': country,
                            'filename': fname,
                            'text': text
                        })
                doc.close()
            except:
                print("Could not read" + fname)

Korea Korea_National_Strategy_for_Artificial_Intelligence_2019.pdf
62
Switzerland DIGITAL_SWITZERLAND_STRATEGY_Strategie-DS-2020-EN.pdf
4
Kenya Kenya-Digital-Economy-2019.pdf
96
Kenya National AI Plan-Kenya_Emerging_Digital_Technologies.pdf
128
Hungary 2020-hungarian-AI-strategy.pdf
58
Hungary AI_ACTION_PLAN_e8dd79bd380a40c9890dd2fb01dd771b.pdf
58
Greece Gen_AI_Greece_EN_s.pdf
147
Greece Greece AI strategy.pdf
154
African Union 44004-doc-EN-_Continental_AI_Strategy_July_2024.pdf
66
T__rkiye 2023_INDUSTRY_AND_TECHNOLOGY_STRATEGY_sts-ktp.pdf
Could not read2023_INDUSTRY_AND_TECHNOLOGY_STRATEGY_sts-ktp.pdf
T__rkiye Turkey_National_Artificial_Intelligence_Strategy_2021-2025.pdf
50
Ireland FUTURE_JOBS_IRELAND_Future-Jobs-Ireland-2019.pdf
125
Ireland NATIONAL_AI_STRATEGY_National-AI-Strategy.pdf
74
Singapore NATIONAL_AI_STRATEGY_national-ai-strategy.pdf
45
Italy AI_STRATEGIC_PROGRAMME_1637937177-programma-strategico-iaweb-2.pdf
40
Luxembourg AI__A_STRATEGIC_VISION_FOR_LUXEMBOURG_AI_EN_0.pdf
2

In [5]:
df_documents = pd.DataFrame(documents)
df_documents.tail()

Unnamed: 0,country,filename,text
128,China,t0390_trustworthy_AI_EN.pdf,\nTranslation \n \nThe following white paper ...
129,Germany,220525_BMZ-Factsheet_EN_FAIR-Forward.pdf,\n \nFAIR Forward – Artificial Intelligence f...
130,Germany,National AI Strategy.pdf,www.ki-strategie-deutschland.de \nArtificial I...
131,Bulgaria,CONCEPT_FOR_THE_DEVELOPMENT_OF_AI_IN_BULGARIA_...,1 \nREPUBLIC OF BULGARIA \nMINISTRY OF TRANSPO...
132,Ukraine,Ukraine_National_Strategy_for_Development_of_A...,Зат...


In [6]:
df_documents = df_documents.applymap(lambda x: x.encode('unicode_escape').
                 decode('utf-8') if isinstance(x, str) else x)
df_documents.to_excel("Untranslated_docs1.xlsx", index=False,header=True)
try:
    df_documents.to_csv("Untranslated_docs1.csv", index=False,header=True)
except:
    pass

  df_documents = df_documents.applymap(lambda x: x.encode('unicode_escape').


In [7]:
#df_documents = pd.read_excel("/kaggle/input/untranslated-docs/Untranslated_docs1.xlsx")
#df_documents.head()

In [8]:
import textwrap
import asyncio

async def translate_long_text(text, dest="en", chunk_size=4000):
    """
    Splits `text` into chunks of at most `chunk_size` characters,
    translates each chunk, and returns the rejoined translated text.
    """
    # textwrap.wrap will split on whitespace boundaries
    chunks = textwrap.wrap(text, chunk_size)
    translated_chunks = []
    for chunk in chunks:
        # translate() both detects and translates
        async with Translator() as translator:
            translated = await translator.translate(chunk, dest=dest)
            if translated.src == "en":
                return text # return original text if english
            translated_chunks.append(translated.text)
    return " ".join(translated_chunks)

# 2. Translate to English if possible
translated_documents = []
for doc in documents:
    if do_translate:
        print(doc['filename'])
        translation = await translate_long_text(doc['text'])
        doc['text'] = translation
        print('done')
        #except:
            #print("Translation failed for " + doc['country'] + " , " + doc['filename'])
        # otherwise, leave entry['text'] as-is
        translated_documents.append(doc)

Korea_National_Strategy_for_Artificial_Intelligence_2019.pdf
done
DIGITAL_SWITZERLAND_STRATEGY_Strategie-DS-2020-EN.pdf
done
Kenya-Digital-Economy-2019.pdf
done
National AI Plan-Kenya_Emerging_Digital_Technologies.pdf
done
2020-hungarian-AI-strategy.pdf
done
AI_ACTION_PLAN_e8dd79bd380a40c9890dd2fb01dd771b.pdf
done
Gen_AI_Greece_EN_s.pdf
done
Greece AI strategy.pdf
done
44004-doc-EN-_Continental_AI_Strategy_July_2024.pdf
done
Turkey_National_Artificial_Intelligence_Strategy_2021-2025.pdf
done
FUTURE_JOBS_IRELAND_Future-Jobs-Ireland-2019.pdf
done
NATIONAL_AI_STRATEGY_National-AI-Strategy.pdf
done
NATIONAL_AI_STRATEGY_national-ai-strategy.pdf
done
AI_STRATEGIC_PROGRAMME_1637937177-programma-strategico-iaweb-2.pdf
done
AI__A_STRATEGIC_VISION_FOR_LUXEMBOURG_AI_EN_0.pdf
done
BIG_DATA_ANALYSIS_uppdrag-att-kartlagga-anvandningen-av-artificiell-intelligens-respektive-analys-av-stora-datamangder-i-sverige.pdf
done
DIGITAL_EXCELLENCE_uppdrag-att-samverka-kring-kompetensforsorjningen-av-digital-sp

In [9]:
df_translated_documents = pd.DataFrame(translated_documents)
df_translated_documents = df_translated_documents.applymap(lambda x: x.encode('unicode_escape').
                 decode('utf-8') if isinstance(x, str) else x)
df_translated_documents.to_excel("translated_docs3.xlsx", index=False,header=True)
try:
    df_translated_documents.to_csv("translated_docs3.csv", index=False,header=True)
except:
    pass
df_translated_documents

  df_translated_documents = df_translated_documents.applymap(lambda x: x.encode('unicode_escape').


Unnamed: 0,country,filename,text
0,Korea,Korea_National_Strategy_for_Artificial_Intelli...,GPRN\n11-1721000-000393-01\nNational Strategy ...
1,Switzerland,DIGITAL_SWITZERLAND_STRATEGY_Strategie-DS-2020...,Digital \nSwitzerland \nStrategy \n2023 \u2...
2,Kenya,Kenya-Digital-Economy-2019.pdf,DIGITAL ECONOMY \nBLUEPRINT\nPOWERING KENYA\u2...
3,Kenya,National AI Plan-Kenya_Emerging_Digital_Techno...,Acronyms.........................................
4,Hungary,2020-hungarian-AI-strategy.pdf,1\nHungary\u2019s Artificial \nIntelligence St...
...,...,...,...
128,China,t0390_trustworthy_AI_EN.pdf,\nTranslation \n \nThe following white paper ...
129,Germany,220525_BMZ-Factsheet_EN_FAIR-Forward.pdf,\n \nFAIR Forward \u2013 Artificial Intellige...
130,Germany,National AI Strategy.pdf,www.ki-strategie-deutschland.de \nArtificial I...
131,Bulgaria,CONCEPT_FOR_THE_DEVELOPMENT_OF_AI_IN_BULGARIA_...,1 \nREPUBLIC OF BULGARIA \nMINISTRY OF TRANSPO...


In [10]:
italian = df_translated_documents[df_translated_documents["country"] == 'Italy']
italian

Unnamed: 0,country,filename,text
13,Italy,AI_STRATEGIC_PROGRAMME_1637937177-programma-st...,Strategic program of artificial intelligence 2...


In [11]:
!pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.4.0-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.8/235.8 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.4.0


In [12]:
#preparing for LDA

import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
from unidecode import unidecode
from nltk.corpus import words

english_vocab = set(words.words())

def remove_unicode(text: str) -> str:
    """
    Transliterate *all* Unicode to ASCII (e.g. ń → n, 𝐢 → i, ü → u),
    then remove any leftover non‐printable characters and collapse whitespace.
    """
    # 1. Unidecode: full transliteration
    text_ascii = unidecode(text)
    # 2. Remove any remaining non-ASCII (should be rare now)
    text_ascii = text_ascii.encode("ascii", "ignore").decode("ascii")
    # 3. Strip control/non-printable characters
    text_ascii = re.sub(r"[^\x20-\x7E]", " ", text_ascii)
    # 4. Collapse whitespace
    text_ascii = re.sub(r"\s+", " ", text_ascii).strip()
    return text_ascii

def remove_country_name(text: str, country: str) -> str:
    """
    Remove occurrences of the country name (word boundary) from the text, case-insensitive.
    """
    pattern = r'\b{}\b'.format(re.escape(country))
    cleaned = re.sub(pattern, '', text, flags=re.IGNORECASE)
    # Collapse whitespace
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    return cleaned

def fully_clean(text: str) -> str:
    # 1. Decode any literal Python/JSON-style escapes (\n, \u2019, etc.)
    try:
        text = text.encode('utf-8').decode('unicode_escape')
    except Exception:
        # if it fails (e.g. because it's already real newlines), just carry on
        pass

    # 2. Now replace real newlines/tabs with spaces
    text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')

    # 3. Transliterate everything to ASCII (ń→n, 𝐢→i, ’→')
    text = unidecode(text)

    # 4. Drop any remaining non-printable or non-ASCII (just in case)
    text = re.sub(r'[^\x20-\x7E]', ' ', text)

    # 5. Collapse multiple spaces into one
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def clean_text(text):
    #text = text.replace('\r\n', '')
    text = text.replace('\n', '')  # Newline
    text = text.replace('\r', '')  # Carriage return
    text = text.replace('\t', '')  # Tab
    text = text.replace('\b', '')  # Backspace
    text = text.replace('\f', '')  # Form feed
    text = text.replace('\a', '')  # Alert sound
    text = text.replace('\\', '')  # Literal backslash
    #text = text.replace('\\n', '')  # Newline
    #text = text.replace('\\r', '')  # Carriage return
    #text = text.replace('\\t', '')  # Tab
    #text = text.replace('\\b', '')  # Backspace
    #text = text.replace('\\f', '')  # Form feed
    #text = text.replace('\\a', '')  # Alert sound
    #text = text.replace('\\\\', '')  # Literal backslash
    #text = re.sub(r'\S+@\S+', '', text)
    #text = re.sub(r'http\S+', '', text)
    #text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text.strip().lower()

stop_words = set(stopwords.words('english'))
def preprocess(text):
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words and len(word) > 2]
    return tokens

nlp = spacy.load('en_core_web_sm')
def lemmatize(tokens):
    doc = nlp(' '.join(tokens))
    return [token.lemma_ for token in doc if token.pos_ in ['NOUN', 'ADJ', 'VERB']]

from gensim.utils import simple_preprocess
import nltk
from nltk.corpus import stopwords
import spacy

# 1. Ensure stopwords are available
nltk.download('stopwords')

# 2. Load spaCy model
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# 3. Prepare stopword set
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
#df_translated_documents['text'] = clean_text(df_translated_documents['text'])
#df_translated_documents['text'] = preprocess(df_translated_documents['text'])
#df_translated_documents['text'] = lemmatize(df_translated_documents['text'])
def preprocess_text(text):
    # Tokenize, lowercase, remove punctuation
    text = clean_text(text)
    tokens = simple_preprocess(text, deacc=True)
    # Remove stopwords
    tokens_nostop = [tok for tok in tokens if tok not in stop_words and len(tok) > 3]
    # Lemmatize with spaCy
    doc = nlp(" ".join(tokens_nostop))
    
    lemmas = [
        token.lemma_ for token in doc
        if token.lemma_.isalpha() and token.lemma_ not in stop_words
    ]
    return lemmas

df_translated_documents['tokens'] = df_translated_documents['text'].apply(fully_clean)
df_translated_documents['tokens'] = df_translated_documents.apply(lambda row: remove_country_name(row['tokens'], row['country']), axis=1)
df_translated_documents['tokens'] = df_translated_documents['tokens'].apply(preprocess_text)
df_translated_documents['tokens'] = df_translated_documents['tokens'].apply(lambda toks: [t for t in toks if t in english_vocab]) # remove remaining non-english / random characters
df_translated_documents.head()



Unnamed: 0,country,filename,text,tokens
0,Korea,Korea_National_Strategy_for_Artificial_Intelli...,GPRN\n11-1721000-000393-01\nNational Strategy ...,"[national, strategy, artificial, intelligence,..."
1,Switzerland,DIGITAL_SWITZERLAND_STRATEGY_Strategie-DS-2020...,Digital \nSwitzerland \nStrategy \n2023 \u2...,"[digital, strategy, purpose, digital, strategy..."
2,Kenya,Kenya-Digital-Economy-2019.pdf,DIGITAL ECONOMY \nBLUEPRINT\nPOWERING KENYA\u2...,"[digital, economy, blueprint, power, transform..."
3,Kenya,National AI Plan-Kenya_Emerging_Digital_Techno...,Acronyms.........................................,"[acronym, distribute, technology, artificial, ..."
4,Hungary,2020-hungarian-AI-strategy.pdf,1\nHungary\u2019s Artificial \nIntelligence St...,"[artificial, intelligence, strategy, table, co..."


In [14]:
df_translated_documents.to_excel('final_docs_english.xlsx',index=False)
try:
    df_translated_documents.to_csv('final_docs_english.csv',index=False)
except:
    pass

In [15]:
import gensim
# 4. Build Gensim dictionary and corpus
dictionary = gensim.corpora.Dictionary(df_translated_documents['tokens'])
dictionary.filter_extremes(no_below=2, no_above=0.5)
corpus = [dictionary.doc2bow(tokens) for tokens in df_translated_documents['tokens']]

# 5. Split corpus for held-out perplexity
train_corpus, heldout_corpus = train_test_split(corpus, test_size=0.05, random_state=42)

# 6. Grid search over LdaModel hyperparameters with coherence & perplexity
results = []
best_coherence = -np.inf
best_model = None
best_params = {}
alpha='auto'

for num_topics in [5, 10, 15, 20, 25]:
    for passes in [10, 20, 30, 35, 40]:
        for iterations in [25,50,100]:
            print("Training model for ",num_topics," topics")
            model = LdaModel(
                corpus=train_corpus,
                id2word=dictionary,
                num_topics=num_topics,
                alpha='auto',
                eta='auto',
                passes=passes,
                iterations=iterations,
                random_state=42
            )
            coherence = CoherenceModel(
                model=model,
                texts=df_translated_documents['tokens'],
                dictionary=dictionary,
                coherence='c_v'
            ).get_coherence()
            perplexity = model.log_perplexity(heldout_corpus)
            results.append({
                'num_topics': num_topics,
                'passes' : passes,
                'iterations' : iterations,
                'alpha': model.alpha,
                'eta' : model.eta,
                'coherence': coherence,
                'perplexity': perplexity
            })
            print(results[-1])
            for i in range(model.num_topics):
                terms = [term for term, weight in model.show_topic(i, topn=20)]
                print(f"Topic {i+1}: {', '.join(terms)}")
            if coherence > best_coherence:
                best_coherence = coherence
                best_model = model
                best_params = {
                    'num_topics': num_topics,
                    'passes' : passes,
                    'iterations' : iterations,
                    'alpha': model.alpha,
                    'eta': model.eta,
                    'coherence': coherence,
                    'perplexity': perplexity
                }


Training model for  5  topics
{'num_topics': 5, 'passes': 10, 'iterations': 25, 'alpha': array([0.2594895 , 0.25796434, 0.17583571, 0.1207733 , 0.23211482],
      dtype=float32), 'eta': array([0.25123248, 0.7500466 , 0.33997312, ..., 0.168633  , 0.1938725 ,
       0.16982293], dtype=float32), 'coherence': 0.3216970510977313, 'perplexity': -10.880162379974557}
Topic 1: look, white, trustworthy, indicator, competence, pillar, domain, digitalization, scheme, ambition, twin, climate, board, customer, phase, directive, mobile, reuse, meld, specialist
Topic 2: trustworthy, figure, mission, core, analytic, leverage, section, bias, generative, revolution, domain, incentive, foster, teacher, outcome, imperative, intervention, china, firm, release
Topic 3: font, dutch, align, vertical, style, inherit, climate, deliver, labour, transportation, law, ambition, food, land, cent, affair, worker, road, white, house
Topic 4: recovery, resilience, reform, gender, woman, labor, productive, digitalization

In [16]:
# 7. Display grid search results
#import ace_tools as tools
df_results = pd.DataFrame(results)
#tools.display_dataframe_to_user(name="LDA Grid Search Results", dataframe=df_results)

# 8. Show best model parameters and top topic terms
print("Best Model Parameters:", best_params)
for i in range(best_model.num_topics):
    terms = [term for term, weight in best_model.show_topic(i, topn=20)]
    print(f"Topic {i+1}: {', '.join(terms)}")

Best Model Parameters: {'num_topics': 25, 'passes': 35, 'iterations': 25, 'alpha': array([0.07683099, 0.08592391, 0.06523217, 0.0256325 , 0.04719341,
       0.09355737, 0.05173255, 0.07122044, 0.06658748, 0.14517432,
       0.03323941, 0.0318361 , 0.0695321 , 0.03847483, 0.03414286,
       0.01390761, 0.02049089, 0.01687363, 0.0277596 , 0.06054216,
       0.03697119, 0.06974328, 0.01527013, 0.11631148, 0.0163552 ],
      dtype=float32), 'eta': array([0.04046866, 0.04673168, 0.04380894, ..., 0.03726417, 0.03738008,
       0.03726422], dtype=float32), 'coherence': 0.4140096073723097, 'perplexity': -18.729023780435114}
Topic 1: ambition, board, chair, mila, phase, covid, child, climate, vector, deploy, director, career, alliance, deliver, engagement, worker, credit, professor, confidence, asset
Topic 2: imperative, section, mission, foster, leverage, incentive, defence, procurement, cultural, figure, climate, metric, teacher, literacy, inclusive, bias, intervention, expenditure, mitigate,

In [17]:
df_results.to_excel('results.xlsx',index=False)
try:
    df_results.to_csv('results.csv',index=False)
except:
    pass

In [18]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# 1. Compute full document-topic distributions
doc_topic_probs = []
for bow in corpus:
    # Get probability for every topic in the model
    topic_dist = dict(best_model.get_document_topics(bow, minimum_probability=0))
    doc_topic_probs.append(topic_dist)

# 2. Convert to DataFrame
topic_df = pd.DataFrame(doc_topic_probs).fillna(0)
# Rename columns to topic_0, topic_1, ...
topic_df.columns = [f"topic_{i}" for i in topic_df.columns]

# 3. Combine with metadata
output_df = pd.concat(
    [df_documents[['country', 'filename']].reset_index(drop=True), topic_df],
    axis=1
)

output_df.to_excel("topic_dist_per_doc.xlsx", index=False)
output_df.to_csv("topic_dist_per_doc.csv", index=False)

In [19]:
vis_data = gensimvis.prepare(best_model, corpus, dictionary)
pyLDAvis.enable_notebook()

vis_data