In [2]:
import pandas as pd
import jieba
import re
from collections import Counter

df = pd.read_csv('/content/projects_with_region.csv', usecols=['title', 'objective'], encoding='utf-8')
texts = df['title'].fillna('') + ' ' + df['objective'].fillna('')
all_tokens = []
for text in texts:
    cleaned = re.sub(r'\d+', '', text)
    cleaned = re.sub(r'[^\u4e00-\u9fa5\w\s]', '', cleaned)
    segments = jieba.lcut(cleaned)
    all_tokens.extend([token for token in segments if len(token) > 1])
freq = Counter(all_tokens)
top200 = [word for word, _ in freq.most_common(200)]
with open('stopwords.txt', 'w', encoding='utf-8') as fout:
    for word in top200:
        fout.write(word + '\n')
print(f"Generated field-specific stopword list 'stopwords.txt' with {len(top200)} entries.")
print("Sample of the first 20 stopwords:", top200[:20])


Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
DEBUG:jieba:Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.622 seconds.
DEBUG:jieba:Loading model cost 0.622 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


Generated field-specific stopword list 'stopwords.txt' with 200 entries.
Sample of the first 20 stopwords: ['and', 'the', 'of', 'to', 'in', 'will', 'for', 'is', 'with', 'on', 'The', 'by', 'that', 'as', 'be', 'are', 'project', 'from', 'this', 'an']


In [4]:
import pandas as pd
import jieba
import re
from collections import Counter
from concurrent.futures import ProcessPoolExecutor

INPUT_CSV = '/content/projects_with_region.csv'
STOPWORDS_TXT = '/content/stopwords.txt'
OUTPUT_CSV = '/content/projects_with_region_preprocessed.csv'
N_WORKERS = 4
MIN_FREQ = 5

print("1/5 ▶ Loading CSV …")
df = pd.read_csv(
    INPUT_CSV,
    usecols=['title', 'objective'],
    encoding='utf-8',
    engine='python',
    on_bad_lines='skip'
)
texts = (df['title'].fillna('') + ' ' + df['objective'].fillna('')).tolist()

print("2/5 ▶ Loading stopwords & compiling regex …")
re_num = re.compile(r'\d+')
re_punc = re.compile(r'[^\u4e00-\u9fa5\w\s]')
with open(STOPWORDS_TXT, 'r', encoding='utf-8') as f:
    stopwords = set(line.strip() for line in f if line.strip())

def init_worker():
    import jieba

def preprocess(text):
    t = re_num.sub('', text)
    t = re_punc.sub('', t)
    tokens = [w for w in jieba.lcut(t) if len(w) > 1 and w not in stopwords]
    return tokens

print(f"3/5 ▶ Tokenizing in parallel ({N_WORKERS} workers) …")
if __name__ == '__main__':
    with ProcessPoolExecutor(max_workers=N_WORKERS, initializer=init_worker) as executor:
        tokens_list = list(executor.map(preprocess, texts))

    df['tokens'] = tokens_list

    print("4/5 ▶ Filtering low-frequency tokens …")
    all_tokens = [w for seq in tokens_list for w in seq]
    freq = Counter(all_tokens)
    df['tokens'] = df['tokens'].apply(lambda seq: [w for w in seq if freq[w] >= MIN_FREQ])

    print("5/5 ▶ Saving results …")
    df.to_csv(OUTPUT_CSV, index=False, encoding='utf-8')
    print(f"Done! Preprocessed data saved to: {OUTPUT_CSV}")


1/5 ▶ Loading CSV …
2/5 ▶ Loading stopwords & compiling regex …
3/5 ▶ Tokenizing in parallel (4 workers) …
4/5 ▶ Filtering low-frequency tokens …
5/5 ▶ Saving results …
Done! Preprocessed data saved to: /content/projects_with_region_preprocessed.csv


In [5]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

df = pd.read_csv(
    '/content/projects_with_region_preprocessed.csv',
    converters={'tokens': eval},
    encoding='utf-8'
)

docs = df['tokens'].apply(lambda lst: ' '.join(lst)).tolist()

vectorizer = CountVectorizer(
    max_df=0.9,
    min_df=5,
    max_features=5000
)
X = vectorizer.fit_transform(docs)
print(f"Document-term matrix: {X.shape[0]} documents, {X.shape[1]} terms.")

n_topics = 10
lda = LatentDirichletAllocation(
    n_components=n_topics,
    max_iter=20,
    learning_method='batch',
    random_state=42
)
lda.fit(X)

terms = vectorizer.get_feature_names_out()
topn = 10

for idx, comp in enumerate(lda.components_):
    top_terms = [terms[i] for i in comp.argsort()[-topn:]][::-1]
    print(f"Topic {idx+1}:", " | ".join(top_terms))

doc_topic = lda.transform(X)
topic_cols = [f"topic_{i+1}" for i in range(n_topics)]
topic_df = pd.DataFrame(doc_topic, columns=topic_cols)

out = pd.concat([df, topic_df], axis=1)
out.to_csv('/content/projects_with_region_with_topics.csv', index=False, encoding='utf-8')
print("Saved preprocessed data with topic distributions to '/content/projects_with_region_with_topics.csv'")


Document-term matrix: 15221 documents, 5000 terms.
Topic 1: cultural | history | heritage | historical | modern | early | th | were | what | practices
Topic 2: optical | devices | light | chemical | chemistry | synthesis | material | molecules | magnetic | organic
Topic 3: diseases | drug | therapy | therapeutic | patient | immune | tissue | medical | risk | care
Topic 4: theory | computational | algorithms | machine | neural | problems | processing | networks | artificial | cognitive
Topic 5: physics | earth | dynamics | water | evolution | space | ocean | formation | modelling | experiments
Topic 6: storage | hydrogen | efficiency | performance | battery | heat | safety | cost | solution | integration
Topic 7: species | brain | genetic | cellular | proteins | biology | protein | function | functional | changes
Topic 8: stakeholders | services | ecosystem | researchers | consortium | projects | countries | research | international | implementation
Topic 9: food | waste | products | va

In [6]:
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

PREPRO_CSV = '/content/projects_with_region_preprocessed.csv'
TOPICS_CSV = '/content/projects_with_region_with_topics.csv'
N_TOPICS = 10

df_tokens = pd.read_csv(PREPRO_CSV, converters={'tokens': eval}, encoding='utf-8')
docs = df_tokens['tokens'].apply(lambda lst: ' '.join(lst)).tolist()

vectorizer = CountVectorizer(max_df=0.9, min_df=10, max_features=5000)
X = vectorizer.fit_transform(docs)

lda = LatentDirichletAllocation(n_components=N_TOPICS, max_iter=20, learning_method='batch', random_state=42)
lda.fit(X)

print("LDA perplexity:", lda.perplexity(X))

doc_topic = lda.transform(X)
df_tokens['dominant_topic'] = doc_topic.argmax(axis=1) + 1

df_tokens.to_csv(TOPICS_CSV, index=False, encoding='utf-8')
print(f"Saved to: {TOPICS_CSV}")

for t in range(1, N_TOPICS + 1):
    subset = df_tokens[df_tokens['dominant_topic'] == t]
    sample = subset.sample(5, random_state=42) if len(subset) >= 5 else subset
    print(f"\n--- Topic {t} sample documents (total {len(subset)}) ---")
    for _, row in sample.iterrows():
        title = row.get('title', '')
        print(f"- {title[:50]}...")

topic_counts = df_tokens['dominant_topic'].value_counts().sort_index()
print("\nDocument counts by topic:")
print(topic_counts)


LDA perplexity: 2238.6578303807937
Saved to: /content/projects_with_region_with_topics.csv

--- Topic 1 sample documents (total 2094) ---
- Antivirus Pandemic Preparedness EuropeAn pLatform...
- Bacteria Intrinsically Orchestrated with Designed ...
- The hydrocup: a hollow electrospun scaffold for in...
- Mechanisms Linking Early-Life Stress and Resilienc...
- Unravelling the mechanisms for recruitment and act...

--- Topic 2 sample documents (total 685) ---
- New Users for a Better ICOS...
- Integration and Digital Demonstration of Low-emiss...
- Boosting the exposome space coverage in the aquati...
- Aerosols, Convection, Clouds, and Climate Sensitiv...
- enhanCed glObaL quantificatiOn and underStanding o...

--- Topic 3 sample documents (total 1602) ---
- SYstemic Mobilisation for Joint Biodiversity and I...
- Advanced multimodal marketplace for low emission a...
- SpAce-AIr-Ground Last Mile InfRastructure & Dynami...
- Artificial Intelligence for the European Open Scie...
- Artific

In [7]:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.decomposition import NMF

INPUT_PREPROCESSED = '/content/projects_with_region_preprocessed.csv'
STOPWORDS_TXT = '/content/stopwords.txt'
OUTPUT_TOPICS_CSV = '/content/nmf_document_topics.csv'

N_TOPICS = 10
MAX_DF = 0.9
MIN_DF = 10
MAX_FEATURES = 5000

df = pd.read_csv(
    INPUT_PREPROCESSED,
    converters={'tokens': eval},
    encoding='utf-8'
)
docs = df['tokens'].apply(lambda tokens: ' '.join(tokens)).tolist()

with open(STOPWORDS_TXT, 'r', encoding='utf-8') as f:
    file_stopwords = [line.strip() for line in f if line.strip()]

stopwords = list(set(file_stopwords) | set(ENGLISH_STOP_WORDS))

vectorizer = TfidfVectorizer(
    max_df=MAX_DF,
    min_df=MIN_DF,
    max_features=MAX_FEATURES,
    stop_words=stopwords
)
X = vectorizer.fit_transform(docs)
print(f"Matrix size: {X.shape[0]} documents × {X.shape[1]} features")

nmf = NMF(
    n_components=N_TOPICS,
    init='nndsvda',
    random_state=42,
    max_iter=200
)
W = nmf.fit_transform(X)
H = nmf.components_

feature_names = vectorizer.get_feature_names_out()
for idx, topic_vector in enumerate(H, start=1):
    top_indices = topic_vector.argsort()[-10:][::-1]
    top_terms = [feature_names[i] for i in top_indices]
    print(f"Topic {idx:>2}:", " | ".join(top_terms))

topic_columns = [f"topic_{i}" for i in range(1, N_TOPICS + 1)]
df_topics = pd.DataFrame(W, columns=topic_columns)
df_out = pd.concat([df, df_topics], axis=1)
df_out.to_csv(OUTPUT_TOPICS_CSV, index=False, encoding='utf-8')
print(f"Document-topic distribution saved to: {OUTPUT_TOPICS_CSV}")




Matrix size: 15221 documents × 5000 features
Topic  1: researchers | ri | excellence | international | skills | programme | university | countries | national | stakeholders
Topic  2: immune | drug | diseases | therapy | gene | therapeutic | protein | tumor | cellular | proteins
Topic  3: hydrogen | storage | battery | waste | heat | renewable | emissions | green | circular | manufacturing
Topic  4: physics | optical | theory | light | magnetic | devices | dynamics | matter | theoretical | experimental
Topic  5: biodiversity | marine | soil | species | ocean | ecosystems | ecosystem | carbon | conservation | ecological
Topic  6: services | security | software | computing | monitoring | intelligence | smart | infrastructure | networks | edge
Topic  7: political | cultural | history | heritage | women | historical | gender | practices | language | urban
Topic  8: food | waste | products | chain | plant | protein | value | supply | healthy | safety
Topic  9: brain | neural | neurons | neur

In [10]:

import pandas as pd

INPUT = '/content/nmf_document_topics.csv'
OUTPUT = '/content/projects_with_region_final_nmf_classification.csv'

TOPIC_LABELS_NMF = {
    1: "Security & Computing Networks",
    2: "Medical Therapy & Disease",
    3: "Renewable Energy & Circular Manufacturing",
    4: "Optical & Theoretical Physics",
    5: "Marine Ecology & Conservation",
    6: "International Research Programs",
    7: "Cultural & Political History",
    8: "Food Supply & Safety",
    9: "Neuroscience & Brain Imaging",
    10: "Water Quality & Urban Pollution"
}

def main():
    df = pd.read_csv(INPUT, encoding='utf-8')
    topic_cols = [c for c in df.columns if c.startswith('topic_')]
    if not topic_cols:
        raise ValueError(f"No topic_ columns found in {INPUT}, columns: {df.columns.tolist()}")
    df['dominant_topic'] = df[topic_cols].idxmax(axis=1).str.replace('topic_', '').astype(int)
    df['classification'] = df['dominant_topic'].map(TOPIC_LABELS_NMF)
    df.to_csv(OUTPUT, index=False, encoding='utf-8')
    print(f"Saved to: {OUTPUT}")

if __name__ == "__main__":
    main()


Saved to: /content/projects_with_region_final_nmf_classification.csv


In [12]:

import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from scipy.stats import uniform, randint

def main():
    df = pd.read_csv('/content/projects_with_region_preprocessed.csv', converters={'tokens': eval}, encoding='utf-8')
    nmf = pd.read_csv('/content/projects_with_region_final_nmf_classification.csv', encoding='utf-8')
    df['text'] = df['tokens'].apply(lambda lst: ' '.join(lst))
    df['label'] = nmf['classification']

    X_train, X_test, y_train, y_test = train_test_split(
        df['text'], df['label'],
        test_size=0.2, random_state=42, stratify=df['label']
    )

    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', LogisticRegression(solver='liblinear', class_weight='balanced', max_iter=500))
    ])

    param_dist = {
        'tfidf__ngram_range': [(1, 1), (1, 2)],
        'tfidf__min_df': randint(1, 10),
        'tfidf__max_df': uniform(0.6, 0.4),
        'clf__C': uniform(0.1, 5.0)
    }

    rand_search = RandomizedSearchCV(
        pipeline,
        param_distributions=param_dist,
        n_iter=15,
        cv=2,
        n_jobs=1,
        scoring='f1_macro',
        random_state=42,
        verbose=1
    )
    rand_search.fit(X_train, y_train)
    print(rand_search.best_params_)

    y_pred = rand_search.predict(X_test)
    print(classification_report(y_test, y_pred, digits=4))

    df['final_class'] = rand_search.predict(df['text'])
    df[['text', 'final_class']].to_csv(
        '/content/projects_with_region_final_supervised.csv',
        index=False, encoding='utf-8'
    )
    print('Saved to: /content/projects_with_region_final_supervised.csv')

if __name__ == '__main__':
    main()



Fitting 2 folds for each of 15 candidates, totalling 30 fits
{'clf__C': np.float64(1.6212112147976887), 'tfidf__max_df': np.float64(0.8099025726528951), 'tfidf__min_df': 9, 'tfidf__ngram_range': (1, 1)}
                                           precision    recall  f1-score   support

             Cultural & Political History     0.9384    0.9670    0.9525       394
                     Food Supply & Safety     0.9136    0.8132    0.8605        91
          International Research Programs     0.9330    0.9207    0.9268       454
            Marine Ecology & Conservation     0.9198    0.9155    0.9176       213
                Medical Therapy & Disease     0.9531    0.9778    0.9653       540
             Neuroscience & Brain Imaging     0.9720    0.9085    0.9392       153
            Optical & Theoretical Physics     0.9448    0.9520    0.9484       521
Renewable Energy & Circular Manufacturing     0.9385    0.9286    0.9335       378
            Security & Computing Networks     0.8

In [13]:
pip install imbalanced-learn




In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import RandomOverSampler
from scipy.stats import uniform, randint

def main():
    df = pd.read_csv('/content/projects_with_region_preprocessed.csv', converters={'tokens': eval}, encoding='utf-8')
    nmf = pd.read_csv('/content/projects_with_region_final_nmf_classification.csv', encoding='utf-8')
    df['text'] = df['tokens'].apply(lambda lst: ' '.join(lst))
    df['label'] = nmf['classification']
    X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42, stratify=df['label'])
    ros = RandomOverSampler(sampling_strategy='auto', random_state=42)
    pipeline = ImbPipeline([
        ('tfidf', TfidfVectorizer()),
        ('ros', ros),
        ('clf', LogisticRegression(solver='liblinear', class_weight='balanced', max_iter=500))
    ])
    param_dist = {
        'tfidf__ngram_range': [(1, 1), (1, 2)],
        'tfidf__min_df': randint(1, 10),
        'tfidf__max_df': uniform(0.6, 0.4),
        'clf__C': uniform(0.1, 5.0)
    }
    rand_search = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=15, cv=2, n_jobs=1, scoring='f1_macro', random_state=42, verbose=1)
    rand_search.fit(X_train, y_train)
    print(rand_search.best_params_)
    y_pred = rand_search.predict(X_test)
    print(classification_report(y_test, y_pred, digits=4))
    df['final_class'] = rand_search.predict(df['text'])
    df[['text', 'final_class']].to_csv('/content/projects_with_region_final_supervised_balanced.csv', index=False, encoding='utf-8')
    print('/content/projects_with_region_final_supervised_balanced.csv')

if __name__ == '__main__':
    main()

Fitting 2 folds for each of 15 candidates, totalling 30 fits
{'clf__C': np.float64(1.6212112147976887), 'tfidf__max_df': np.float64(0.8099025726528951), 'tfidf__min_df': 9, 'tfidf__ngram_range': (1, 1)}
                                           precision    recall  f1-score   support

             Cultural & Political History     0.9404    0.9619    0.9511       394
                     Food Supply & Safety     0.8571    0.8571    0.8571        91
          International Research Programs     0.9345    0.9119    0.9231       454
            Marine Ecology & Conservation     0.8935    0.9061    0.8998       213
                Medical Therapy & Disease     0.9559    0.9630    0.9594       540
             Neuroscience & Brain Imaging     0.9286    0.9346    0.9316       153
            Optical & Theoretical Physics     0.9511    0.9328    0.9419       521
Renewable Energy & Circular Manufacturing     0.9307    0.9233    0.9270       378
            Security & Computing Networks     0.8

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

def main():
    df_pre = pd.read_csv('/content/projects_with_region_preprocessed.csv', converters={'tokens': eval}, encoding='utf-8')
    df_lab = pd.read_csv('/content/projects_with_region_final_supervised_balanced.csv', encoding='utf-8')
    df_pre['text'] = df_pre['tokens'].apply(lambda lst: ' '.join(lst))
    df_pre['label'] = df_lab['final_class']
    X_train_text, X_test_text, y_train, y_test = train_test_split(
        df_pre['text'], df_pre['label'],
        test_size=0.2, random_state=42, stratify=df_pre['label']
    )
    vectorizer = TfidfVectorizer(max_df=0.81, min_df=9, ngram_range=(1,1))
    X_train = vectorizer.fit_transform(X_train_text)
    X_test = vectorizer.transform(X_test_text)
    clf = LogisticRegression(C=1.6212112147976887, solver='liblinear', class_weight='balanced', max_iter=500, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred, digits=4))
    X_all = vectorizer.transform(df_pre['text'])
    df_pre['fast_final_class'] = clf.predict(X_all)
    df_pre[['text', 'fast_final_class']].to_csv('/content/projects_with_region_fast_classification.csv', index=False, encoding='utf-8')
    print('/content/projects_with_region_fast_classification.csv')

if __name__ == '__main__':
    main()


                                           precision    recall  f1-score   support

             Cultural & Political History     0.9438    0.9723    0.9578       397
                     Food Supply & Safety     0.9213    0.8817    0.9011        93
          International Research Programs     0.9291    0.9062    0.9175       448
            Marine Ecology & Conservation     0.8964    0.9299    0.9128       214
                Medical Therapy & Disease     0.9478    0.9759    0.9617       540
             Neuroscience & Brain Imaging     0.9650    0.8961    0.9293       154
            Optical & Theoretical Physics     0.9398    0.9380    0.9389       516
Renewable Energy & Circular Manufacturing     0.9516    0.9390    0.9453       377
            Security & Computing Networks     0.8843    0.9304    0.9068       230
          Water Quality & Urban Pollution     0.9833    0.7763    0.8676        76

                                 accuracy                         0.9356      3045
  

In [18]:
import pandas as pd

orig = pd.read_csv(
    '/content/projects_with_region.csv',
    encoding='utf-8',
    engine='python',
    on_bad_lines='skip'
)

clas = pd.read_csv(
    '/content/projects_with_region_fast_classification.csv',
    encoding='utf-8'
)

assert len(orig) == len(clas), f"Row count mismatch: {len(orig)} vs {len(clas)}"

orig['final_class'] = clas['fast_final_class']

out = '/content/projects_with_region_with_classification.csv'
orig.to_csv(out, index=False, encoding='utf-8')
print(f"Saved to: {out}")


Saved to: /content/projects_with_region_with_classification.csv


In [19]:
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

def count_terms(docs, term_list):
    return np.array([
        sum(doc.lower().split().count(t) for t in term_list)
        for doc in docs
    ]).reshape(-1,1)

def main():
    df_pre = pd.read_csv('/content/projects_with_region_preprocessed.csv', converters={'tokens': eval}, encoding='utf-8')
    df_cls = pd.read_csv('/content/projects_with_region_fast_classification.csv', encoding='utf-8')
    df_pre['text'] = df_pre['tokens'].apply(lambda lst: ' '.join(lst))
    df_pre['label'] = df_cls['fast_final_class']
    X = df_pre['text']
    y = df_pre['label']
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    water_terms = ['water', 'pollution', 'wastewater', 'membrane', 'river', 'drinking']
    marine_terms = ['marine', 'biodiversity', 'ocean', 'ecosystem', 'species', 'soil']
    tfidf = TfidfVectorizer(max_df=0.81, min_df=9, ngram_range=(1,1))
    Xtr_tfidf = tfidf.fit_transform(X_tr)
    Xte_tfidf = tfidf.transform(X_te)
    Xtr_water = count_terms(X_tr, water_terms)
    Xtr_marine = count_terms(X_tr, marine_terms)
    Xte_water = count_terms(X_te, water_terms)
    Xte_marine = count_terms(X_te, marine_terms)
    Xtr = sparse.hstack([Xtr_tfidf, sparse.csr_matrix(Xtr_water), sparse.csr_matrix(Xtr_marine)])
    Xte = sparse.hstack([Xte_tfidf, sparse.csr_matrix(Xte_water), sparse.csr_matrix(Xte_marine)])
    clf = LogisticRegression(C=1.62, solver='liblinear', class_weight='balanced', max_iter=500, random_state=42)
    clf.fit(Xtr, y_tr)
    proba = clf.predict_proba(Xte)
    classes = clf.classes_.tolist()
    idx_wq = classes.index('Water Quality & Urban Pollution')
    idx_me = classes.index('Marine Ecology & Conservation')
    y_pred = clf.predict(Xte)
    mask = (proba[:, idx_me] > proba.max(axis=1)) & (proba[:, idx_wq] > 0.4)
    y_pred[mask] = 'Water Quality & Urban Pollution'
    print("=== Refined Classification Report ===")
    print(classification_report(y_te, y_pred, digits=4))
    cm = confusion_matrix(y_te, y_pred, labels=classes)
    print("=== Refined Confusion Matrix ===")
    print(pd.DataFrame(cm, index=classes, columns=classes))
    Xall_tfidf = tfidf.transform(X)
    Xall_water = count_terms(X, water_terms)
    Xall_marine = count_terms(X, marine_terms)
    Xall = sparse.hstack([Xall_tfidf, sparse.csr_matrix(Xall_water), sparse.csr_matrix(Xall_marine)])
    proba_all = clf.predict_proba(Xall)
    pred_all = clf.classes_[np.argmax(proba_all, axis=1)]
    mask_all = (proba_all[:, idx_me] > np.max(proba_all, axis=1)) & (proba_all[:, idx_wq] > 0.4)
    pred_all[mask_all] = 'Water Quality & Urban Pollution'
    df_pre['refined_pred'] = pred_all
    out = '/content/projects_with_region_refined_classification.csv'
    df_pre[['text', 'label', 'refined_pred']].to_csv(out, index=False, encoding='utf-8')
    print(f"Saved to: {out}")

if __name__ == "__main__":
    main()


=== Refined Classification Report ===
                                           precision    recall  f1-score   support

             Cultural & Political History     0.9518    0.9875    0.9693       400
                     Food Supply & Safety     0.9333    0.9032    0.9180        93
          International Research Programs     0.9582    0.9323    0.9451       443
            Marine Ecology & Conservation     0.8850    0.9302    0.9070       215
                Medical Therapy & Disease     0.9566    0.9724    0.9644       544
             Neuroscience & Brain Imaging     0.9857    0.9139    0.9485       151
            Optical & Theoretical Physics     0.9704    0.9516    0.9609       516
Renewable Energy & Circular Manufacturing     0.9571    0.9469    0.9520       377
            Security & Computing Networks     0.9507    0.9099    0.9298       233
          Water Quality & Urban Pollution     0.8182    0.9863    0.8944        73

                                 accuracy      

In [20]:
#!/usr/bin/env python3
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

df_pre = pd.read_csv('/content/projects_with_region_preprocessed.csv', converters={'tokens': eval}, encoding='utf-8')
df_pre['text'] = df_pre['tokens'].apply(lambda lst: ' '.join(lst))
df_cls = pd.read_csv('/content/projects_with_region_fast_classification.csv', encoding='utf-8')
df_pre['label'] = df_cls['fast_final_class']

X = df_pre['text']
y = df_pre['label']
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

tfidf = TfidfVectorizer(max_df=0.81, min_df=9, ngram_range=(1,1))
Xtr = tfidf.fit_transform(X_tr)
Xte = tfidf.transform(X_te)

clf = LogisticRegression(C=1.62, solver='liblinear', class_weight='balanced', max_iter=500, random_state=42)
clf.fit(Xtr, y_tr)

proba = clf.predict_proba(Xte)
classes = clf.classes_.tolist()
i_wq = classes.index('Water Quality & Urban Pollution')
i_me = classes.index('Marine Ecology & Conservation')

best_t = 0
best_f1 = 0
for t in np.linspace(0.1, 0.9, 81):
    preds = clf.predict(Xte)
    mask = (proba[:, i_me] >= proba.max(axis=1)) & (proba[:, i_wq] > t)
    preds[mask] = 'Water Quality & Urban Pollution'
    f1 = f1_score(y_te, preds, labels=['Water Quality & Urban Pollution'], average='macro')
    if f1 > best_f1:
        best_f1 = f1
        best_t = t

print(f"Best threshold: {best_t:.2f}, tuned WQ F1: {best_f1:.4f}")

X_all = tfidf.transform(df_pre['text'])
proba_all = clf.predict_proba(X_all)
pred_all = clf.classes_[np.argmax(proba_all, axis=1)]
mask_all = (proba_all[:, i_me] >= np.max(proba_all, axis=1)) & (proba_all[:, i_wq] > best_t)
pred_all[mask_all] = 'Water Quality & Urban Pollution'

df_pre['refined_pred'] = pred_all
out = '/content/projects_with_region_refined_threshold_classification.csv'
df_pre.to_csv(out, index=False, encoding='utf-8')
print(f"Refined results saved to: {out}")


Best threshold: 0.36, tuned WQ F1: 0.9362
Refined results saved to: /content/projects_with_region_refined_threshold_classification.csv


In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score

df_pre = pd.read_csv('/content/projects_with_region_preprocessed.csv', converters={'tokens': eval}, encoding='utf-8')
df_pre['text'] = df_pre['tokens'].apply(lambda lst: ' '.join(lst))
df_cls = pd.read_csv('/content/projects_with_region_fast_classification.csv', encoding='utf-8')
df_pre['label'] = df_cls['fast_final_class']

X = df_pre['text']
y = df_pre['label']
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

tfidf = TfidfVectorizer(max_df=0.81, min_df=9, ngram_range=(1,2), strip_accents='unicode')
Xtr = tfidf.fit_transform(X_tr)
Xte = tfidf.transform(X_te)

clf = LogisticRegression(C=1.62, solver='liblinear', class_weight='balanced', max_iter=500, random_state=42)
clf.fit(Xtr, y_tr)

proba = clf.predict_proba(Xte)
classes = clf.classes_.tolist()
i_wq = classes.index('Water Quality & Urban Pollution')
i_me = classes.index('Marine Ecology & Conservation')

best_t, best_f1 = 0, 0
for t in np.linspace(0.1, 0.9, 81):
    preds = clf.predict(Xte)
    mask = (proba[:, i_me] >= proba.max(axis=1)) & (proba[:, i_wq] > t)
    preds[mask] = 'Water Quality & Urban Pollution'
    f1 = f1_score(y_te, preds, labels=['Water Quality & Urban Pollution'], average='macro')
    if f1 > best_f1:
        best_f1, best_t = f1, t

print(f"[Bi-gram] Best threshold = {best_t:.2f}, WQ F1 = {best_f1:.4f}")

preds = clf.predict(Xte)
mask = (proba[:, i_me] >= proba.max(axis=1)) & (proba[:, i_wq] > best_t)
preds[mask] = 'Water Quality & Urban Pollution'
print(classification_report(y_te, preds, digits=4))

X_all = tfidf.transform(df_pre['text'])
proba_all = clf.predict_proba(X_all)
pred_all = clf.classes_[np.argmax(proba_all, axis=1)]
mask_all = (proba_all[:, i_me] >= np.max(proba_all, axis=1)) & (proba_all[:, i_wq] > best_t)
pred_all[mask_all] = 'Water Quality & Urban Pollution'

df_pre['refined_ngram_pred'] = pred_all
out = '/content/projects_with_region_refined_ngram_threshold.csv'
df_pre.to_csv(out, index=False, encoding='utf-8')
print(f"Saved to: {out}")


[Bi-gram] Best threshold = 0.35, WQ F1 = 0.9437
                                           precision    recall  f1-score   support

             Cultural & Political History     0.9542    0.9900    0.9718       400
                     Food Supply & Safety     0.9341    0.9140    0.9239        93
          International Research Programs     0.9513    0.9255    0.9382       443
            Marine Ecology & Conservation     0.9058    0.9395    0.9224       215
                Medical Therapy & Disease     0.9534    0.9779    0.9655       544
             Neuroscience & Brain Imaging     0.9714    0.9007    0.9347       151
            Optical & Theoretical Physics     0.9687    0.9593    0.9640       516
Renewable Energy & Circular Manufacturing     0.9450    0.9576    0.9513       377
            Security & Computing Networks     0.9556    0.9227    0.9389       233
          Water Quality & Urban Pollution     0.9710    0.9178    0.9437        73

                                 accu

In [22]:
#!/usr/bin/env python3
import pandas as pd

orig = pd.read_csv('/content/projects_with_region.csv', encoding='utf-8', engine='python', on_bad_lines='skip')
ref = pd.read_csv('/content/projects_with_region_refined_ngram_threshold.csv', encoding='utf-8')

assert len(orig) == len(ref), f"Row count mismatch: {len(orig)} vs {len(ref)}"

topic_cols = [c for c in ref.columns if c.startswith('refined')]
if not topic_cols:
    raise KeyError("No column starting with 'refined' found in classification results")
topic_col = topic_cols[0]

orig['topic'] = ref[topic_col]

out_path = '/content/projects_with_region_with_topic.csv'
orig.to_csv(out_path, index=False, encoding='utf-8')
print(f"Saved to: {out_path} (added topic from {topic_col})")


Saved to: /content/projects_with_region_with_topic.csv (added topic from refined_ngram_pred)


In [23]:
import pandas as pd

df = pd.read_csv('/content/projects_with_region_with_topic.csv', encoding='utf-8')
df = df.loc[:, ~df.columns.str.startswith('Unnamed')]
df.to_csv('/content/projects_with_region_with_topic.csv', index=False, encoding='utf-8')
print('/content/projects_with_region_with_topic.csv')


/content/projects_with_region_with_topic.csv
