# Train and Save Logistic Regression Model for Research Theme Prediction

This notebook trains a logistic regression model to predict research themes based on project titles and saves the trained artifacts for use in the TimeArcs recommendation system.

In [1]:
# Standard-library imports come first: they are always available and the
# dependency check below relies on them.
import importlib
import importlib.util
import os
import pickle
import subprocess
import sys

# Install required packages if needed.
# Keys are pip distribution names, values are the module names that get
# imported. They differ for scikit-learn (imported as `sklearn`), so the module
# name cannot be derived from the distribution name.
packages = {'pandas': 'pandas', 'numpy': 'numpy', 'scikit-learn': 'sklearn'}
for package, module_name in packages.items():
    # find_spec() tells us whether the module is importable without importing it.
    if importlib.util.find_spec(module_name) is None:
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])
        # Let the running kernel's import system see the newly installed files.
        importlib.invalidate_caches()

# Third-party imports run after the check so that a missing package is
# installed before its import is attempted.
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

## Section 1: Load and Preprocess Data

In [2]:
# Load and preprocess data
df = pd.read_csv('grants_final.tsv', sep='\t')
df = df[['Title', 'Theme']].copy()

# Clean up the data BEFORE de-duplicating: rows that differ only by the
# surrounding quote characters become identical once stripped, and must be
# collapsed so the same title cannot end up in both the train and test split.
df['Title'] = df['Title'].str.strip('"')
df['Theme'] = df['Theme'].str.strip("'")

df = df.dropna()
df = df.drop_duplicates()

print(f"Total samples before filtering: {len(df)}")
print(f"Number of unique themes before filtering: {df['Theme'].nunique()}")

# Set threshold for minimum number of samples per theme
MIN_SAMPLES_PER_THEME = 4

theme_counts = df['Theme'].value_counts()
print(f"\nTheme distribution before filtering:\n{theme_counts}")

# Keep only themes with at least MIN_SAMPLES_PER_THEME samples.
# .copy() gives an independent frame, so adding columns to it later does not
# raise pandas' SettingWithCopyWarning.
valid_themes = theme_counts[theme_counts >= MIN_SAMPLES_PER_THEME].index
df = df[df['Theme'].isin(valid_themes)].copy()

print(f"\n{'='*60}")
print(f"After filtering themes with < {MIN_SAMPLES_PER_THEME} samples:")
print(f"Total samples: {len(df)}")
print(f"Number of unique themes: {df['Theme'].nunique()}")
print(f"\nTheme distribution after filtering:\n{df['Theme'].value_counts()}")

Total samples before filtering: 52
Number of unique themes before filtering: 17

Theme distribution before filtering:
Theme
Education & Workforce Development         13
Healthcare / Biomedical                    7
Software / Systems                         5
Cybersecurity                              5
AI / Machine Learning                      4
Algorithms / Theory / Optimization         3
Data / Metadata / Scientific Data          2
Environment & Agriculture                  2
Energy / Environment                       2
Quantum                                    2
Blockchain & Privacy                       1
Robotics & CPS                             1
Disaster / Resilience / Evacuation         1
Visualization / Analytics                  1
Manufacturing / Advanced Manufacturing     1
Sensors / IoT / Edge                       1
Networking & Communications                1
Name: count, dtype: int64

After filtering themes with < 4 samples:
Total samples: 34
Number of unique themes: 5

## Section 2: Train TF-IDF Vectorizer and Logistic Regression Model

In [3]:
# Encode labels
# assign() builds a new frame instead of writing a column into the filtered
# frame from the previous cell, which avoids pandas' SettingWithCopyWarning.
label_encoder = LabelEncoder()
df = df.assign(label=label_encoder.fit_transform(df['Theme']))

# Split data (stratified so each theme keeps its proportion in both splits)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

print(f"\nTraining samples: {len(train_df)}")
print(f"Testing samples: {len(test_df)}")

# Create and fit TF-IDF vectorizer.
# The vocabulary is fitted on the training titles only; the test titles are
# merely transformed with it.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['Title'])
y_train = train_df['label']
X_test_tfidf = tfidf_vectorizer.transform(test_df['Title'])
y_test = test_df['label']

# Train logistic regression model
logistic_model = LogisticRegression(max_iter=1000, random_state=42)
logistic_model.fit(X_train_tfidf, y_train)

# Evaluate model
y_pred = logistic_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
# Pass the full list of encoded labels explicitly. Without `labels`, the report
# infers the classes from y_test/y_pred and raises a length-mismatch error
# against `target_names` whenever a theme is absent from both.
all_labels = list(range(len(label_encoder.classes_)))
print(classification_report(
    y_test,
    y_pred,
    labels=all_labels,
    target_names=label_encoder.classes_,
    zero_division=0
))


Training samples: 27
Testing samples: 7

Model Accuracy: 0.4286

Classification Report:
                                   precision    recall  f1-score   support

            AI / Machine Learning       0.00      0.00      0.00         1
                    Cybersecurity       0.00      0.00      0.00         1
Education & Workforce Development       0.43      1.00      0.60         3
          Healthcare / Biomedical       0.00      0.00      0.00         1
               Software / Systems       0.00      0.00      0.00         1

                         accuracy                           0.43         7
                        macro avg       0.09      0.20      0.12         7
                     weighted avg       0.18      0.43      0.26         7



## Section 3: Save Model Artifacts

In [4]:
# Make sure the output directory exists (no error if it is already there)
artifact_dir = 'model_artifacts'
os.makedirs(artifact_dir, exist_ok=True)

# Destination file for each of the three fitted objects
model_path = os.path.join(artifact_dir, 'logistic_model.pkl')
vectorizer_path = os.path.join(artifact_dir, 'tfidf_vectorizer.pkl')
encoder_path = os.path.join(artifact_dir, 'label_encoder.pkl')

# (object to pickle, destination path, confirmation message), in save order
artifacts = [
    (logistic_model, model_path, f"✓ Saved logistic regression model to {model_path}"),
    (tfidf_vectorizer, vectorizer_path, f"✓ Saved TF-IDF vectorizer to {vectorizer_path}"),
    (label_encoder, encoder_path, f"✓ Saved label encoder to {encoder_path}"),
]

# Pickle each object to its file and confirm once the dump has succeeded
for fitted_object, destination, confirmation in artifacts:
    with open(destination, 'wb') as handle:
        pickle.dump(fitted_object, handle)
        print(confirmation)

print(f"\n✓ All model artifacts saved to '{artifact_dir}/' directory")
print(f"Ready for use in the recommendation system!")

✓ Saved logistic regression model to model_artifacts\logistic_model.pkl
✓ Saved TF-IDF vectorizer to model_artifacts\tfidf_vectorizer.pkl
✓ Saved label encoder to model_artifacts\label_encoder.pkl

✓ All model artifacts saved to 'model_artifacts/' directory
Ready for use in the recommendation system!
