In [7]:
from constants import posts_token, flagged_posts, participants_token, participants_table1
from functions import pull_redcap_report, get_age_at_post

# Fetch the participant report, then fetch the flagged-posts report and hand
# both straight to get_age_at_post. The result is bound to `posts`, which the
# later cells read (they use an `age_at_post` column, presumably added here).
participants = pull_redcap_report(participants_token, participants_table1)
posts = get_age_at_post(
    pull_redcap_report(posts_token, flagged_posts),
    participants,
)

In [8]:
# Text Processing
import nltk
from nltk.stem import WordNetLemmatizer
# Make sure the WordNet corpus is actually usable before lemmatizing.
# `from nltk.corpus import wordnet` only binds a lazy corpus loader; it does
# not read the corpus files, so wrapping the import in try/except cannot
# detect a missing corpus. Force the load instead, and download only when
# NLTK reports the resource as missing (LookupError). Any other error is
# allowed to propagate rather than being swallowed by a bare `except:`.
from nltk.corpus import wordnet
try:
    wordnet.ensure_loaded()
except LookupError:
    nltk.download("wordnet")
    # Fail here, loudly, if the download did not make the corpus available.
    wordnet.ensure_loaded()
from functions import tokenize_and_remove_stopwords, clean_text, get_cluster_features
lemmatizer = WordNetLemmatizer()

# Derive the text columns step by step: cleaned text -> tokenized text with
# stopwords removed -> lemmatized text.
posts['clean_text'] = posts['text'].apply(clean_text)
posts['tokenized_text'] = posts['clean_text'].apply(tokenize_and_remove_stopwords)
# tokenized_text is treated as a whitespace-separated string: split it,
# lemmatize each word, and join the words back with single spaces.
posts['lemma'] = posts['tokenized_text'].apply(
    lambda text: " ".join(lemmatizer.lemmatize(word) for word in text.split())
)

# Cluster features are computed from the cleaned (not lemmatized) text.
posts["text_cluster"] = posts['clean_text'].apply(get_cluster_features)

In [9]:
import joblib
from scipy.sparse import hstack

# Load every persisted preprocessing artifact up front (presumably fitted
# at training time -- confirm against the training notebook).
lemma_vectorizer = joblib.load("../classifiers-2-2-25/lemma_vectorizer.joblib")
cluster_vectorizer = joblib.load("../classifiers-2-2-25/cluster_vectorizer.joblib")
scaler = joblib.load("../classifiers-2-2-25/scaler.joblib")
encoder = joblib.load("../classifiers-2-2-25/encoder.joblib")

# Text features: one block per vectorizer.
X_cluster = cluster_vectorizer.transform(posts['text_cluster'])
X_lemma = lemma_vectorizer.transform(posts['lemma'])

# Numeric feature: age at post, passed through the loaded scaler.
X_age = scaler.transform(posts[['age_at_post']])

# Categorical features: gender and pd_yesno, passed through the loaded encoder.
X_cat = encoder.transform(posts[['gender', 'pd_yesno']])

# Concatenate the blocks column-wise into a single feature matrix.
# NOTE(review): the block order presumably has to match the order used when
# the classifier was trained -- confirm before changing it.
X = hstack([
    X_cluster,
    X_lemma,
    X_age,
    X_cat,
])

In [10]:
# Load the persisted classifier from disk.
nb = joblib.load("../classifiers-2-2-25/NaiveBayes_best_model.joblib")
# Predict a label for each row of the stacked feature matrix X.
# NOTE(review): the artifact is named NaiveBayes_best_model, so this is
# presumably a single Naive Bayes classifier rather than an ensemble -- confirm.
y_pred = nb.predict(X)

# Store the predictions on the posts dataframe as `nb_classification`.
posts['nb_classification'] = y_pred


In [11]:
# Keep only the record identifier and the predicted label; this two-column
# frame is what gets handed to the upload step.
import_df = posts.loc[:, ['record_id', 'nb_classification']]

# Show the frame as the cell output for a quick visual check.
import_df

Unnamed: 0,record_id,nb_classification
0,62,0
1,79,0
2,80,0
3,82,0
4,84,0
...,...,...
22876,201967,0
22877,201969,0
22878,201971,0
22879,201998,0


In [12]:
from functions import redcap_upload

# Send the record_id / nb_classification frame to REDCap and keep the returned
# value in `response`.
# NOTE: per the original author's note, redcap_upload ALWAYS writes to the
# posts project -- it cannot be pointed at another project from here.
response = redcap_upload(import_df)

Import complete.
