In [1]:
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
import numpy as np

# Fetch the NLTK data packages used below, in a fixed order
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Function to preprocess a single journal entry
def preprocess_entry(entry):
    """Tokenize a journal entry and drop stop words and punctuation.

    Args:
        entry: Raw text of a single journal entry.

    Returns:
        A list of lowercased tokens, in their original order, with English
        stop words and punctuation-only tokens removed.
    """
    stop_words = set(stopwords.words('english'))
    # Punctuation has to be checked character by character against a set.
    # `token in string.punctuation` is a substring test on a str, so
    # multi-character punctuation tokens such as "...", "--" or the
    # "``" / "''" quote tokens would otherwise slip through.
    punctuation_chars = set(string.punctuation)

    tokens = []
    for raw_token in word_tokenize(entry):
        token = raw_token.lower()
        if token in stop_words:
            continue
        if all(char in punctuation_chars for char in token):
            continue
        tokens.append(token)

    return tokens

# Function to get synonyms of a word using WordNet
def get_synonyms(word):
    """Collect the WordNet lemma names associated with a word.

    Args:
        word: The word to look up in WordNet.

    Returns:
        A set holding the name of every lemma of every synset WordNet
        returns for ``word``; empty when there are no synsets.
    """
    return {
        lemma.name()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    }

# Function to assign core value to preprocessed journal entry
def assign_core_value(entry, core_values_keywords, *, synonym_lookup=None):
    """Pick the core value whose keywords best match a preprocessed entry.

    Each token scores one point for a core value when the token itself is
    one of that value's keywords; otherwise it scores one point for every
    one of its synonyms that is a keyword of that core value.

    Args:
        entry: Iterable of tokens, e.g. the output of ``preprocess_entry``.
        core_values_keywords: Mapping of core value -> collection of keywords.
        synonym_lookup: Optional callable mapping a word to an iterable of
            its synonyms (duplicates are ignored). Defaults to
            ``get_synonyms``.

    Returns:
        The core value with the highest score. Ties, including the case
        where no keyword matches at all, go to the core value that comes
        first in ``core_values_keywords``.

    Raises:
        ValueError: If ``core_values_keywords`` is empty.
    """
    if synonym_lookup is None:
        synonym_lookup = get_synonyms

    # Initialize dictionary to store keyword frequency for each core value
    keyword_freq = {core_value: 0 for core_value in core_values_keywords}

    # A word's synonyms do not depend on the core value being scored, so they
    # are looked up lazily and at most once per distinct word instead of once
    # per (word, core value) pair.
    synonym_cache = {}

    for word in entry:
        for core_value, keywords in core_values_keywords.items():
            if word in keywords:
                keyword_freq[core_value] += 1
                continue

            if word not in synonym_cache:
                synonym_cache[word] = frozenset(synonym_lookup(word))
            keyword_freq[core_value] += sum(
                1 for syn in synonym_cache[word] if syn in keywords
            )

    # max() returns the first key among equal scores and raises ValueError
    # when there are no core values at all.
    return max(keyword_freq, key=keyword_freq.get)

# Read journal entries from the text file (one entry per line).
# The explicit encoding keeps decoding independent of the platform default.
with open('journal_entries.txt', 'r', encoding='utf-8') as file:
    # Blank lines are skipped: an empty entry has no tokens, so every core
    # value would score zero and the entry would simply receive the first
    # core value as its label, polluting the training data.
    stripped_lines = (line.strip() for line in file)
    journal_entries = [entry for entry in stripped_lines if entry]

# Read core values and their keywords from the text file.
# SECURITY: eval() executes whatever Python expression the file contains, so
# only use this with a trusted file. If the file holds a plain dict literal,
# ast.literal_eval is the safe replacement.
with open('core_values_keywords.txt', 'r', encoding='utf-8') as file:
    core_values_keywords = eval(file.read())

# Preprocess all journal entries
preprocessed_entries = [preprocess_entry(entry) for entry in journal_entries]

# Join each entry's tokens back into one space-separated string for the vectorizer
flattened_entries = [' '.join(entry) for entry in preprocessed_entries]

# Label every entry with its best-matching core value. The labels are the keys
# of core_values_keywords themselves, not numeric codes.
labels = np.array([assign_core_value(entry, core_values_keywords) for entry in preprocessed_entries])

# Vectorize the text data using CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(flattened_entries)

# Train the Support Vector Machine (SVM) classifier
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X, labels)

# Save the trained model and vectorizer for later use
import joblib
joblib.dump(svm_classifier, 'svm_model.joblib')
joblib.dump(vectorizer, 'vectorizer.joblib')

print("Model training completed.")


[nltk_data] Downloading package punkt to /Users/kartik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kartik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/kartik/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Model training completed.
