In [None]:
!pip install -U langchain-dartmouth > /dev/null

from langchain_dartmouth.llms import ChatDartmouth, DartmouthLLM
from langchain_core.messages import HumanMessage, SystemMessage

from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer

from matplotlib import pyplot as plt
from sklearn.decomposition import PCA

import os
import numpy as np

import pandas as pd

In [None]:
# Make the Dartmouth API key available through the DARTMOUTH_API_KEY
# environment variable without storing it in the notebook. A key that is
# already present in the environment is kept as-is; otherwise it is requested
# through a prompt that does not echo the typed value.
import getpass

if not os.environ.get("DARTMOUTH_API_KEY"):
    os.environ["DARTMOUTH_API_KEY"] = getpass.getpass("Dartmouth API key: ")

In [None]:
def format_prompt(prompt, system_message=None):
    """
    Build the message list that LangChain needs to produce ChatML.

    Args:
       prompt: the text to be embedded as the human prompt.
       system_message: optional text for the system message. When omitted,
           the notebook-level `system_prompt` variable is used, so that
           variable must be defined before the first call.

    Returns:
       A two-element list: a SystemMessage followed by a HumanMessage.
    """
    if system_message is None:
        # Looked up at call time, so changes to `system_prompt` take effect
        # without redefining this function.
        system_message = system_prompt
    return [
        SystemMessage(content=system_message),
        HumanMessage(content=prompt),
    ]

In [None]:
# Settings for text generation.
max_new_tokens = 1024

# With a value below 1, generation keeps only the smallest set of most
# probable tokens whose probabilities add up to top_p or higher.
top_p = 0.75

# Strictly positive float used to modulate the logits distribution. Values
# below 1 decrease randomness and values above 1 increase it; 0 would be
# equivalent to shifting all probability mass to the most likely token.
temperature = 1.0

repetition_penalty = None

# Text used for the system message that accompanies each human prompt.
system_prompt = "Always assist with care, respect, and truth. Respond with utmost utility yet securely. Avoid harmful, unethical, prejudiced, or negative content. Ensure replies promote fairness and positivity."

# Additional keyword arguments for the model; none are set here.
kwargs = {}

In [None]:
# Chat model client configured with the generation settings defined in the
# notebook (temperature, top_p, token limit and extra model kwargs).
llm_chat = ChatDartmouth(
    model_name="llama-3-1-8b-instruct",
    temperature=temperature,
    top_p=top_p,
    max_tokens=max_new_tokens,
    model_kwargs=kwargs,
)

In [None]:
# Prompt sent to the model on every iteration.
prompt = "Write a short story about a student's first year attending Dartmouth College. Use normal paragraph structure. Include experiences and events in each of the three major terms: fall, winter, and spring."

# Generated story texts, in the order they were produced.
outputs = []

# Ask for ten stories and keep only the text content of each response.
for _ in range(10):
    response = llm_chat.invoke(format_prompt(prompt))
    outputs.append(response.content)

In [None]:
# Turn the generated stories into a document-term matrix of word counts.
vectorizer = CountVectorizer(
    input='content',
    strip_accents='unicode',
    stop_words='english',
)
dtm = vectorizer.fit_transform(outputs)

# Reverse lookup: column index -> vocabulary term.
idx2voc = dict((index, term) for term, index in vectorizer.vocabulary_.items())

In [None]:
# Total count of every vocabulary term, summed over all documents
# (a single row with one column per term).
vocab_sums = dtm.sum(axis=0)

# Pair each term with its total and order from most to least frequent.
sorted_vocab = sorted(
    ((term, vocab_sums[0, col]) for term, col in vectorizer.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

# Display the top twenty-five words. Slicing instead of indexing a fixed range
# avoids an IndexError when the vocabulary holds fewer than 25 terms.
for term, count in sorted_vocab[:25]:
    print(term, "->", count)

In [None]:
# Small helper function for examining the presence of specific words.
def term_debug(term):
    """
    Report how often `term` occurs in the document-term matrix `dtm`.

    Args:
       term: the word to look up in the vectorizer's vocabulary.

    Returns:
       A one-row DataFrame indexed by `term` with the columns 'Total Count'
       (sum over the rows of `dtm`) and 'Mean Count' (mean over the rows of
       `dtm`). When the term is not in the vocabulary an error message is
       printed and None is returned.
    """
    vocabulary = vectorizer.vocabulary_
    if term not in vocabulary:
        print("Error: {0} not on vocabulary".format(term))
        return

    idx = vocabulary[term]
    column_totals = np.sum(dtm, axis=0)
    column_means = np.mean(dtm, axis=0)
    tc = int(column_totals[:, idx].item())
    tm = float(column_means[:, idx].item())
    return pd.DataFrame({'Total Count': tc, 'Mean Count': tm}, index=[term])

In [None]:
# Example lookup: total and mean count of the word 'academic'.
term_debug('academic')

In [None]:
# For every document, list the terms whose counts differ most from the
# average count of that term in all the other documents.
n_docs = dtm.shape[0]

# Per-term totals over the whole corpus, computed once so each iteration can
# derive the "all other documents" totals by subtraction.
term_totals = np.asarray(dtm.sum(axis=0)).ravel()

for doc in range(n_docs):
    doc_counts = dtm[doc].toarray()[0]

    if n_docs > 1:
        # Mean count per term over every document except the current one.
        rest_mean = (term_totals - doc_counts) / (n_docs - 1)
    else:
        # Nothing to compare against when there is only a single document.
        rest_mean = np.zeros(doc_counts.shape, dtype=float)

    diff = np.abs(doc_counts - rest_mean)
    # argsort is ascending: take the last ten and reverse so the largest
    # difference is listed first.
    top_diff_indices = np.argsort(diff)[-10:][::-1]

    print("Document {0}: Top 10 different terms:".format(doc))
    for index in top_diff_indices:
        print(idx2voc[index], round(float(diff[index]), 2))
    print("\n")

In [None]:
# Plot the generated stories from the document-term matrix.
# Select the style before the figure is created so it applies to the whole plot.
plt.style.use('ggplot')

# Project the term counts onto two principal components. The sparse matrix is
# converted to a dense array first because PCA in older scikit-learn releases
# does not accept sparse input.
pca = PCA(n_components=2)
plot_data = pca.fit_transform(dtm.toarray())
xs, ys = plot_data[:, 0], plot_data[:, 1]

fig, ax = plt.subplots(figsize=(20, 15))
ax.set_title("PCA of Generated Stories")
ax.set_xlabel("Principal component 1")
ax.set_ylabel("Principal component 2")
ax.scatter(xs, ys, marker='o')

# Label each point with the row index of its story in the matrix.
for i in range(dtm.shape[0]):
    ax.annotate(str(i), xy=(xs[i], ys[i]), xytext=(3, 3),
                textcoords='offset points', ha='left', va='top')
plt.show()

In [None]:
from html import escape
from IPython.display import display, HTML

# Show every generated story followed by a horizontal rule. The stories are
# raw model output, so they are HTML-escaped before being embedded; otherwise
# characters such as '<' or '&' would be interpreted as markup. The pre-wrap
# style keeps the paragraph breaks of the original text visible.
for story in outputs:
    display(HTML('<div style="white-space: pre-wrap">' + escape(story) + '</div>'))
    display(HTML('<hr>'))