In [1]:
import pandas as pd
import string
from sklearn.feature_extraction.text import CountVectorizer

## Only Gemini shown in this notebook, but the same functions were used for all responses

In [2]:
df = pd.read_csv("gemini.csv")
df.rename(columns={'index': 'row'}, inplace=True)

In [3]:
# Data from the 450 million word Corpus of Contemporary American English (COCA), length = 100
most_common_english = ['the', 'to', 'and', 'of', 'a', 'in', 'i', 'that', 'you', 'it', 'is',
                       'for', 'on', 'was', 'he', 'with', 'this', 'as', "n't", 'we', 'be',
                       'have', 'are', 'not', 'but', 'at', 'they', 'do', 'what', 'his', 'from', 
                       'by', 'or', 'she', 'my', 'all', 'an', 'there', 'so', 'her', 'about',
                       'me', 'one', 'had', 'if', 'your', 'can', 'who', 'no', 'out', 'has',
                       'their', 'were', 'like', 'just', 'would', 'up', 'when', 'more', 'will',
                       'know', 'said', 'did', 'been', 'people', 'get', 'him', 'time', 'them',
                       'some', 'how', 'now', 'which', 'could', 'think', 'than', 'our', 'into',
                       'other', 'right', 'here', 'well', 'new', 'then', 'because', 'go', 'see',
                       'back', 'only', 'these', 'over', 'going', 'us', 'also', 'two', 'first',
                       'its', 'even', 'good', 'way']

In [4]:
# Function word list pulled from James O’Shea's research on dialogue act classification, length = 277
english_function = ["a", "about", "above", "across", "after", "afterwards", "again", "against", 
                    "all", "almost", "alone", "along", "already", "also", "although", "always", 
                    "am", "among", "amongst", "amoungst", "an", "and", "another", "any", "anyhow", 
                    "anyone", "anything", "anyway", "anywhere", "are", "around", "as", "at", 
                    "be", "became", "because", "been", "before", "beforehand", "behind", "being", 
                    "below", "beside", "besides", "between", "beyond", "both", "but", "by", "can", 
                    "cannot", "could", "dare", "despite", "did", "do", "does", "done", "down", 
                    "during", "each", "eg", "either", "else", "elsewhere", "enough", "etc", "even", 
                    "ever", "every", "everyone", "everything", "everywhere", "except", "few", 
                    "first", "for", "former", "formerly", "from", "further", "furthermore", "had", 
                    "has", "have", "he", "hence", "her", "here", "hereabouts", "hereafter", 
                    "hereby", "herein", "hereinafter", "heretofore", "hereunder", "hereupon", 
                    "herewith", "hers", "herself", "him", "himself", "his", "how", "however", "i", 
                    "ie", "if", "in", "indeed", "inside", "instead", "into", "is", "it", "its", 
                    "itself", "last", "latter", "latterly", "least", "less", "lot", "lots", "many", 
                    "may", "me", "meanwhile", "might", "mine", "more", "moreover", "most", "mostly", 
                    "much", "must", "my", "myself", "namely", "near", "need", "neither", "never", 
                    "nevertheless", "next", "no", "nobody", "none", "noone", "nor", "not", "nothing", 
                    "now", "nowhere", "of", "off", "often", "oftentimes", "on", "once", "one", 
                    "only", "onto", "or", "other", "others", "otherwise", "ought", "our", "ours", 
                    "ourselves", "out", "outside", "over", "per", "perhaps", "rather", "re", 
                    "same", "second", "several", "shall", "she", "should", "since", "so", "some", 
                    "somehow", "someone", "something", "sometime", "sometimes", "somewhat", "somewhere", 
                    "still", "such", "than", "that", "the", "their", "theirs", "them", "themselves", 
                    "then", "thence", "there", "thereabouts", "thereafter", "thereby", "therefore", 
                    "therein", "thereof", "thereon", "thereupon", "these", "they", "third", "this", 
                    "those", "though", "through", "throughout", "thru", "thus", "to", "together", 
                    "too", "top", "toward", "towards", "under", "until", "up", "upon", "us", 
                    "used", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", 
                    "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", 
                    "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", 
                    "whole", "whom", "whose", "why", "whyever", "will", "with", "within", "without", 
                    "would", "yes", "yet", "you", "your", "yours", "yourself", "yourselves"]

In [5]:
def process_text(text, allowed_words):
    # Remove punctuation and newline characters, convert to lowercase
    translator = str.maketrans("", "", string.punctuation)
    text = text.translate(translator).replace('\n', ' ').lower()
    
    # Keep only words that are in the allowed list
    processed_text = ' '.join(word for word in text.split() if word in allowed_words)
    return processed_text

In [6]:
df["processed_common"] = df["response"].apply(lambda x: process_text(x, most_common_english))
df["processed_function"] = df["response"].apply(lambda x: process_text(x, english_function))

In [7]:
vectorizer = CountVectorizer()

In [8]:
dtm = vectorizer.fit_transform(df['processed_function'])

dtm_df = pd.DataFrame(dtm.toarray(), columns=vectorizer.get_feature_names_out())

function_df = pd.concat([df, dtm_df], axis=1)


exclude_columns = function_df.columns[:6]
function_df = pd.DataFrame({
    'prompt': function_df.prompt,
    'response': function_df.response,
    'SummaryVector': function_df.apply(lambda row: row[~row.index.isin(exclude_columns)].values.tolist(), axis=1)
})

In [9]:
dtm = vectorizer.fit_transform(df['processed_common'])

dtm_df = pd.DataFrame(dtm.toarray(), columns=vectorizer.get_feature_names_out())

common_df = pd.concat([df, dtm_df], axis=1)

exclude_columns = common_df.columns[:6]
common_df = pd.DataFrame({
    'prompt': common_df.prompt,
    'response': common_df.response,
    'SummaryVector': common_df.apply(lambda row: row[~row.index.isin(exclude_columns)].values.tolist(), axis=1)
})

In [10]:
gemini_style_vectors = pd.DataFrame({
    'prompt': common_df.prompt,
    'response': common_df.response,
    'common_vectors': common_df.SummaryVector,
    'function_vectors': function_df.SummaryVector
})

gemini_style_vectors

Unnamed: 0,prompt,response,common_vectors,function_vectors
0,Respond to the following creative writing prom...,In the shadowy realm where space and time inte...,"[0, 2, 13, 1, 6, 1, 1, 2, 0, 1, 0, 0, 1, 3, 0,...","[1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
1,Respond to the following creative writing prom...,"In the clandestine world of time travel, where...","[0, 2, 17, 0, 10, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0...","[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
2,Respond to the following creative writing prom...,"In the year 2342, time travel had become a pop...","[0, 2, 17, 0, 8, 0, 0, 0, 0, 1, 1, 3, 0, 0, 0,...","[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Respond to the following creative writing prom...,"In the intricate labyrinth of time, where mome...","[0, 6, 11, 0, 7, 0, 0, 0, 0, 0, 2, 3, 0, 2, 0,...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Respond to the following creative writing prom...,In the realm of temporal anomalies and audacio...,"[0, 0, 24, 0, 7, 1, 0, 1, 0, 0, 2, 0, 0, 1, 0,...","[0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, ..."
...,...,...,...,...
795,"To get in Heaven, you have to confront the per...",In the hushed tranquility of the celestial rea...,"[1, 2, 14, 0, 1, 0, 0, 1, 0, 3, 4, 0, 0, 0, 0,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
796,"To get in Heaven, you have to confront the per...","As I arrived at the pearly gates of Heaven, my...","[0, 2, 13, 0, 7, 2, 2, 1, 0, 2, 2, 0, 0, 0, 0,...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
797,"To get in Heaven, you have to confront the per...","In the ethereal realm, where clouds caressed t...","[0, 3, 8, 0, 6, 0, 1, 1, 0, 1, 4, 3, 0, 1, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
798,"To get in Heaven, you have to confront the per...","In the ethereal realm of the afterlife, I stoo...","[2, 2, 25, 0, 6, 1, 2, 4, 0, 5, 6, 0, 0, 0, 0,...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, ..."


In [11]:
gemini_style_vectors.to_csv("gemini_style_vectors.csv")