In [1]:
import pandas as pd
import string
from sklearn.feature_extraction.text import CountVectorizer

## Only Gemini is shown in this notebook, but the same functions were used for all responses

In [2]:
# Load the Gemini responses; the CSV's 'index' column (if present) is exposed
# under the name 'row'. The 'prompt' and 'response' columns are used below.
df = pd.read_csv("gemini.csv").rename(columns={'index': 'row'})

In [3]:
# The 100 most frequent words of the 450-million-word Corpus of Contemporary
# American English (COCA). List position doubles as the feature index of the
# "common" style vector, so the order must not change. Ten words per row.
common = [
    "the", "to", "and", "of", "a", "in", "i", "that", "you", "it",
    "is", "for", "on", "was", "he", "with", "this", "as", "n't", "we",
    "be", "have", "are", "not", "but", "at", "they", "do", "what", "his",
    "from", "by", "or", "she", "my", "all", "an", "there", "so", "her",
    "about", "me", "one", "had", "if", "your", "can", "who", "no", "out",
    "has", "their", "were", "like", "just", "would", "up", "when", "more", "will",
    "know", "said", "did", "been", "people", "get", "him", "time", "them", "some",
    "how", "now", "which", "could", "think", "than", "our", "into", "other", "right",
    "here", "well", "new", "then", "because", "go", "see", "back", "only", "these",
    "over", "going", "us", "also", "two", "first", "its", "even", "good", "way",
]

In [4]:
# word -> column index, in list order; passed to CountVectorizer as a fixed vocabulary.
map_common = {token: position for position, token in enumerate(common)}

In [5]:
# 277 function words taken from James O'Shea's research on dialogue act
# classification. List position doubles as the feature index of the "function"
# style vector, so the order must not change. Rows are grouped by initial letter.
function = [
    # a
    "a", "about", "above", "across", "after", "afterwards", "again", "against",
    "all", "almost", "alone", "along", "already", "also", "although", "always",
    "am", "among", "amongst", "amoungst", "an", "and", "another", "any", "anyhow",
    "anyone", "anything", "anyway", "anywhere", "are", "around", "as", "at",
    # b
    "be", "became", "because", "been", "before", "beforehand", "behind", "being",
    "below", "beside", "besides", "between", "beyond", "both", "but", "by",
    # c
    "can", "cannot", "could",
    # d
    "dare", "despite", "did", "do", "does", "done", "down", "during",
    # e
    "each", "eg", "either", "else", "elsewhere", "enough", "etc", "even",
    "ever", "every", "everyone", "everything", "everywhere", "except",
    # f
    "few", "first", "for", "former", "formerly", "from", "further", "furthermore",
    # h
    "had", "has", "have", "he", "hence", "her", "here", "hereabouts", "hereafter",
    "hereby", "herein", "hereinafter", "heretofore", "hereunder", "hereupon",
    "herewith", "hers", "herself", "him", "himself", "his", "how", "however",
    # i
    "i", "ie", "if", "in", "indeed", "inside", "instead", "into", "is", "it",
    "its", "itself",
    # l
    "last", "latter", "latterly", "least", "less", "lot", "lots",
    # m
    "many", "may", "me", "meanwhile", "might", "mine", "more", "moreover",
    "most", "mostly", "much", "must", "my", "myself",
    # n
    "namely", "near", "need", "neither", "never", "nevertheless", "next", "no",
    "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere",
    # o
    "of", "off", "often", "oftentimes", "on", "once", "one", "only", "onto",
    "or", "other", "others", "otherwise", "ought", "our", "ours", "ourselves",
    "out", "outside", "over",
    # p
    "per", "perhaps",
    # r
    "rather", "re",
    # s
    "same", "second", "several", "shall", "she", "should", "since", "so",
    "some", "somehow", "someone", "something", "sometime", "sometimes",
    "somewhat", "somewhere", "still", "such",
    # t
    "than", "that", "the", "their", "theirs", "them", "themselves", "then",
    "thence", "there", "thereabouts", "thereafter", "thereby", "therefore",
    "therein", "thereof", "thereon", "thereupon", "these", "they", "third",
    "this", "those", "though", "through", "throughout", "thru", "thus", "to",
    "together", "too", "top", "toward", "towards",
    # u
    "under", "until", "up", "upon", "us", "used",
    # v
    "very", "via",
    # w
    "was", "we", "well", "were", "what", "whatever", "when", "whence",
    "whenever", "where", "whereafter", "whereas", "whereby", "wherein",
    "whereupon", "wherever", "whether", "which", "while", "whither", "who",
    "whoever", "whole", "whom", "whose", "why", "whyever", "will", "with",
    "within", "without", "would",
    # y
    "yes", "yet", "you", "your", "yours", "yourself", "yourselves",
]

In [6]:
# word -> column index, in list order; passed to CountVectorizer as a fixed vocabulary.
map_function = {token: position for position, token in enumerate(function)}

In [7]:
# Token pattern shared by both vectorizers: a word character followed by any
# run of word characters / apostrophes, ending on a word boundary, so
# contractions such as "didn't" stay a single token.
#
# The previous form used a possessive quantifier (`[\w']*+`). That is only
# valid on Python >= 3.11, and because it cannot backtrack, any word directly
# followed by an apostrophe ("dogs'", or a closing single quote as in 'yes')
# failed the trailing `\b` and was silently dropped from the counts. It also
# produced empty-string matches. A leading `\w` plus an ordinary greedy
# quantifier fixes all three.
#
# NOTE(review): contractions are kept whole, so the "n't" entry of the COCA
# list is only counted when "n't" appears as a standalone token -- confirm
# whether contractions should be split before counting.
r = r"(?u)\b\w[\w']*\b"

In [8]:
# One counter per word list. A fixed vocabulary pins each column to the word's
# position in its list; both counters tokenize with the shared pattern `r`.
common_vectorizer = CountVectorizer(vocabulary=map_common, token_pattern=r)
function_vectorizer = CountVectorizer(vocabulary=map_function, token_pattern=r)

In [9]:
# Count every COCA word in every response (rows = responses, columns = words).
dtm = common_vectorizer.fit_transform(df["response"])

# Dense, labelled copy of the counts: one column per vocabulary word.
dtm_df = pd.DataFrame(
    data=dtm.toarray(),
    columns=common_vectorizer.get_feature_names_out(),
)

# Collapse each row into a plain Python list -> one count vector per response.
# NOTE(review): this rebinds `common`, which until now held the word list;
# re-running the cell that builds `map_common` after this one will fail unless
# the word-list cell is re-run first.
common = dtm_df.apply(lambda counts: counts.values.tolist(), axis=1)

In [10]:
# Count every function word in every response (rows = responses, columns = words).
# `dtm` and `dtm_df` are reused here and now refer to the function-word counts.
dtm = function_vectorizer.fit_transform(df["response"])

# Dense, labelled copy of the counts: one column per vocabulary word.
dtm_df = pd.DataFrame(
    data=dtm.toarray(),
    columns=function_vectorizer.get_feature_names_out(),
)

# Collapse each row into a plain Python list -> one count vector per response.
# NOTE(review): this rebinds `function`, which until now held the word list;
# re-running the cell that builds `map_function` after this one will fail
# unless the word-list cell is re-run first.
function = dtm_df.apply(lambda counts: counts.values.tolist(), axis=1)

In [11]:
# Put each prompt/response next to its two count vectors. The pieces are
# aligned on their index when the frame is built.
gemini_style_vectors = pd.DataFrame(
    dict(
        prompt=df["prompt"],
        response=df["response"],
        common=common,
        function=function,
    )
)

gemini_style_vectors

Unnamed: 0,prompt,response,common,function
0,Respond to the following creative writing prom...,In the shadowy realm where space and time inte...,"[51, 7, 13, 27, 20, 12, 17, 10, 0, 4, 3, 4, 0,...","[20, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0,..."
1,Respond to the following creative writing prom...,"In the clandestine world of time travel, where...","[32, 13, 17, 24, 20, 7, 10, 3, 0, 1, 0, 4, 2, ...","[20, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,..."
2,Respond to the following creative writing prom...,"In the year 2342, time travel had become a pop...","[22, 5, 17, 8, 12, 3, 10, 1, 0, 0, 0, 5, 2, 5,...","[12, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,..."
3,Respond to the following creative writing prom...,"In the intricate labyrinth of time, where mome...","[42, 12, 11, 29, 10, 12, 13, 6, 0, 2, 0, 2, 1,...","[10, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,..."
4,Respond to the following creative writing prom...,In the realm of temporal anomalies and audacio...,"[58, 15, 24, 37, 20, 14, 12, 5, 0, 0, 1, 3, 3,...","[20, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0,..."
...,...,...,...,...
795,"To get in Heaven, you have to confront the per...",In the hushed tranquility of the celestial rea...,"[23, 11, 14, 9, 9, 6, 21, 7, 0, 5, 0, 4, 0, 6,...","[9, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
796,"To get in Heaven, you have to confront the per...","As I arrived at the pearly gates of Heaven, my...","[23, 16, 14, 7, 12, 6, 36, 3, 2, 3, 0, 2, 1, 5...","[12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,..."
797,"To get in Heaven, you have to confront the per...","In the ethereal realm, where clouds caressed t...","[24, 8, 8, 16, 7, 11, 27, 13, 0, 4, 0, 4, 1, 8...","[7, 3, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
798,"To get in Heaven, you have to confront the per...","In the ethereal realm of the afterlife, I stoo...","[23, 10, 25, 14, 10, 12, 35, 10, 0, 3, 0, 4, 1...","[10, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0,..."


In [12]:
# Persist the result. The index is written as an unnamed first column, and the
# list-valued 'common' / 'function' cells are stored as text, so a reader has
# to parse them back into lists after loading.
gemini_style_vectors.to_csv(path_or_buf="gemini_style_vectors.csv")