In [1]:

# imports
import os
import sys
import types
import json

# figure size/format
fig_width = 5.5
fig_height = 3.5
fig_format = 'pdf'
fig_dpi = 300

# matplotlib defaults / format
# Best-effort: any failure here (matplotlib or IPython missing, no inline
# backend available, ...) is deliberately ignored so the setup cell still runs.
try:
  import matplotlib.pyplot as plt
  plt.rcParams['figure.figsize'] = (fig_width, fig_height)
  plt.rcParams['figure.dpi'] = fig_dpi
  plt.rcParams['savefig.dpi'] = fig_dpi
  try:
    # `IPython.display.set_matplotlib_formats` is deprecated since IPython 7.23;
    # the supported entry point lives in the matplotlib_inline package.
    from matplotlib_inline.backend_inline import set_matplotlib_formats
  except ImportError:
    # Older environments without matplotlib_inline: use the legacy location.
    from IPython.display import set_matplotlib_formats
  set_matplotlib_formats(fig_format)
except Exception:
  pass

# plotly use connected mode
# Best-effort: sets plotly's default renderer to "notebook_connected" when
# plotly can be imported; any exception (including plotly not being
# installed) is swallowed so the setup cell keeps running.
try:
  import plotly.io as pio
  pio.renderers.default = "notebook_connected"
except Exception:
  pass

# enable pandas latex repr when targeting pdfs
# Best-effort: a missing pandas or an unknown option is silently ignored.
try:
  import pandas as pd
  if fig_format == 'pdf':
    # attribute-style access to the same 'display.latex.repr' option
    pd.options.display.latex.repr = True
except Exception:
  pass



# output kernel dependencies
# Maps the source path of every loaded module to its modification time and
# prints the mapping as a single JSON line.
kernel_deps = {}
for module in list(sys.modules.values()):
  # sys.modules can hold objects that are not ordinary modules (e.g. the
  # standard library's email package plays games with it), and getattr on
  # those can fail in odd ways -- so only real modules are asked for a
  # __file__.
  path = getattr(module, "__file__", None) if isinstance(module, types.ModuleType) else None
  if not path:
    continue
  # map a compiled file name (.pyc / .pyo) back to its .py source
  if path.endswith((".pyc", ".pyo")):
    path = path[:-1]
  if os.path.exists(path):
    kernel_deps[path] = os.stat(path).st_mtime
print(json.dumps(kernel_deps))

# set run_path if requested
# The path is an absolute, machine-specific location. Only change directory
# when it actually exists, so the notebook still starts on other machines
# (relative paths then resolve against the current working directory).
run_path = r'/Users/iman/Documents/GitHub/Omnichannel-LLMs'
if run_path and os.path.isdir(run_path):
  os.chdir(run_path)
elif run_path:
  import warnings
  warnings.warn("run_path %r does not exist; keeping working directory %r" % (run_path, os.getcwd()))

# reset state
# IPython line magic (only valid inside an IPython/Jupyter kernel): clears the
# user namespace, so names created above in this cell do not leak into the
# notebook. `ojs_define` is defined after this line and therefore survives.
%reset

def ojs_define(**kwargs):
  """Publish Python values to Observable JS as an ``ojs-define`` script tag.

  Each keyword argument becomes one ``{"name": ..., "value": ...}`` entry in
  a JSON payload that is displayed as HTML with ``ojs_define`` metadata.

  Args:
    **kwargs: name/value pairs to expose. pandas Series and DataFrames
      (including subclasses) are converted to a ``{column: [values]}``
      mapping; every other value is passed to ``json.dumps`` unchanged.

  Raises:
    TypeError: if a value is not JSON serializable.
  """
  import json
  try:
    # IPython 7.14 preferred import
    from IPython.display import display, HTML
  except ImportError:
    # only a failed import should trigger the legacy location
    from IPython.core.display import display, HTML

  # do some minor magic for convenience when handling pandas
  # dataframes
  def convert(v):
    try:
      import pandas as pd
    except ModuleNotFoundError: # don't do the magic when pandas is not available
      return v
    # isinstance (not an exact type comparison) so pandas subclasses are
    # converted too instead of failing later in json.dumps
    if isinstance(v, pd.Series):
      v = pd.DataFrame(v)
    if isinstance(v, pd.DataFrame):
      # transpose so that 'split' orientation yields one data row per column
      j = json.loads(v.T.to_json(orient='split'))
      return dict(zip(j["index"], j["data"]))
    return v

  v = dict(contents=[dict(name=key, value=convert(value)) for (key, value) in kwargs.items()])
  display(HTML('<script type="ojs-define">' + json.dumps(v) + '</script>'), metadata=dict(ojs_define = True))
globals()["ojs_define"] = ojs_define





`set_matplotlib_formats` is deprecated since IPython 7.23, directly use `matplotlib_inline.backend_inline.set_matplotlib_formats()`



In [2]:
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
import spacy

In [3]:
# Load the responses and relabel the three columns with short names.
chatgpt = (
    pd.read_csv('chatgpt responses.csv')
    .set_axis(['billboard', 'website', 'salesman'], axis=1)
)

In [4]:
def discard_quatation_marks(sentence):
    """Discards one pair of enclosing double quotation marks from a sentence.

    The first and last characters are removed only when both are double
    quotation marks (straight or curly). Text that is not wrapped in quotes
    is returned unchanged, so applying the function twice gives the same
    result as applying it once.

    Args:
    sentence: A string representing a sentence. Non-string values (for
        example NaN for a missing cell) are returned unchanged.

    Returns:
    A string representing the sentence without the enclosing quotation marks.
    """
    quote_marks = ('"', '\u201c', '\u201d')

    # Missing cells arrive as float NaN; slicing them would raise TypeError.
    if not isinstance(sentence, str):
        return sentence

    # Only strip when the sentence is really wrapped in quotes; blindly
    # slicing [1:-1] would cut real letters off unquoted text.
    if len(sentence) >= 2 and sentence[0] in quote_marks and sentence[-1] in quote_marks:
        return sentence[1:-1]
    return sentence

# Run the helper on every cell. This rebinds `chatgpt`, so re-running the
# cell applies the helper again to the already-processed text.
chatgpt = chatgpt.applymap(func=discard_quatation_marks)

In [5]:
def str_len(x):
    """Return the length of each string in a pandas Series.

    Args:
    x: A pandas Series of strings (one DataFrame column when used with
        ``DataFrame.apply``).

    Returns:
    A Series with the same index holding the character count of each entry.
    """
    lengths = x.str.len()
    return lengths

# Column-wise application: each column is passed to str_len as a Series.
chatgpt_counts = chatgpt.apply(str_len, axis=0)

In [6]:
#| tbl-cap: Words Summary Statistics
#| label: tbl-wordsum
# NOTE(review): chatgpt_counts is built with Series.str.len(), i.e. character
# counts, while the caption says "Words" -- confirm which one is intended.
chatgpt_counts.describe().round(2)


In future versions `DataFrame.to_latex` is expected to utilise the base implementation of `Styler.to_latex` for formatting and rendering. The arguments signature may therefore change. It is recommended instead to use `DataFrame.style.to_latex` which also contains additional functionality.



Unnamed: 0,billboard,website,salesman
count,50.0,50.0,50.0
mean,37.64,46.32,85.14
std,7.06,8.3,8.33
min,20.0,28.0,68.0
25%,34.0,42.25,79.0
50%,38.0,45.5,85.0
75%,41.75,52.0,91.0
max,53.0,64.0,103.0


In [7]:
def word_count(sentences):
    """Show a word cloud of the non-stop-word tokens in a set of sentences.

    The sentences are joined, lower-cased and tokenized; tokens that are not
    purely alphabetic or that are English stop words are dropped, and the
    remaining tokens are rendered as a word cloud on a white background.

    Args:
    sentences: An iterable of strings (for example one DataFrame column).

    Returns:
    None. The figure is displayed with matplotlib.
    """
    # Build the stop-word set once: evaluating stopwords.words() for every
    # token and scanning the resulting list is needlessly slow.
    english_stops = set(stopwords.words('english'))

    text = ' '.join(sentences).lower()
    kept_tokens = [
        token for token in word_tokenize(text)
        if token.isalpha() and token not in english_stops
    ]

    cloud = WordCloud(background_color='white').generate(' '.join(kept_tokens))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

In [8]:
# One word cloud per column, in the same order as before.
for channel in ('billboard', 'website', 'salesman'):
    word_count(chatgpt[channel])

<Figure size 1650x1050 with 1 Axes>

<Figure size 1650x1050 with 1 Axes>

<Figure size 1650x1050 with 1 Axes>