Running all cells in order will set up the pipeline to perform style transfer. The content of the cells at the very bottom of the notebook may be edited to perform style transfer on new input sentences.

Note that the average processing time for a single sentence is about 1 minute.

In [None]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('brown')
from nltk.corpus import brown
from nltk.tokenize.treebank import TreebankWordDetokenizer # https://stackoverflow.com/questions/21948019/python-untokenize-a-sentence
from nltk.collocations import *
nltk.download('averaged_perceptron_tagger')
from collections import Counter

import numpy as np
import scipy
import math

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
!pip install wget


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9657 sha256=ce3f8ca62f3dd2669137bca3259343193404635be87331b5cd7c1a743b98c55d
  Stored in directory: /root/.cache/pip/wheels/8b/f1/7f/5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [None]:
!wget -q -O british_corpus.txt "https://drive.google.com/uc?export=download&id=1aat6dMUkbAUt95qlSaCXzh2ryeT7NVBp"
# Read the downloaded corpus file as one string, then split it into NLTK word tokens.
# The raw text and the token list get separate names so each name holds one kind of value.
with open('british_corpus.txt') as corpus_file:
  british_corpus_text = corpus_file.read()
british_corpus = nltk.word_tokenize(british_corpus_text)

In [None]:
from nltk.corpus import stopwords
import string
# Tokens dropped during corpus cleaning: NLTK's English stop words plus every
# character of string.punctuation.
nltk_stopwords = stopwords.words('english')
stopwords_punc = set(nltk_stopwords).union(string.punctuation)

from nltk.probability import FreqDist

In [None]:
# British corpus: keep alphabetic tokens only, lower-case them, drop anything in
# stopwords_punc, then count how often each remaining word occurs.
british_corpus_clean = [
    token
    for token in (raw.lower() for raw in british_corpus if raw.isalpha())
    if token not in stopwords_punc
]
allWordDist = nltk.FreqDist(token.lower() for token in british_corpus_clean)

# Brown corpus: identical cleaning, used later for the frequency score.
brown = nltk.corpus.brown.words()
brown_clean = [
    token
    for token in (raw.lower() for raw in brown if raw.isalpha())
    if token not in stopwords_punc
]
brownWordDist = nltk.FreqDist(token.lower() for token in brown_clean)


# creates a dictionary that associates each word with a unique value
# (its position in allWordDist's iteration order, starting at 0)
dictionary = {word: position for position, word in enumerate(allWordDist)}

In [None]:
# testing: spot-check the cleaned corpus, the frequency tables and the word index
print(british_corpus_clean[:100])
print(len(allWordDist))
print(allWordDist["small"])
print(brownWordDist["little"])
print(allWordDist["ok"])
# Show only the first few (word, index) pairs: printing the whole mapping emits one
# entry per word type and floods the cell output.
print(list(dictionary.items())[:10])

['racism', 'still', 'problem', 'within', 'society', 'today', 'many', 'ethnic', 'minorities', 'face', 'inequalities', 'many', 'areas', 'including', 'education', 'housing', 'employment', 'ethnic', 'minorities', 'concentrated', 'certain', 'areas', 'job', 'market', 'manufacture', 'communication', 'brown', 'likely', 'victims', 'assault', 'abercrombie', 'et', 'al', 'recent', 'surveys', 'shown', 'racist', 'ideas', 'still', 'exist', 'society', 'seen', 'survey', 'carried', 'asked', 'white', 'sample', 'whether', 'agreed', 'disagreed', 'statement', 'enriched', 'quality', 'life', 'britain', 'nearly', 'half', 'sample', 'disagreed', 'abercrombie', 'et', 'al', 'essay', 'look', 'racism', 'defined', 'contemporary', 'society', 'explore', 'still', 'persists', 'section', 'cover', 'three', 'areas', 'think', 'contributed', 'continued', 'existence', 'racism', 'culture', 'economy', 'politics', 'meant', 'racism', 'integral', 'internalised', 'part', 'society', 'however', 'looking', 'individual', 'specific', 'ca

In [None]:
# imports for misc. word substitution functions
from collections import Counter
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from nltk.wsd import lesk

!pip install ety
import ety

!pip install lemminflect
import lemminflect

[nltk_data] Downloading package wordnet to /root/nltk_data...


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ety
  Downloading ety-1.4.0-py2.py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting treelib (from ety)
  Downloading treelib-1.6.4-py3-none-any.whl (18 kB)
Collecting colorful (from ety)
  Downloading colorful-0.5.5-py2.py3-none-any.whl (201 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m201.4/201.4 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: colorful, treelib, ety
Successfully installed colorful-0.5.5 ety-1.4.0 treelib-1.6.4
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting lemminflect
  Downloading lemminflect-0.2.3-py3-none-any.whl (769 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m769.7/769.7 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00

In [None]:
# Frequency tables derived from the British English Academic corpus tokens.

# counts over the raw (uncleaned) tokens; original note: used for word substitutions
freq_dict = Counter(british_corpus)
initial_sentence_example = "hello world"
# frequency of each (first word, second word) pair of adjacent cleaned tokens
bigram_count = dict(Counter(nltk.bigrams(british_corpus_clean)))
# the same frequencies re-keyed as (second word, first word), so that key[0] is the
# later word of the pair and key[1] is the word that preceded it
bigram_dict = {
  (second, first): pair_count
  for (first, second), pair_count in bigram_count.items()
}

In [None]:
def match_inflection(word1, word2):
  """Use lemminflect to give ``word2`` the same inflection as ``word1``.

  Args:
    word1: ``(word, pos_tag)`` tuple; only the Penn Treebank tag at index 1 is read.
    word2: the replacement word, as a string.

  Returns:
    ``word2`` unchanged when lemminflect reports the tag as a base form (nothing
    to inflect) or when it produces no inflection; otherwise the first spelling
    returned by ``lemminflect.getInflection``.
  """
  penntag1 = word1[1]

  # if this POS is not inflected (i.e. already base lemma) then just return 2nd word
  if lemminflect.isTagBaseForm(penntag1):
    return word2

  # may return multiple spellings, this just returns first one
  inflections = lemminflect.getInflection(word2, tag=penntag1, inflect_oov=True)
  return inflections[0] if inflections else word2
    

In [None]:
match_inflection(("students", "NNS"), "educatee")

'educatees'

In [None]:
def synonyms(input_sentence):
  """Swap each eligible token of a tokenized sentence for a chosen synonym.

  Args:
    input_sentence: list of token strings (it is passed to ``nltk.pos_tag``).

  Returns:
    A new list with one entry per input token. Tokens whose lower-cased form is
    an English stop word or a punctuation character, or whose tag is ``PRP`` or
    ``MD``, are copied through unchanged. Every other token is replaced by the
    result of find_synonyms -> choose_synonym -> match_inflection, capitalized
    when the original token started with an upper-case letter.
  """
  tagged_tokens = nltk.pos_tag(input_sentence)
  stop_words = set(stopwords.words('english'))
  punctuation_marks = set(string.punctuation)
  untouched_tags = ("PRP", "MD")

  rewritten = []
  for token, tag in tagged_tokens:
    lowered = token.lower()
    # TODO: ignore prepositions, proper nouns, and stop words
    if lowered in stop_words or tag in untouched_tags or lowered in punctuation_marks:
      rewritten.append(token)
      continue

    candidates = find_synonyms((lowered, tag))
    chosen = choose_synonym(input_sentence, lowered, tag, candidates)
    replacement = match_inflection((lowered, tag), chosen)
    if token[0].isupper():
      replacement = replacement.capitalize()
    rewritten.append(replacement)

  return rewritten


In [None]:
# TODO: a lot of the synsets don't really have the same sense. Is there a way to figure this out, potentially by implementing some kind of bigram similarity measurement type thing?
def find_synonyms(w):
  """Collect WordNet lemma names for a word, restricted to its part of speech.

  Args:
    w: ``(word, penn_tag)`` tuple. Tags starting with ``VB``/``JJ``/``RB`` map to
      WordNet verb/adjective/adverb; every other tag is looked up as a noun.

  Returns:
    List of unique lemma names in first-seen order (synset order, then lemma
    order). When WordNet has no synset for the word, the list holds only the
    word itself.
  """
  word, penntag = w[0], w[1]

  if penntag.startswith('VB'):
    pos = wn.VERB
  elif penntag.startswith('JJ'):
    pos = wn.ADJ
  elif penntag.startswith('RB'):
    pos = wn.ADV
  else:
    pos = wn.NOUN

  syn_list = [
      lemma.name()
      for syn in wn.synsets(word, pos=pos)
      for lemma in syn.lemmas()
  ]
  if not syn_list:
    syn_list.append(word)

  # removes duplicates while keeping first-seen order; a plain set() would make
  # the order (and therefore tie-breaking in the caller) vary between kernel runs
  return list(dict.fromkeys(syn_list))

In [None]:
def choose_synonym(input_sentence, main_word, word_pos, syn_list, use_lesk=False):
  """Pick the highest-scoring replacement for ``main_word`` out of ``syn_list``.

  Each candidate's score is the sum of:
    * +2 when ``etymology(candidate)`` is true (Latin or Greek origin),
    * ``frequency_score(candidate) * 4`` (rarer words score higher),
    * ``len(candidate) / 2`` (longer words score higher),
    * ``-(distance - 250) / 100`` for candidates other than ``main_word``, where
      ``distance`` is the Euclidean distance between the two context vectors,
    * +2 when ``use_lesk`` is set and the candidate is a lemma of the sense the
      Lesk algorithm selects for ``main_word`` in ``input_sentence``.

  Args:
    input_sentence: tokenized sentence; only consulted when ``use_lesk`` is true.
    main_word: the word being replaced.
    word_pos: Penn Treebank tag of ``main_word``; only consulted when
      ``use_lesk`` is true.
    syn_list: candidate replacement strings.
    use_lesk: enable the Lesk bonus. Defaults to False, matching the previous
      behaviour where Lesk did not factor into the selection.

  Returns:
    The best-scoring candidate (the later one wins a tie). Falls back to
    ``main_word`` when ``syn_list`` is empty or no candidate has a comparable
    score, so callers never receive ``None``.
  """
  lesk_lemmas = []
  if use_lesk:
    # converts to wordnet parts of speech
    if word_pos.startswith('VB'):
      pos = wn.VERB
    elif word_pos.startswith('JJ'):
      pos = wn.ADJ
    elif word_pos.startswith('RB'):
      pos = wn.ADV
    else:
      pos = wn.NOUN

    sense = lesk(input_sentence, main_word, pos=pos)
    if sense is not None:
      lesk_lemmas = [lemma.name() for lemma in sense.lemmas()]

  # Start below every real score: the distance penalty can push all candidates
  # under zero, and a zero floor would then leave no winner at all.
  max_score = float("-inf")
  max_word = main_word
  main_vec = make_vector(main_word)
  for word in syn_list:
    score = 0

    # adds to score if word is latinate or greek in origin
    if etymology(word):
      score += 2

    # adds to score for less frequent words
    score += frequency_score(word) * 4

    # longer words will get bigger length bonus
    score += (len(word) / 2)

    # adds to score if higher vector similarity to original word
    if word != main_word:
      vec = make_vector(word)
      euc = euclidean(main_vec, vec)
      score -= (euc - 250) / 100

    if word in lesk_lemmas:
      score += 2

    if score >= max_score:
      max_word = word
      max_score = score

  return max_word

In [None]:
def etymology(word):
    """Report whether a word's etymology tree contains a Latin or Greek ancestor.

    Builds the ``ety`` tree for ``word`` (as an English word) and reads the part of
    every node identifier that follows its first ``":"`` as a language code.

    Returns:
        True if any node's language code is ``"lat"`` or ``"grc"``, else False.
    """
    preferred_langs = ("lat", "grc")
    origin_tree = ety.tree(word, language="eng")
    # Every node is decoded before testing, so an identifier without ":" still
    # raises ValueError from str.index regardless of where it sits in the tree.
    node_langs = [
        node.identifier[node.identifier.index(":") + 1:]
        for node in origin_tree.all_nodes_itr()
    ]
    return any(lang in preferred_langs for lang in node_langs)

def frequency_score(word):
  """Score a word by how rarely it occurs in the cleaned Brown corpus.

  Returns ``1 - occurrences / distinct_words``; a higher score means a less
  frequent word, and a word absent from ``brownWordDist`` scores 1.

  NOTE(review): the denominator is ``len(brownWordDist)``, i.e. the number of
  distinct entries in the distribution rather than the total token count --
  confirm this is the intended normalisation.
  """
  occurrences = brownWordDist[word]
  distinct_words = len(brownWordDist)
  return 1 - occurrences / distinct_words



In [None]:
# Memo of already-computed vectors. The key includes the identity and length of
# the corpus list so that rebuilding british_corpus_clean invalidates old entries.
_VECTOR_CACHE = {}
# Number of tokens on each side of an occurrence that count as its context.
_CONTEXT_WINDOW = 20


def make_vector(word):
  """Build a z-scored context (co-occurrence) vector for ``word``.

  For every occurrence of ``word`` in ``british_corpus_clean`` the tokens up to
  20 positions before and after it are counted. The counts are placed at each
  context word's index from ``dictionary`` in a vector with one slot per entry
  of ``allWordDist``, and the vector is standardised with ``scipy.stats.zscore``.

  Args:
    word: the word to vectorise.

  Returns:
    numpy array of length ``len(allWordDist)``. A word with zero count in
    ``allWordDist`` yields an all-zero integer vector (not z-scored). A fresh
    copy is returned on every call, so callers may modify it freely.
  """
  # l = number of types in the corpus
  l = len(allWordDist)
  if allWordDist[word] == 0:
    return np.zeros(l, dtype=int)

  corpus = british_corpus_clean
  cache_key = (word, id(corpus), len(corpus))
  cached = _VECTOR_CACHE.get(cache_key)
  if cached is not None:
    return cached.copy()

  # Slicing clamps at the list boundaries, which replaces padding a full copy
  # of the corpus with 20 blank tokens on either end.
  frequencies = Counter()
  for position, token in enumerate(corpus):
    if token == word:
      frequencies.update(corpus[max(0, position - _CONTEXT_WINDOW):position])
      frequencies.update(corpus[position + 1:position + 1 + _CONTEXT_WINDOW])

  vec = np.zeros(l, dtype=int)
  for item, item_count in frequencies.items():
    vec[dictionary[item]] = item_count
  vec = scipy.stats.zscore(vec)

  _VECTOR_CACHE[cache_key] = vec
  return vec.copy()

def euclidean(vec1, vec2):
  """Return the Euclidean distance between two numpy vectors as a Python float."""
  delta = vec1 - vec2
  squared_total = np.sum(delta * delta)
  return math.sqrt(squared_total)
  

In [None]:
# misc tests

# tw = "make"
# sl = find_synonyms((tw, "VB"))
# print(sl)
# main_vec = make_vector(tw)

# scores = {}
# for synonym in sl:
#   if synonym == tw:
#     continue
#   test_vec = make_vector(synonym)
#   scores[synonym] = euclidean(main_vec, test_vec)
#   print(tw + " " + synonym + " " + str(euclidean(main_vec, test_vec)))

# new = sorted(scores.items(), key=lambda x:x[1])
# for item in new:
#  print(item)

In [None]:
def insert_adjs(input_sentence, n):
  """Insert up to ``n`` adjectives in front of each noun of a tokenized sentence.

  For every token tagged ``NN`` or ``NNS``, the words that preceded that exact
  token in the corpus bigrams (``bigram_dict``) and are tagged as adjectives are
  ranked by bigram frequency; the ``n`` most frequent are inserted directly
  before the noun, most frequent closest to it. The word "not" is never inserted.

  Matching is case-sensitive, and ``bigram_dict`` is built from the lower-cased
  corpus, so capitalised tokens find no adjectives.

  Args:
    input_sentence: list of token strings. It is NOT modified.
    n: maximum number of adjectives to insert per noun.

  Returns:
    A new token list containing the original tokens plus the inserted adjectives.
  """
  nouns = {"NN", "NNS"}
  adjectives = {"JJ", "JJR", "JJS"}

  pos_tagged = nltk.pos_tag(input_sentence)
  # Work on a copy: inserting into the caller's list silently changed it.
  result = list(input_sentence)
  # Walk right-to-left so earlier indices stay valid after each insertion.
  for j in reversed(range(len(pos_tagged))):
    token, tag = pos_tagged[j]
    if tag not in nouns:
      continue

    freqs = {}
    # bigram_dict keys are (later word, preceding word)
    for (later_word, prev_word), freq in bigram_dict.items():
      if later_word != token or prev_word == "not":
        continue
      if nltk.pos_tag([prev_word])[0][1] in adjectives:
        freqs[prev_word] = freq

    for adj, _ in Counter(freqs).most_common(n):
      result.insert(j, adj)

  return result

initial_sentence_example = ["this", "paper", "contains", "research"]
print(insert_adjs(initial_sentence_example, 10))
initial_sentence_example = ["my", "dog", "is", "fuzzy"]
insert_adjs(initial_sentence_example, 3)
initial_sentence_example = ["code", "is", "difficult"]
insert_adjs(initial_sentence_example, 1)


[('recent', 94), ('previous', 80), ('social', 68), ('scientific', 62), ('much', 60), ('agricultural', 60), ('empirical', 56), ('current', 34), ('extensive', 28), ('little', 26)]
[('white', 60), ('green', 24), ('recent', 10), ('seminal', 6), ('hard', 6), ('original', 4), ('academic', 4), ('liberal', 4), ('technical', 4), ('conceptual', 4)]
['this', 'conceptual', 'technical', 'liberal', 'academic', 'original', 'hard', 'seminal', 'recent', 'green', 'white', 'paper', 'contains', 'little', 'extensive', 'current', 'empirical', 'agricultural', 'much', 'scientific', 'social', 'previous', 'recent', 'research']
['this', 'conceptual', 'technical', 'liberal', 'academic', 'original', 'hard', 'seminal', 'recent', 'green', 'white', 'paper', 'contains', 'little', 'extensive', 'current', 'empirical', 'agricultural', 'much', 'scientific', 'social', 'previous', 'recent', 'research']
[('domestic', 4), ('aggressive', 2), ('hot', 2)]
['my', 'hot', 'aggressive', 'domestic', 'dog', 'is', 'fuzzy']
[('moral', 4

['moral', 'code', 'is', 'difficult']

In [None]:
def insert_advs(input_sentence, n):
  """Insert up to ``n`` adverbs around each verb of a tokenized sentence.

  For every token with a verb or modal tag that is not a form of "to be", the
  adverbs seen next to that exact token in the corpus bigrams (``bigram_dict``)
  are ranked by bigram frequency. The slots are split between the two sides:
  ``n // 2`` adverbs go after the verb and the remainder (the extra one when
  ``n`` is odd) go before it. Negations ("not", "no", "n't") are never inserted.

  Args:
    input_sentence: list of token strings. It is NOT modified.
    n: maximum number of adverbs to insert per verb (non-negative integer).

  Returns:
    A new token list containing the original tokens plus the inserted adverbs.
  """
  after = n // 2
  before = n - after

  verbs = {"VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "MD"}
  adverbs = {"RB", "RBR", "RBS"}
  to_be = {"be", "am", "are", "is", "was", "were", "being", "been"}
  negs = {"not", "no", "n't"}

  pos_tagged = nltk.pos_tag(input_sentence)
  # Work on a copy: inserting into the caller's list silently changed it.
  result = list(input_sentence)
  # Walk right-to-left so earlier indices stay valid after each insertion.
  for j in reversed(range(len(pos_tagged))):
    token, tag = pos_tagged[j]
    # The "to be" filter must look at the token itself; testing the POS tag
    # against the word list could never match.
    if tag not in verbs or token.lower() in to_be:
      continue

    freqs_before = {}
    freqs_after = {}
    # bigram_dict keys are (later word, preceding word)
    for (later_word, prev_word), freq in bigram_dict.items():
      if later_word == token:
        other_word, bucket = prev_word, freqs_before
      elif prev_word == token:
        other_word, bucket = later_word, freqs_after
      else:
        continue
      if other_word in negs:
        continue
      if nltk.pos_tag([other_word])[0][1] in adverbs:
        bucket[other_word] = freq

    # Insert the trailing adverbs first so index j still points at the verb.
    for adv, _ in Counter(freqs_after).most_common(after):
      result.insert(j + 1, adv)
    for adv, _ in Counter(freqs_before).most_common(before):
      result.insert(j, adv)

  return result

initial_sentence_example = ["the", "dog", "ran"]
insert_advs(initial_sentence_example, 3)
initial_sentence_example = ["giving", "people", "gifts", "is", "nice"]
insert_advs(initial_sentence_example, 4)
initial_sentence_example = ["I", "write", "essays"]
insert_advs(initial_sentence_example, 2)


before [('therefore', 2), ('indeed', 2)]
after [('away', 8)]
['the', 'dog', 'indeed', 'therefore', 'ran', 'away']
before []
after []
before [('thus', 24), ('thereby', 12)]
after [('away', 12), ('less', 10)]
['thereby', 'thus', 'giving', 'less', 'away', 'people', 'gifts', 'is', 'nice']
before [('also', 14)]
after [('quickly', 2)]
['I', 'also', 'write', 'quickly', 'essays']


['I', 'also', 'write', 'quickly', 'essays']

In [None]:
trial = "pretentious english machine translation project"

adjs_trial = insert_adjs(nltk.word_tokenize(trial), 3)

[('overall', 42), ('whole', 38), ('new', 32)]
[('english', 10), ('direct', 4), ('successful', 4)]
[('local', 20), ('synchronous', 16), ('bureaucratic', 14)]
['pretentious', 'english', 'bureaucratic', 'synchronous', 'local', 'machine', 'successful', 'direct', 'english', 'translation', 'new', 'whole', 'overall', 'project']


In [None]:
!pip install git+https://github.com/PrithivirajDamodaran/Styleformer.git
from styleformer import Styleformer
import torch
import warnings
warnings.filterwarnings("ignore")
sf = Styleformer(style = 0) 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/PrithivirajDamodaran/Styleformer.git
  Cloning https://github.com/PrithivirajDamodaran/Styleformer.git to /tmp/pip-req-build-_vack208
  Running command git clone --filter=blob:none --quiet https://github.com/PrithivirajDamodaran/Styleformer.git /tmp/pip-req-build-_vack208
  Resolved https://github.com/PrithivirajDamodaran/Styleformer.git to commit 02c9a7fd6798bf5bbbb04456e5068566d6caef55
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers (from styleformer==0.1)
  Downloading transformers-4.29.0-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece (from styleformer==0.1)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

Downloading (…)lve/main/config.json:   0%|          | 0.00/913 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.89k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Casual to Formal model loaded...


In [None]:
# every character of string.punctuation, as a list of one-character strings
punctuation = [mark for mark in string.punctuation]

In [None]:
def apply_all_rules(initial_sentence):
  ''' Takes an input sentence as a string, tokenizes it, applies a series of
  rules, and returns the detokenized sentence as a string.

  Steps: (1) casual-to-formal transfer with Styleformer, keeping the input
  text when the transfer returns None; (2) synonym substitution on the formal
  tokens; (3) insertion of one adverb per verb and one adjective per noun;
  (4) the substituted words are written over the original tokens inside the
  lengthened sentence, which is then detokenized. Underscores (from multi-word
  WordNet lemmas) become spaces. Intermediate token lists are printed.

  Example input: 'hello world'
  Example output: 'hello small world'
  '''
  formal_sentence = sf.transfer(initial_sentence)
  if formal_sentence is None: # unable to transfer style
    formal_sentence = initial_sentence
  tokenized_sentence = nltk.word_tokenize(formal_sentence)
  print("Formal: " + str(tokenized_sentence))
  substituted = synonyms(tokenized_sentence)
  print("Synonyms: " + str(substituted))
  # The insertion helpers get their own copy, so tokenized_sentence stays the
  # untouched original and no re-tokenization is needed afterwards.
  longer = insert_adjs(insert_advs(list(tokenized_sentence), 1), 1)
  print("Additions: " + str(longer))

  # Walk the lengthened sentence and the original tokens in step: a token equal
  # to the next unmatched original is replaced by its synonym; anything else is
  # an inserted word and stays. The bound check covers words inserted after the
  # last original token.
  og_counter = 0
  for i in range(len(longer)):
    if og_counter < len(tokenized_sentence) and longer[i] == tokenized_sentence[og_counter]:
      longer[i] = substituted[og_counter]
      og_counter += 1

  output = TreebankWordDetokenizer().detokenize(longer)
  # add final punctuation if lost -- but not when the output already ends a
  # sentence, which used to yield endings such as ".!"
  sentence_enders = (".", "!", "?")
  if initial_sentence and initial_sentence[-1] in punctuation:
    last_char = output[-1:]
    if last_char != initial_sentence[-1] and last_char not in sentence_enders:
      output += initial_sentence[-1]
  output = output.replace("_", " ")
  return output


In [None]:
apply_all_rules("Racism is bad for society")

Formal: ['Racism', 'is', 'bad', 'for', 'society', '.']
Synonyms: ['Racial_discrimination', 'is', 'uncollectible', 'for', 'high_society', '.']
before []
after []
['Racism', 'is', 'bad', 'for', 'society', '.']
[('civil', 228)]
[]
['Racism', 'is', 'bad', 'for', 'civil', 'society', '.']
Additions: ['Racism', 'is', 'bad', 'for', 'civil', 'society', '.']


'Racial discrimination is uncollectible for civil high society.'

In [None]:
apply_all_rules("Programming is only difficult if you don't have a computer.")

Formal: ['Programming', 'is', 'only', 'difficult', 'if', 'you', 'do', 'not', 'have', 'a', 'computer', '.']
Synonyms: ['Computer_programming', 'is', 'only', 'unmanageable', 'if', 'you', 'do', 'not', 'have', 'a', 'information_processing_system', '.']
before []
after []
before []
after []
before []
after []
['Programming', 'is', 'only', 'difficult', 'if', 'you', 'do', 'not', 'have', 'a', 'computer', '.']
[('personal', 22)]
[]
['Programming', 'is', 'only', 'difficult', 'if', 'you', 'do', 'not', 'have', 'a', 'personal', 'computer', '.']
Additions: ['Programming', 'is', 'only', 'difficult', 'if', 'you', 'do', 'not', 'have', 'a', 'personal', 'computer', '.']


'Computer programming is only unmanageable if you do not have a personal information processing system.'

In [None]:
test_sentences = ["Hello world!",
                  "For our project, we tried to make pretentious English text.",
                  "You come here often?",
                  "On Monday he ate through one apple, but he was still hungry.",
                  "Hey dude, check out this sick program.",
                  "Racism is bad for society.",
                  "What is your favorite class?",
                  "Trash harms the environment.",
                  "Students are united by their love of coffee.",
                  "Programming is only difficult if you don't have a computer."]

In [None]:
for sentence in test_sentences:
  print("Input: " + sentence)
  print("Output: " + apply_all_rules(sentence))


Input: Hello world!
Formal: ['Hello', 'world', '.']
Synonyms: ['How-do-you-do', 'worldly_concern', '.']
['Hello', 'world', '.']
[('real', 318)]
['Hello', 'real', 'world', '.']
Additions: ['Hello', 'real', 'world', '.']
Output: How-do-you-do real worldly concern.!
Input: For our project, we tried to make pretentious English text.
Formal: ['We', 'tried', 'to', 'make', 'pretentious', 'English', 'text', 'for', 'our', 'project', '.']
Synonyms: ['We', 'examined', 'to', 'construct', 'pretentious', 'English_language', 'textual_matter', 'for', 'our', 'undertaking', '.']
before [('also', 74)]
after []
before [('also', 30)]
after []
['We', 'also', 'tried', 'to', 'also', 'make', 'pretentious', 'English', 'text', 'for', 'our', 'project', '.']
[('overall', 42)]
[('original', 28)]
['We', 'also', 'tried', 'to', 'also', 'make', 'pretentious', 'English', 'original', 'text', 'for', 'our', 'overall', 'project', '.']
Additions: ['We', 'also', 'tried', 'to', 'also', 'make', 'pretentious', 'English', 'origin

In [None]:
apply_all_rules("You come here often?")

Formal: ['Do', 'you', 'frequent', 'this', 'location', '?']
Synonyms: ['Do', 'you', 'frequent', 'this', 'localization', '?']
before [('less', 22)]
after []
before []
after []
['Do', 'you', 'less', 'frequent', 'this', 'location', '?']
[('geographical', 42)]
['Do', 'you', 'less', 'frequent', 'this', 'geographical', 'location', '?']
Additions: ['Do', 'you', 'less', 'frequent', 'this', 'geographical', 'location', '?']


'Do you less frequent this geographical localization?'

## Appendix ##
#### Below is the code for the original approach to informal to formal style conversion. ####

In [None]:
!pip install wget

!wget -q -O informal.txt "https://drive.google.com/uc?export=download&id=1zFj65fAZoHlh6xoGcWf1Oa0Yg6ijFw-x"
# Read the informal corpus: one whitespace-stripped sentence per line of informal.txt,
# then preview the first and last ten entries.
with open('informal.txt') as informal_file:
  informal_sentences = [raw_line.strip() for raw_line in informal_file]
print(informal_sentences[:10])
print(informal_sentences[-10:])

!wget -q -O formal.txt "https://drive.google.com/uc?export=download&id=1L4nxZGzAfwF9LDyalMtwQ1GIP3kxniX5"
# Read the formal corpus: one whitespace-stripped sentence per line of formal.txt,
# then preview the first and last ten entries.
with open('formal.txt') as formal_file:
  formal_sentences = [raw_line.strip() for raw_line in formal_file]
print(formal_sentences[:10])
print(formal_sentences[-10:])

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
['the movie The In-Laws not exactly a holiday movie but funny and good!', 'that page did not give me viroses(i think)', 'of corse i be wachin it evry day, my fav charachter is Inuasha', 'runescape.com (my kids love it) & funbrain.com  (educational)', "Is he gay?He was on Late Night with Conan O'Brien and he seemed pretty gay", 'id have to say mel gibson, a strong [god] beliver!!!', "i don't know.and my exams r not over yet still 1 remaining i.e on 23-dec.", 'Spy Kids 3D: Game Over 20.', 'because his mom was a wafer so long', 'Hope that helps or am I entirely off here?']
['Drown because i tried it before', 'its a country in central america', 'i havent seen it in a while, but i remember it being pretty good, but a little predictable.', "i think that it's ideal song to get in the mood!", "I never watch the show but I am afraid she will win because she's a Scientologist!", 'Amzing,Hilarouus p

In [None]:
!python -m spacy download en_core_web_lg
import spacy
nlp = spacy.load("en_core_web_lg")

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-lg==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.5.0/en_core_web_lg-3.5.0-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [None]:
# learn distances between informal and formal sentences:
# `distance` is the sum over paired sentences of (informal vector - formal vector),
# divided by the number of informal sentences
offset_sum = 0
for informal_sentence, formal_sentence in zip(informal_sentences, formal_sentences):
  informal_vector = nlp(informal_sentence).vector
  formal_vector = nlp(formal_sentence).vector
  offset_sum += informal_vector - formal_vector
distance = offset_sum / len(informal_sentences)
print(distance)

[ 3.87695074e-01 -2.39695664e-02  6.45375028e-02  3.20608728e-02
 -9.56575274e-01 -8.37302059e-02  2.59907007e-01 -3.05819184e-01
 -3.77694070e-02  5.03544835e-03 -1.19215131e+00  2.00598631e-02
  2.06226498e-01  2.62704134e-01 -3.78506668e-02 -6.25278592e-01
  2.98962265e-01  3.14114034e-01  1.90489426e-01 -1.87063590e-01
 -2.41422981e-01  2.42167369e-01 -2.47039020e-01  7.77992606e-02
  8.97524729e-02  2.09371492e-01  2.98010617e-01  2.61020064e-01
  2.26573981e-02  1.46727994e-01 -5.38769970e-03  1.37568831e-01
  6.19835854e-01  1.40120015e-01  1.01682651e+00 -3.28068845e-02
  2.03550056e-01 -2.70195622e-02  4.32173237e-02  7.07043856e-02
 -7.95390923e-03 -4.29806143e-01  4.02283520e-01 -3.95629525e-01
  5.93028843e-01 -2.91225076e-01 -4.20287728e-01  1.97732612e-01
 -7.76826292e-02  8.35809112e-02  3.74678999e-01 -5.82646549e-01
 -3.07717502e-01  1.05112827e+00 -4.73160684e-01  2.65375227e-01
  1.86345503e-01  2.48914868e-01  4.39518616e-02  3.21789496e-02
  1.70126215e-01  3.37998

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial
def cosine_similarity(x, y):
  """Cosine similarity of two 1-D vectors: 1 minus SciPy's cosine distance (from Practicum 5).

  Defined after the sklearn import of the same name, so this version is the one in effect.
  """
  return 1 - spatial.distance.cosine(x, y)

In [None]:
def style_transfer(informal_sent, formal_corpus):
    """Rewrite an informal sentence using words borrowed from similar formal sentences.

    Scans ``formal_corpus`` in order and keeps every sentence whose similarity to
    ``informal_sent`` beats the running best (which starts at 0.75), printing each
    new best. The kept sentences are joined into one reference text; each
    whitespace-separated word of the input is then replaced by the word that
    ``find_best_word`` picks from that text, or left as is when it picks none.
    The reference text and the substitution table are printed.

    Args:
        informal_sent: the sentence to rewrite, as a string.
        formal_corpus: iterable of formal sentences (strings).

    Returns:
        The rewritten sentence, words joined by single spaces.
    """
    # find the most similar formal sentences in the formal corpus
    similarity_bar = 0.75
    selected_sentences = []
    for candidate in formal_corpus:
        score = calculate_similarity(informal_sent, candidate)
        if score > similarity_bar:
            similarity_bar = score
            selected_sentences.append(candidate)
            print(similarity_bar)
    best_formal_sent = " ".join(selected_sentences)
    print(best_formal_sent)

    # create a dictionary of possible substitutions
    informal_words = informal_sent.split()
    word_sub_dict = {}
    for word in informal_words:
        formal_word = find_best_word(word, best_formal_sent)
        word_sub_dict[word] = formal_word if formal_word else word
    print(word_sub_dict)

    # replace informal words in the input sentence with possible formal substitutions
    return " ".join(word_sub_dict.get(word, word) for word in informal_words)


def calculate_similarity(informal, formal):
    """Score how close ``formal`` is to the formal version predicted for ``informal``.

    Both texts are embedded with the spaCy pipeline ``nlp``. The module-level
    ``distance`` is the average (informal - formal) offset, so the predicted
    formal vector is ``informal_vec - distance``; the result is the cosine
    similarity between that prediction and the embedding of ``formal``.

    Args:
        informal: informal text (word or sentence).
        formal: formal text (word or sentence) to compare against.

    Returns:
        0 when either text has an all-zero embedding (words without embeddings),
        otherwise the cosine similarity described above.
    """
    informal_vec = nlp(informal).vector
    formal_vec = nlp(formal).vector
    # An all-zero vector means no embedding; .any() tests that directly, whereas
    # summing the components could also be zero for a non-zero vector.
    if not informal_vec.any() or not formal_vec.any(): # words without embeddings
        return 0
    # distance = informal - formal, hence formal ~= informal - distance
    goal_formal = informal_vec - distance
    return cosine_similarity(formal_vec, goal_formal)


def find_best_word(word, formal_sent):
    """Find the word in ``formal_sent`` that is most similar to ``word``.

    Every character listed in the module-level ``punctuation`` is removed from
    ``formal_sent`` before it is split on whitespace. Candidates identical to
    ``word`` are skipped, and a candidate only wins if its
    ``calculate_similarity`` score exceeds the running best, which starts at 0.6.
    The word, the winner and the best score are printed.

    Returns:
        The winning formal word, or None when no candidate scores above 0.6.
    """
    stripped_sent = "".join(ch for ch in formal_sent if ch not in punctuation)

    top_score = 0.6
    top_word = None
    for candidate in stripped_sent.split():
        if candidate == word:
            continue
        score = calculate_similarity(word, candidate)
        # similarity = nlp(word).similarity(nlp(formal_word))
        if score > top_score:
            top_score, top_word = score, candidate

    print(word)
    print(top_word)
    print(top_score)
    return top_word

In [None]:
import random

In [None]:
style_transfer("We tried to make pretentious English text for our project", random.sample(formal_sentences, 5000)) # going through entire corpus takes too long

0.7953561544418335
0.7989491820335388
0.8261393308639526
0.8528211116790771
0.8996766805648804
I can not wait for Clark and Lana to really be together. Your enthusiasm caused me to search for your organization. I agree and wish for you to join me. We do not like to read subtitles. We want to make spectacular movies.
We
I
0.614783525466919
tried
None
0.6
to
None
0.6
make
None
0.6
pretentious
really
0.6136991381645203
English
None
0.6
text
None
0.6
for
None
0.6
our
your
0.6954249143600464
project
None
0.6
{'We': 'I', 'tried': 'tried', 'to': 'to', 'make': 'make', 'pretentious': 'really', 'English': 'English', 'text': 'text', 'for': 'for', 'our': 'your', 'project': 'project'}


'I tried to make really English text for your project'