### Model part

In [131]:
import numpy as np
import pandas as pd
import markovify
import re
import spacy

In [88]:
dataset_name = 'data/train.csv'

In [133]:
dataset_name = 'data/for_human_evaluation.csv'

In [217]:
dataset_name = 'data/test.csv'

In [134]:
data_df = pd.read_csv(dataset_name)

In [135]:
data_df.head()

Unnamed: 0.1,Unnamed: 0,storyid,storytitle,sentence1,sentence2,sentence3,sentence4,sentence5
0,50000,ef7bd51e-1e5a-491b-8556-afbe91843f0b,Bride,The bride smiled at the groom.,She was missing a tooth.,The groom thought that was cute.,He remembered that moment.,He shared it at her funeral.
1,50001,0122689a-f111-4aca-913d-ede53e1c9e67,Shaving Accident,Jake was running late for work.,He tried to shave in a hurry.,Unfortunately he cut himself.,Jake never really noticed.,Someone pointed out the blood on his face.
2,50002,4e8124fa-85b1-47c8-abfd-2dd0ae9b9bff,Ice cream factory,Nancy went to a local ice cream factory.,She ordered two containers of ice cream for he...,They sat down in front of the school.,Her son devoured all of his ice cream.,"Once they were finished, they went home."
3,50003,e709eaa9-5c00-43b8-8d9e-bcb7f5a6c694,Keyboard,I had a nearly new keyboard.,I decided to try to sell it on Amazon.,I got a sales notification and packed the item...,"Minutes later, I got a cancellation request.",I was very disappointed.
4,50004,5d457e94-3107-4cbf-aba9-2ad781eed525,Out of Water,Jane was on a long hike.,Before it was over she ran out of water.,She thought she had packed more.,She started to worry about.,Luckily she made it back safe.


In [136]:
data_df.shape

(50, 8)

In [137]:
feat_col = ['sentence1', 'sentence2', 'sentence3', 'sentence4']

In [138]:
def combine_sentence(df, feat_col):
    df['sentence'] = df[feat_col].agg(' '.join, axis=1)
    return df

In [139]:
def predict_sentence(markov_model, start_word, expected_chars, tries=50):
    min_chars = expected_chars
    max_chars = expected_chars+100
    try:
        for _ in range(tries):
            sentence = markov_model.make_sentence_with_start(start_word)
            if sentence and min_chars <= len(sentence) <= max_chars:
                return sentence 
    except:
        return markov_model.make_short_sentence(max_chars, tries=tries)

In [140]:
data_df = combine_sentence(data_df, feat_col)

In [141]:
data_df['target_length'] = data_df['sentence5'].str.len()

In [142]:
data_df.head(10)

Unnamed: 0.1,Unnamed: 0,storyid,storytitle,sentence1,sentence2,sentence3,sentence4,sentence5,sentence,target_length
0,50000,ef7bd51e-1e5a-491b-8556-afbe91843f0b,Bride,The bride smiled at the groom.,She was missing a tooth.,The groom thought that was cute.,He remembered that moment.,He shared it at her funeral.,The bride smiled at the groom. She was missing...,28
1,50001,0122689a-f111-4aca-913d-ede53e1c9e67,Shaving Accident,Jake was running late for work.,He tried to shave in a hurry.,Unfortunately he cut himself.,Jake never really noticed.,Someone pointed out the blood on his face.,Jake was running late for work. He tried to sh...,42
2,50002,4e8124fa-85b1-47c8-abfd-2dd0ae9b9bff,Ice cream factory,Nancy went to a local ice cream factory.,She ordered two containers of ice cream for he...,They sat down in front of the school.,Her son devoured all of his ice cream.,"Once they were finished, they went home.",Nancy went to a local ice cream factory. She o...,40
3,50003,e709eaa9-5c00-43b8-8d9e-bcb7f5a6c694,Keyboard,I had a nearly new keyboard.,I decided to try to sell it on Amazon.,I got a sales notification and packed the item...,"Minutes later, I got a cancellation request.",I was very disappointed.,I had a nearly new keyboard. I decided to try ...,24
4,50004,5d457e94-3107-4cbf-aba9-2ad781eed525,Out of Water,Jane was on a long hike.,Before it was over she ran out of water.,She thought she had packed more.,She started to worry about.,Luckily she made it back safe.,Jane was on a long hike. Before it was over sh...,30
5,50005,494b9ed6-ae7f-4902-9ff3-9d6de0d07e91,Flexibility,Jane was never very flexible.,She decided to change that.,Jane started doing yoga.,It was slow and hard.,Eventually she became more flexible.,Jane was never very flexible. She decided to c...,36
6,50006,fe6aa933-add3-4b7e-a17c-556a7941e626,Tired Pam,"When Pam went to work, she was tired.",She started up her computer and stared at the ...,She found herself dozing off from time.,Her boss didn't notice.,"During her lunch break, she took a nap.","When Pam went to work, she was tired. She star...",39
7,50007,20e2a1bb-a6d9-49b0-b79a-3c27a8603d64,An Unexpected Promotion,Stan worked at a mechanic shop.,One day he saw Tony taking home tools.,Stan told his boss about Tony.,The boss fired Tony for stealing property from...,Stan got Tony's position.,Stan worked at a mechanic shop. One day he saw...,25
8,50008,6a37515a-6d1c-4bb1-b75a-6ff32ab536cb,Abby's pool party,Summer was coming to an end and Abby wanted to...,She hadn't seen some of her friends in a while...,Abby decided on a pool party in her backyard.,All of Abby's school friends showed up and the...,Then they had BBQ and talked about their summe...,Summer was coming to an end and Abby wanted to...,67
9,50009,beb11f9a-4bc5-4df0-b5b7-4953f9f80fd0,Rodney gets into a fight,One evening while out at the bar.,Rodney has a few drinks and some nuts with his...,When Rodney was getting up to leave some man c...,Rodney asks the man what his problem was.,The man told him Rodney was the problem and th...,One evening while out at the bar. Rodney has a...,64


In [143]:
text = data_df['sentence'].str.cat(sep=' ')

In [144]:
start_word = data_df.loc[0,'sentence1'].split(' ')[0]

In [145]:
start_word

'He'

In [146]:
word_len = data_df.loc[0,'target_length']

In [147]:
word_len

28

### Test performance on POS markov model

In [11]:
import markovify
import nltk
import re

class POSifiedText(markovify.Text):
    def word_split(self, sentence):
        words = re.split(self.word_split_pattern, sentence)
        words = [ "::".join(tag) for tag in nltk.pos_tag(words) ]
        return words

    def word_join(self, words):
        sentence = " ".join(word.split("::")[0] for word in words)
        return sentence

In [7]:
import markovify
import re
import spacy

nlp = spacy.load("en_core_web_sm")

class POSifiedText(markovify.Text):
    def word_split(self, sentence):
        return ["::".join((word.orth_, word.pos_)) for word in nlp(sentence)]

    def word_join(self, words):
        sentence = " ".join(word.split("::")[0] for word in words)
        return sentence

2022-12-01 01:13:29.596630: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### change the state_size for better performance

In [103]:
text_model = markovify.Text(text, state_size=3)

In [199]:
print(text_model.make_sentence())

When he got it back even more broken.


In [200]:
print(text_model.make_sentence())

I realized I forgot to eat anything.


In [81]:
print(text_model.make_short_sentence(test_len))

David lost both of his savings on the front row.


In [161]:
data_df.loc[0,'sentence']

"David noticed he had put on a lot of weight recently. He examined his habits to try and figure out the reason. He realized he'd been eating too much fast food lately. He stopped going to burger places and started a vegetarian diet."

In [162]:
data_df.loc[0,'sentence5']

'After a few weeks, he started to feel much better.'

In [155]:
print(text_model.make_sentence_with_start('David', strict=True))

David has to buy one.


In [196]:
print(text_model.make_sentence_with_start('After', strict=True))

After many outfit changes, she decided to organize a sports game with friends.


In [115]:
print(text_model.make_sentence_with_start('David'))

David lost both of his hard work, he found a movie from one of my desk, trembling.


In [116]:
data_df['sentence1'].str.split(' ').str[0]

0         David
1           Tom
2        Marcus
3         Bobby
4          John
          ...  
52660       The
52661     After
52662    Janice
52663     Jamie
52664       The
Name: sentence1, Length: 52665, dtype: object

In [84]:
data_df['sentence5'].str.split(' ').str[0]

0         After
1           Tom
2        Marcus
3            He
4           His
          ...  
52660       The
52661         I
52662       She
52663     Jamie
52664      That
Name: sentence5, Length: 52665, dtype: object

In [22]:
(data_df['sentence1'].str.split(' ').str[0]==data_df['sentence5'].str.split(' ').str[0]).sum()

14860

In [23]:
(data_df['sentence2'].str.split(' ').str[0]==data_df['sentence5'].str.split(' ').str[0]).sum()

11348

In [24]:
(data_df['sentence3'].str.split(' ').str[0]==data_df['sentence5'].str.split(' ').str[0]).sum()

11932

In [25]:
(data_df['sentence4'].str.split(' ').str[0]==data_df['sentence5'].str.split(' ').str[0]).sum()

8730

In [48]:
predict_sentence(text_model, start_word, word_len)

'David was extremely happy to be a farm at night.'

In [148]:
data_df['predict_sentence'] = np.nan

In [149]:
for idx, row in data_df.iterrows():
# for idx, row in data_df.iloc[0:20].iterrows():
#     print(row)
    rand = np.random.randint(0,9)
    if rand>=5:
        start_word = row['sentence1'].split(' ')[0]
    else:
        start_word = row['sentence2'].split(' ')[0]
    word_len = row['target_length']
    predict_s = predict_sentence(text_model, start_word, word_len)
#     print(f'{idx} {predict_s}')
    print(f'round: {idx}')
    data_df.loc[idx,'predict_sentence'] = predict_s

round: 0
round: 1
round: 2
round: 3
round: 4
round: 5
round: 6
round: 7
round: 8
round: 9
round: 10
round: 11
round: 12
round: 13
round: 14
round: 15
round: 16
round: 17
round: 18
round: 19
round: 20
round: 21
round: 22
round: 23
round: 24
round: 25
round: 26
round: 27
round: 28
round: 29
round: 30
round: 31
round: 32
round: 33
round: 34
round: 35
round: 36
round: 37
round: 38
round: 39
round: 40
round: 41
round: 42
round: 43
round: 44
round: 45
round: 46
round: 47
round: 48
round: 49


In [150]:
data_df

Unnamed: 0.1,Unnamed: 0,storyid,storytitle,sentence1,sentence2,sentence3,sentence4,sentence5,sentence,target_length,predict_sentence
0,50000,ef7bd51e-1e5a-491b-8556-afbe91843f0b,Bride,The bride smiled at the groom.,She was missing a tooth.,The groom thought that was cute.,He remembered that moment.,He shared it at her funeral.,The bride smiled at the groom. She was missing...,28,He tried out for the right racquet.
1,50001,0122689a-f111-4aca-913d-ede53e1c9e67,Shaving Accident,Jake was running late for work.,He tried to shave in a hurry.,Unfortunately he cut himself.,Jake never really noticed.,Someone pointed out the blood on his face.,Jake was running late for work. He tried to sh...,42,Jake was very nervous because she had never be...
2,50002,4e8124fa-85b1-47c8-abfd-2dd0ae9b9bff,Ice cream factory,Nancy went to a local ice cream factory.,She ordered two containers of ice cream for he...,They sat down in front of the school.,Her son devoured all of his ice cream.,"Once they were finished, they went home.",Nancy went to a local ice cream factory. She o...,40,"Once all of the clutter, but it still looked f..."
3,50003,e709eaa9-5c00-43b8-8d9e-bcb7f5a6c694,Keyboard,I had a nearly new keyboard.,I decided to try to sell it on Amazon.,I got a sales notification and packed the item...,"Minutes later, I got a cancellation request.",I was very disappointed.,I had a nearly new keyboard. I decided to try ...,24,I decided to use the restroom.
4,50004,5d457e94-3107-4cbf-aba9-2ad781eed525,Out of Water,Jane was on a long hike.,Before it was over she ran out of water.,She thought she had packed more.,She started to worry about.,Luckily she made it back safe.,Jane was on a long hike. Before it was over sh...,30,Luckily her brother bought her a cute cardboar...
5,50005,494b9ed6-ae7f-4902-9ff3-9d6de0d07e91,Flexibility,Jane was never very flexible.,She decided to change that.,Jane started doing yoga.,It was slow and hard.,Eventually she became more flexible.,Jane was never very flexible. She decided to c...,36,Eventually there was a puppy in the woods.
6,50006,fe6aa933-add3-4b7e-a17c-556a7941e626,Tired Pam,"When Pam went to work, she was tired.",She started up her computer and stared at the ...,She found herself dozing off from time.,Her boss didn't notice.,"During her lunch break, she took a nap.","When Pam went to work, she was tired. She star...",39,During the pageant she noticed that she had to...
7,50007,20e2a1bb-a6d9-49b0-b79a-3c27a8603d64,An Unexpected Promotion,Stan worked at a mechanic shop.,One day he saw Tony taking home tools.,Stan told his boss about Tony.,The boss fired Tony for stealing property from...,Stan got Tony's position.,Stan worked at a mechanic shop. One day he saw...,25,Stan had just gotten engaged.
8,50008,6a37515a-6d1c-4bb1-b75a-6ff32ab536cb,Abby's pool party,Summer was coming to an end and Abby wanted to...,She hadn't seen some of her friends in a while...,Abby decided on a pool party in her backyard.,All of Abby's school friends showed up and the...,Then they had BBQ and talked about their summe...,Summer was coming to an end and Abby wanted to...,67,Then one day he spilled a gallon of sour milk ...
9,50009,beb11f9a-4bc5-4df0-b5b7-4953f9f80fd0,Rodney gets into a fight,One evening while out at the bar.,Rodney has a few drinks and some nuts with his...,When Rodney was getting up to leave some man c...,Rodney asks the man what his problem was.,The man told him Rodney was the problem and th...,One evening while out at the bar. Rodney has a...,64,"The washing machine, and Stephanie, were in th..."


In [151]:
data_df.shape

(50, 11)

In [152]:
data_df['predict_sentence'].isnull().sum()

0

In [153]:
data_df.drop('Unnamed: 0', axis=1, inplace=True)

In [154]:
data_df.to_csv(f"""predicted_{dataset_name.split('data/')[1]}""", index=False)

### Evaluation

In [65]:
import numpy as np
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu

In [273]:
df = pd.read_csv('predicted_train.csv')

In [66]:
df = pd.read_csv('predicted_test.csv')

In [240]:
score_result = []
for ind, row in df.iterrows():
    if pd.isnull(row['predict_sentence']):
        continue
    score = sentence_bleu(row['sentence5'].split(), row['predict_sentence'].split())
    score_result.append(score)

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [241]:
score_result = np.mean(np.array(score_result))

In [67]:
import json
from rouge import Rouge

In [68]:
df.shape

(5000, 10)

In [72]:
df.dropna(subset=['predict_sentence'], inplace=True)

In [49]:
df['sentence5'].tolist()

['was able to get a new pizza delivered to his home.',
 'instead went out to buy new knives.',
 'took awhile, but she finally fell asleep.',
 'took a big refreshing sip.',
 'the vegetables came out healthy and delicious!',
 'that incident Imelda de-emphasized her detail-orientation.',
 "He only succeeded in convincing his brother it didn't work.",
 'Tom felt silly and made sure to log out more often.',
 'He decided to get it once the bus arrived to his stop.',
 'had to go get staples in her leg.',
 'doctor prescribed medicine to help Jim with his sleep problem.',
 'Nick rides the school bus all afternoon.',
 'other customer noticed, apologized and finished buyin.',
 'refrained from calling this person a hypocrite.',
 'the tight glove cramped his hand and he got a strike.',
 'were pleased.',
 'earned enough money to buy new clothes.',
 'the cats come up to Jane every day and let her pet them.',
 'He was so nice that he bought me earplugs to wear at night.',
 'was enough to help him find

In [50]:
df['predict_sentence'].tolist()

["did it often about twice a week and did all her family's taxes.",
 'pulled over to help me concentrate.',
 'was a fun trip, but I got into my sleeping bag and some food.',
 'finally emailed the leaders to ask them to turn it down.',
 'of a sudden, a baby deer went up to her and even the pastor was kind!',
 'about an hour, I gave up and asked her what color her hair was.',
 'He tried to use an ice pack for a few minutes it started to snow.',
 'Tom was in a diving competition when I was there I bought a couch.',
 'He greeted an old man with a longer nose and married him.',
 "didn't want to try this coffee beer.",
 'stuck out his tongue.',
 'as adults, the two have been dating for seven years.',
 'wedding director said that no one tried any of her tests.',
 'was going to call the cable company for a dozen years.',
 'it struck her in the afternoon so she had no experience in anything related to working on houses.',
 'were both going to go to camp with her.',
 'went outside to his car and

In [73]:
rouge = Rouge()

In [16]:
df['sentence5'].str.split().str[1:].str.join(' ')

0       was able to get a new pizza delivered to his h...
1                     instead went out to buy new knives.
2               took awhile, but she finally fell asleep.
3                              took a big refreshing sip.
4          the vegetables came out healthy and delicious!
                              ...                        
4995                            loved every minute of it.
4996             can always recommend which shows to see.
4997                          was easier and paid better.
4998            had gone to one that looked very similar.
4999    it was time for dinner he didn't want to get out.
Name: sentence5, Length: 4999, dtype: object

In [17]:
df['predict_sentence'].str.split().str[1:].str.join(' ')

0       did it often about twice a week and did all he...
1                     pulled over to help me concentrate.
2       was a fun trip, but I got into my sleeping bag...
3       finally emailed the leaders to ask them to tur...
4       of a sudden, a baby deer went up to her and ev...
                              ...                        
4995          was able to convince them it would be nice.
4996    went to his friends house to borrow a copy fro...
4997                     still didn't work for very long.
4998    was texting one of her cousins stood and began...
4999    I went inside my room and i don't even like th...
Name: predict_sentence, Length: 4999, dtype: object

In [74]:
scores = rouge.get_scores(df['sentence5'].tolist(), df['predict_sentence'].tolist(), avg=True)

In [None]:
# Test ROUGE
# {'rouge-1': {'r': 0.09447201062205202,
#   'p': 0.12982667613553704,
#   'f': 0.1066960745975188},
#  'rouge-2': {'r': 0.003937169302756953,
#   'p': 0.005613199345829914,
#   'f': 0.00452456430580215},
#  'rouge-l': {'r': 0.08839986112063807,
#   'p': 0.1214862170798006,
#   'f': 0.09970741546602394}}