# Code for scrapping data

In [1]:
import AO3
import pandas as pd
import numpy as np
import time

In [2]:
#Testing code snippet to grab titles and summaries to save in dataframe
url = "https://archiveofourown.org/works/11233080"
workid = AO3.utils.workid_from_url(url)
work = AO3.Work(workid)
title = work.title
summary = work.summary

In [2]:
titles = []
summaries = []

for n in range(20000, 20201):
    time.sleep(15)
    search = AO3.Search(single_chapter = True, word_count=AO3.utils.Constraint(2500, 30000), language = 'en', page = n)
    search.update()
    for result in search.results:
        titles.append(result.title)
        try:
            summaries.append(result.summary)
        except:
            summaries.append('No summary')

In [3]:
df = pd.DataFrame(zip(titles, summaries), columns = ['title', 'summary'])

In [4]:
df.to_csv('../data/frame_009', index = False)

In [2]:
df1 = pd.read_csv('../data/frame_001')
df2 = pd.read_csv('../data/frame_002')
df3 = pd.read_csv('../data/frame_003')
df4 = pd.read_csv('../data/frame_004')
df5 = pd.read_csv('../data/frame_005')
df6 = pd.read_csv('../data/frame_006')
df7 = pd.read_csv('../data/frame_007')
df8 = pd.read_csv('../data/frame_008')
df9 = pd.read_csv('../data/frame_009')

In [19]:
df1.rename(columns={'fic_title': 'title'}, inplace=True)

In [44]:
final = pd.concat([df1, df2, df3, df4, df5, df6, df7, df8, df9], ignore_index = True)

# Code for cleaning

## Check for duplicate rows

In [45]:
no_dupes = final[~final.duplicated()]

In [21]:
import re
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import string

punct = list(string.punctuation)

def clean_summaries(text):
    #Get rid of \n at start and end of paragraph
    text = re.sub(r'^\\n|\\n$', '', text)
    #Capture groupings like \xa0
    text = re.sub(r'\\', r' ', text)
    text = re.sub(r'\\(\w+)', '', text)
    #Normalize whitespace
    text = re.sub("\s\s+", " ", text)
    text_token = word_tokenize(text)
    text_token = [w.lower() for w in text_token if w not in punct]
    text = TreebankWordDetokenizer().detokenize(text_token)
    return text

In [30]:
no_dupes.summary = no_dupes.summary.apply(clean_summaries)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_dupes.summary = no_dupes.summary.apply(clean_summaries)


In [39]:
lengths = no_dupes.apply(lambda x: len(x.summary), axis = 1)
no_dupes = no_dupes.assign(sum_length = lengths.values)

In [50]:
no_dupes.to_csv('../data/final_frame', index = False)

In [19]:
df = pd.read_csv('../data/final_frame')
final = df[~(df.summary == 'No summary')]

In [7]:
#Detect if English

In [22]:
final.summary = final.summary.apply(clean_summaries)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final.summary = final.summary.apply(clean_summaries)


# Code to train NN to generate new titles/summaries

Merge all summaries + titles together to one long string
Base train length on average length of summary/title
- summary = 38

In [29]:
all_sum = ' '.join(final.summary)
all_sum = re.sub(r'\\', r'', all_sum)

In [None]:
train_len = 37 + 1

text_sequences = []
for i in range(train_len, len(tokens)):
    seq = tokens[i - trian_len:i]
    text_sequences.append(seq)

In [None]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenzier.fit_on_texts(text_sequences)

vocabulary_size = len(tokenizer.word_counts())

In [None]:
sequences = tokenizer.texts_to_sequences(text_sequences)
sequences = np.array(sequences)

Create X and y dataframes

In [None]:
from keras.utils import to_categorical

X = sequences[:, :-1]
y = sequences[:, -1]

y = to_categorical(y, num_classes = vocabulary_size + 1)
seq_len = X.shape[1]

In [None]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding

neurons = train_len * 3
dense_neurons = train_len * 4

def create_model(vocabulary_size, seq_len):
    model = Sequential()
    model.add(Embedding(vocabulary_size, seq_len, input_length = seq_len))
    model.add(LSTM(neurons, return_sequences = True))
    model.add(LSTM(neurons))
    model.add(Dense(dense_neurons, activation = 'relu'))
    
    model.add(Dense(vocabulary_size, activation = 'softmax'))
    
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    
    model.summary()
    
    return model

In [None]:
model = create_model(vocabulary_size + 1, seq_len)

In [None]:
from pickle import dump, load

model.fit(X, y, batch_size = 128, epochs = 50, verbose = 1)

In [None]:
model.save('fic_title_generator.h5')

In [None]:
dump(tokenizer, open('title_tokens', 'wb'))

## Generate new text

In [None]:
from keras.preprocessing.sequences import pad_sequences

#Make starter list for seed_text and choose random starter point

def generate_title(model, tokenizer, seq_len, seed_text, num_gen_words):
    
    output_text = []
    input_text = seed_text
    
    for i in range(num_gen_words):
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        pad_encoded = pad_sequences([encoded_text], maxlen = seq_len, truncating = 'pre')
        pred_word_idx = model.predict_classes(pad_encoded, verbose = 0)[0]
        pred_word = tokenzier.index_word[pred_word_idx]
        
        input_text += ' ' + pred_word
        output_text.append(pred_word)
    
    return " ".join(output_text)

In [None]:
from keras.models import load_model

model = load_model('fic_title_generator.h5')
tokenizer = load(open('title_tokens', 'rb'))