# Install requirements

In [None]:
! pip install pandas
! pip install transformers datasets evaluate rouge_score
! pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

## Import requirements

In [None]:
import ast
import csv
import json
import os
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tqdm import tqdm


# Render matplotlib figures inline in the notebook output.
%matplotlib inline

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices raised by the plotting and data libraries.
warnings.filterwarnings('ignore')


# Load dataset

In [None]:
# Tab-separated file; the six column names are supplied here instead of being read from the file.
column_names = ["Path", "Title", "Author", "Date", "Info", "Text"]
df = pd.read_csv('booksummaries.csv', sep='\t', names=column_names)

Check for missing values

In [None]:
# Per-column non-null counts and dtypes of the freshly loaded frame.
df.info()

# Clean Data

### Our project primarily relies on the 'Text' column, which is free from NaN values. To maintain data completeness, any missing values in other columns are replaced with designated values rather than being removed.

In [None]:
# Preview the first rows before cleaning.
df.head()

In [None]:
def _parse_year(value):
    """Return the publication year of ``value`` as an int, or -1 when it is unknown.

    Strings are expected to start with a four-digit year (only the first four
    characters are inspected). Missing values (None / NaN) map to -1. Values that
    are already integers are returned unchanged, so re-running this cell does not
    wipe out years converted by an earlier run.
    """
    if isinstance(value, str):
        prefix = value[:4]
        return int(prefix) if prefix.isdigit() else -1
    if value is None or isinstance(value, float):
        # Missing dates come through as NaN, which is a float.
        return -1
    # Anything else is a year that was already converted by a previous run.
    return int(value)


# Replace missing values with sentinels instead of dropping rows.
df['Author'] = df['Author'].fillna("unknown")
df['Date'] = df['Date'].apply(_parse_year)
# "{}" keeps the column parseable with json.loads in the genre cells below.
df['Info'] = df['Info'].fillna("{}")
df = df.reset_index(drop=True)



In [41]:
# Spot-check the title stored at row position 12154.
df['Title'].iloc[12154]

'A.D.: New Orleans After the Deluge'

In [None]:
# Re-check non-null counts and dtypes after the cleaning step.
df.info()

## Explore the dataset's genre distribution. Show the top 10 most frequent genres visually.


In [None]:
# Tally how many times each genre label occurs across all books.
# Every 'Info' entry is a JSON object; only its values are counted.
genres = {}
info = df.Info.to_list()
for raw_info in info:
    for label in json.loads(raw_info).values():
        genres[label] = genres.get(label, 0) + 1



In [None]:
# Order genres from most to least frequent, then keep the ten largest.
sorted_genres = dict(sorted(genres.items(), key=lambda item: item[1], reverse=True))
top_10_genres = {label: sorted_genres[label] for label in list(sorted_genres)[:10]}

palette = sns.color_palette("tab10")

fig, ax = plt.subplots(figsize=(16, 9))
sns.barplot(
    x=list(top_10_genres),
    y=list(top_10_genres.values()),
    palette=palette,
    ax=ax,
)

ax.set_title('Genre Count', fontsize=18)
ax.set_xlabel('Genres', fontsize=14)
ax.set_ylabel('Count', fontsize=14)

ax.tick_params(axis='x', labelsize=10)
ax.tick_params(axis='y', labelsize=12)

# Horizontal guide lines only.
ax.grid(axis='y', linestyle='--', alpha=0.7)

plt.show()

In [None]:
# Genres whose publication-year distributions are compared.
selected_genres = ['Science Fiction', 'Crime Fiction', 'Speculative fiction', 'Horror']

# Keep books dated after 1800 (this also drops the -1 "unknown year" sentinel).
dated_books = df[df['Date'] > 1800]

# One (year, genre) pair per selected genre label attached to a book.
years = []
genras = []
for year, raw_info in zip(dated_books['Date'], dated_books['Info']):
    for label in json.loads(raw_info).values():
        if label in selected_genres:
            years.append(year)
            genras.append(label)

sns.set_style("whitegrid")
sns.set_palette("viridis")

fig, ax = plt.subplots()
# `fill=True` replaces the deprecated `shade=True`.
sns.kdeplot(x=years, hue=genras, fill=True, ax=ax)

ax.set_xlabel('Year')
ax.set_ylabel('Density')
ax.set_title('Density Plot of Years', fontsize=16)

# Reposition and title the legend seaborn built from `hue`. Calling
# plt.legend(genras, ...) would overwrite it with the raw per-row label list
# and mislabel the curves.
if ax.get_legend() is not None:
    sns.move_legend(ax, "upper left", title="Genre")

ax.grid(True, linestyle='--', alpha=0.7)

plt.show()


In [71]:
import torch
from transformers import pipeline

# Device index 0 when CUDA is available, otherwise -1.
summarizer_device = 0 if torch.cuda.is_available() else -1

summarizer = pipeline(
    task="summarization",
    model="pszemraj/long-t5-tglobal-base-16384-book-summary",
    device=summarizer_device,
)

# Smoke test on a short placeholder string.
long_text = "Here is a lot of text I don't want to read. Replace me"
result = summarizer(long_text)
print(result[0]["summary_text"])

Your max_length is set to 512, but your input_length is only 18. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=9)


This is a very long list of text I do not want to read, replace me with you. Replace me.


In [None]:
# Create an empty list to store the summaries
# Raw summarizer outputs, one entry per row of `df`, in row order.
summaries = []

# Only the 'Text' column is needed, so iterate over it directly with a progress bar.
for index, text in tqdm(df["Text"].items(), total=len(df), desc="Processing rows"):
    summaries.append(summarizer(text))
    # Release cached GPU memory whenever the row label is a multiple of five.
    if index % 5 == 0:
        torch.cuda.empty_cache()



In [None]:
# import csv
# result_file = open("Summaries.csv",'wb')
# with open('output2.csv','w') as result_file:
#     wr = csv.writer(result_file, dialect='excel')
#     wr.writerows(summaries)

In [None]:

# Reload the saved summaries; the first line is read as data and the single column is named here.
Summaries = pd.read_csv('Summaries.csv', header=None, names=['Summaries'])
Summaries.head()


In [None]:
# Plain Python list of the raw CSV cells, in file order.
summaries = Summaries['Summaries'].tolist()

In [None]:
# import csv

# # Assuming you have a list named 'text_list' containing the text values

# # Specify the file path where you want to save the CSV file
# csv_file_path = 'Summaries.csv'

# # Open the CSV file in 'w' mode (write mode)
# with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
#     # Create a CSV writer object
#     csv_writer = csv.writer(csv_file)
    
#     # Write each string from the text_list to the CSV file
#     for text in a1:
#         csv_writer.writerow([text])

# print("CSV file saved successfully!")

In [None]:
def _extract_summary_text(raw):
    """Return the plain summary string held in one cell of the summaries list.

    Handles the shapes the cell can take:
    * the text form of a dict such as ``{'summary_text': '...'}`` (either quote
      style), parsed as a Python literal so escapes like ``\\n`` or ``\\'`` are
      decoded instead of being stripped;
    * the same wrapped in a one-element list;
    * a JSON object, used as a fallback when literal parsing fails;
    * text that does not start with ``{`` or ``[``, which is returned unchanged
      so that re-running this cell is harmless;
    * non-string values (e.g. NaN for an empty CSV row), returned unchanged.

    Raises:
        ValueError: if the cell looks like a dict/list but cannot be parsed.
        KeyError: if the parsed dict has no ``'summary_text'`` key.
    """
    if not isinstance(raw, str):
        return raw
    text = raw.strip()
    if not text.startswith(('{', '[')):
        # Already plain text (for example from an earlier run of this cell).
        return raw
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        # Not a valid Python literal; try strict JSON before giving up.
        parsed = json.loads(text)
    if isinstance(parsed, list):
        parsed = parsed[0]
    return parsed['summary_text']


for position, raw_summary in enumerate(summaries):
    summaries[position] = _extract_summary_text(raw_summary)



In [None]:
# Attach the parsed summaries as a new column (positional: one summary per row) and preview the result.
df['Summaries'] = summaries
df.head(5)

# Text-to-Image 

In [59]:
from diffusers import StableDiffusionPipeline
import torch

model_id = "runwayml/stable-diffusion-v1-5"

# Same device policy as the summarizer: use the GPU when CUDA is available,
# otherwise fall back to the CPU instead of failing on `.to("cuda")`.
image_device = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision is kept for the GPU path only; the CPU path uses float32.
image_dtype = torch.float16 if image_device == "cuda" else torch.float32

pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=image_dtype)
pipe = pipe.to(image_device)






Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

SyntaxError: invalid syntax (3633746770.py, line 1)

"Mort is a teenager who has a personality that makes him unsuited for the family farm. Lezek takes him to an apprenticeship fair hoping that he will find a job as an undertaker. When he finds one, he accepts and becomes Death's apprentice. When it is time to die, Mort saves the princess from death. Both she and Mort consult Igneouscutwell for assistance. As Mort becomes more like Death, reverting to his former self, tries to feel human emotion. The duel ends in a duel between Mort and Death"

In [53]:
# Spot-check the generated summary stored at row position 84.
df.iloc[84]['Summaries']

'Sir Charles is found dead in his country house. He has a heartattack, and Mortimer goes to London to ask Holmes for assistance. The Baskerville household is plagued with a curse that causes Hugo\'s daughter to escape from him. When she escapes, he offers his soul to "the devil" if the monster can rescue her. Sir Henry arrives from America and receives an anonymous note telling him to avoid Devon moorlands. A new boot is stolen by Sir Henry, but it does not explain why. They find another one missing. Sir John returns to England and tells Holmes and Watson about the disappearance of two more boots. On the way back, they meet Jack Staplton, a local naturalist who has been living on the area for over two years. Beryl warns him against leaving the area because Selden is being pursued. Barrymore also tells Watson that Selden was once married to Mrs. Baronet, but now he refuses to marry her. Meanwhile, Sir Henry keeps trying to get Beryl to agree to help him divorce her sister. Finally, Fran

In [58]:
# Trial run: turn one summary into an image and write it to disk.
prompt = df['Summaries'].iloc[85]
generated = pipe(prompt)
image = generated.images[0]
image.save("test.png")

The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ["piece of property that thorin used to hold. he refuses to give up his claim to the treasure because he doesn 't want to fight another war. but when gandalsf warns everyone about an army approaching, they manage to win the battle without any trouble."]


  0%|          | 0/50 [00:00<?, ?it/s]

TypeError: 'bool' object is not iterable

In [None]:
# First index label of the frame.
df.index[0] 

In [None]:
def _safe_image_name(index, title, max_title_length=150):
    """Build ``"<index>_<title>.png"`` with characters that are unsafe in file names replaced.

    Path separators, the characters ``< > : " | ? *`` and control characters are
    replaced by ``_``. The title part is truncated to ``max_title_length``
    characters, and trailing dots and spaces are removed. The numeric prefix
    before the first ``_`` is preserved so the row can be recovered from the
    file name.
    """
    unsafe = set('<>:"/\\|?*')
    cleaned = ''.join('_' if (ch in unsafe or ord(ch) < 32) else ch for ch in str(title))
    cleaned = cleaned[:max_title_length].rstrip(' .')
    return f'{index}_{cleaned}.png'


# Names of the files actually written, in generation order.
photo_names = []

# Make sure the output directory exists before the first save.
os.makedirs('images', exist_ok=True)

# Only rows from position 13645 onward are processed.
# NOTE(review): presumably a resume point after an interrupted run — confirm.
df2 = df.iloc[13645:, :]

for index, row in tqdm(df2.iterrows(), total=len(df2), desc="Processing rows"):
    name = _safe_image_name(index, row['Title'])
    prompt = row['Summaries']
    photo = pipe(prompt).images[0]
    try:
        photo.save(f'images/{name}')
    except (OSError, ValueError):
        # Fall back to an index-only name, still inside images/ so the
        # later bookkeeping cells can find it.
        name = f'{index}.png'
        photo.save(f'images/{name}')
    photo_names.append(name)
    # Release cached GPU memory whenever the row label is a multiple of five.
    if index % 5 == 0:
        torch.cuda.empty_cache()




## Correcting mistakes

### Some files were not saved properly

In [None]:

# Every entry in images/ that lacks a '.png' suffix is treated as a broken save:
# it is deleted and its image is generated again from the row's summary.
arr = os.listdir('images/')

for i, value in tqdm(enumerate(arr), total=len(arr)):
    if value.endswith('.png'):
        continue

    try:
        os.remove(f"images/{value}")
        name = value + '.png'
        # File names start with the row label, followed by '_'.
        index = int(value.split('_')[0])
        prompt = df.loc[index]['Summaries']
        photo = pipe(prompt).images[0]
        try:
            photo.save(f'images/{name}')
        except (OSError, ValueError):
            # Fall back to an index-only file name.
            photo.save("images/" + str(index) + '.png')
        if i % 5 == 0:
            torch.cuda.empty_cache()
    except Exception as err:
        # Best effort: report the entry and move on, without swallowing
        # KeyboardInterrupt the way a bare `except:` would.
        tqdm.write(f"Could not regenerate {value!r}: {err!r}")
        continue



### A function for finding black images

In [36]:
from PIL import Image

def is_image_all_black(image_path):
    """Return True when every pixel of the image at ``image_path`` is black.

    The image is converted to 8-bit greyscale ('L'); it is all black exactly
    when the largest grey value is 0. The file is opened in a ``with`` block so
    its handle is closed even when many images are checked in a loop.

    Args:
        image_path: path of the image file to inspect.

    Returns:
        bool: True if the greyscale maximum is 0, otherwise False.
    """
    with Image.open(image_path) as img:
        _, brightest = img.convert('L').getextrema()
    return brightest == 0




The image is not completely black.


### Remove black images and try to replace them

In [106]:
# Delete every image in images/ that is completely black.
# Nothing is regenerated here; the removed files simply go missing from images/.
arr = os.listdir('images/')

for i, value in tqdm(enumerate(arr), total=len(arr)):
    try:
        if is_image_all_black(f'images/{value}'):
            os.remove(f"images/{value}")
    except Exception as err:
        # Best effort: entries that cannot be opened or removed are reported
        # and skipped, without swallowing KeyboardInterrupt.
        tqdm.write(f"Skipping {value!r}: {err!r}")
        continue



100%|██████████| 16559/16559 [03:18<00:00, 83.48it/s]


### Double summarise to bypass the rules


In [109]:
def _index_from_filename(filename):
    """Return the row index encoded at the start of ``filename``, or None if there is none.

    Two layouts are recognised: ``"<index>_<title>.png"`` and ``"<index>.png"``.
    """
    for candidate in (filename.split('_')[0], filename.split('.')[0]):
        try:
            return int(candidate)
        except ValueError:
            continue
    return None


arr = os.listdir('images/')

# Row indices that already have an image on disk. Entries without a numeric
# prefix (hidden files, stray directories) are ignored instead of raising.
index_list = []
for filename in arr:
    parsed_index = _index_from_filename(filename)
    if parsed_index is not None:
        index_list.append(parsed_index)


In [110]:
# A set gives constant-time membership checks; testing against the list
# directly would rescan it for every row.
done_indices = set(index_list)

# Write into images/ (the directory `index_list` is built from), so that
# re-running the bookkeeping cell sees these files.
os.makedirs('images', exist_ok=True)

for i in range(len(df)):
    if i in done_indices:
        continue

    print(f"------------------------------------------------------- {i} ---------------------------------------------------")
    # Summarise the existing summary once more and use the result as the image prompt.
    prompt = df.iloc[i]['Summaries']
    prompt = summarizer(prompt)[0]['summary_text']
    photo = pipe(prompt).images[0]
    photo.save(f"images/{i}.png")

    # Release cached GPU memory on every third row.
    if i % 3 == 0:
        torch.cuda.empty_cache()
        



Your max_length is set to 512, but your input_length is only 52. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=26)


------------------------------------------------------- 10878 ---------------------------------------------------


Your max_length is set to 512, but your input_length is only 39. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=19)
Your max_length is set to 512, but your input_length is only 35. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=17)


  0%|          | 0/50 [00:00<?, ?it/s]

Your max_length is set to 512, but your input_length is only 84. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=42)


------------------------------------------------------- 11697 ---------------------------------------------------


Your max_length is set to 512, but your input_length is only 28. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=14)
Your max_length is set to 512, but your input_length is only 17. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=8)


  0%|          | 0/50 [00:00<?, ?it/s]