# Datasets Fusion

First, reorganize the CSV file of each of the two datasets (ArtEmis and MusicCaps).

1. Read the CSV file of each dataset.
2. Extract the important feature columns.
3. Reorganize the data into two columns: the file name and the comments.
4. Save the result as a new CSV file.

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

from nltk.corpus import sentiwordnet as swn
from nltk.corpus import opinion_lexicon
from nltk.tag import pos_tag

from gensim import corpora
from gensim.models import LdaModel

ArtEmis Dataset

In [None]:
def _build_integrated_comment(row):
    """Collect one row's style, emotion and utterance text into a single list.

    The list always starts with the 'art style: ...' string, the
    'emotion: ...' string and `utterance_x`; `utterance_y` is appended
    only when it is not null.
    """
    parts = [
        'art style: ' + row.loc['art_style'],
        f"emotion: {row.loc['emotion']}",
        row.loc['utterance_x'],
    ]
    if pd.notnull(row['utterance_y']):
        parts.append(row.loc['utterance_y'])
    return parts


# Read the merged ArtEmis table.
artwork_data = pd.read_csv("E:\\Project and Dissertation in Data Science\\dataset\\artemis_official_data\\merged_dataset2.csv")

# One list of comment fragments per row.
artwork_data['integrated_comment'] = artwork_data.apply(_build_integrated_comment, axis=1)

# Keep only the painting name and its integrated comment in the exported file.
artwork_data[['painting', 'integrated_comment']].to_csv(
    "E:\\Project and Dissertation in Data Science\\dataset\\artemis_official_data\\artwork_data.csv",
    index=False,
)

Extract emotional words in a new column and then save as a new CSV file.

In [None]:
# NLTK resources needed by the text pre-processing and sentiment cells.
# stop-word list used to filter tokens
nltk.download('stopwords')
# opinion lexicon used for the sentiment analysis
nltk.download('opinion_lexicon')
# tokenizer models behind word_tokenize
nltk.download('punkt')
# tagger model behind pos_tag
nltk.download('averaged_perceptron_tagger')
# WordNet corpus: WordNetLemmatizer raises LookupError without it
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     D:\downloadSoftware\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     D:\downloadSoftware\nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     D:\downloadSoftware\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     D:\downloadSoftware\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
# pre-process text
# Lazily created and then reused, so the stop-word list and the lemmatizer are
# not rebuilt for every single comment.
_ENGLISH_STOP_WORDS = None
_LEMMATIZER = None


def text_preprocess(text):
    """Turn one raw comment into a list of cleaned, lemmatized tokens.

    Steps: lower-case, tokenize, drop English stop words, drop tokens made
    only of punctuation characters, POS-tag, then lemmatize verbs and
    adverbs (all other tokens are kept unchanged).

    Parameters
    ----------
    text : str
        The raw comment. Any non-string value (for example the NaN that
        pandas produces for an empty CSV cell, or None) yields an empty list.

    Returns
    -------
    list of str
        The processed tokens, in their original order.
    """
    global _ENGLISH_STOP_WORDS, _LEMMATIZER

    # Missing values are not text; treat them as an empty comment instead of
    # failing on `.lower()`.
    if not isinstance(text, str):
        return []

    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    if _LEMMATIZER is None:
        _LEMMATIZER = WordNetLemmatizer()

    # tokenization
    tokens = word_tokenize(text.lower())

    # remove stop words and punctuation; the per-character test also removes
    # multi-character punctuation tokens such as "``", "''" or "..."
    filtered_tokens = [
        word for word in tokens
        if word not in _ENGLISH_STOP_WORDS
        and not all(char in string.punctuation for char in word)
    ]

    ## part-of-speech tagging
    tagged_tokens = pos_tag(filtered_tokens)

    # lemmatization
    lemmatized_tokens = []
    for word, tag in tagged_tokens:
        if tag.startswith('V'):  # Verb
            lemmatized_tokens.append(_LEMMATIZER.lemmatize(word, pos='v'))
        elif tag.startswith('RB'):  # Adverb
            lemmatized_tokens.append(_LEMMATIZER.lemmatize(word, pos='r'))
        else:
            lemmatized_tokens.append(word)

    return lemmatized_tokens

In [None]:
# sentiment analysis
# Load the sentiment word list. The name `afinn` is kept for compatibility,
# but the words come from NLTK's `opinion_lexicon` corpus, not from AFINN.
# Stored as a set so that each `token in afinn` test is a hash lookup instead
# of a scan over the whole lexicon.
afinn = set(opinion_lexicon.words())

# Penn Treebank tag prefixes that are considered: adjectives, adverbs, nouns, verbs.
_EMOTIONAL_POS_PREFIXES = ('JJ', 'RB', 'NN', 'VB')


def extract_emotional_words(tokens):
    """Return the opinion-lexicon words found among `tokens`.

    Parameters
    ----------
    tokens : list of str
        Tokens of one comment (for example the output of `text_preprocess`).

    Returns
    -------
    set of str
        Every token that is tagged as an adjective, adverb, noun or verb and
        is contained in `afinn`. Empty when `tokens` is empty.
    """
    if not tokens:
        return set()

    emotional_words = set()
    # tag the word part of speech
    for token, tag in pos_tag(tokens):
        if tag.startswith(_EMOTIONAL_POS_PREFIXES) and token in afinn:
            emotional_words.add(token)

    return emotional_words

In [None]:
# NOTE(review): the cell that builds `integrated_comment` reads the columns
# `utterance_x` / `utterance_y`; confirm that the frame held in `artwork_data`
# at this point really has a plain `utterance` column.
# Missing utterances are replaced by an empty string so they produce an empty
# word list instead of breaking the text pipeline.
artwork_text = artwork_data['utterance'].fillna('')

# `extract_emotional_words` returns a set; sorting before joining makes the
# stored string identical from run to run.
emotional_words_column = [
    ", ".join(sorted(extract_emotional_words(text_preprocess(text))))
    for text in artwork_text
]

artwork_data['emotional_words'] = emotional_words_column

artwork_data.to_csv("E:\\Project and Dissertation in Data Science\\dataset\\artemis_official_data\\processed_artwork_data.csv", index=False)

In [None]:
# audio dataset
# On a fresh kernel `audio_data` is not defined yet at this point (it is first
# loaded in the MusicCaps section further down), so load it here and build the
# same `audio_name` column that section builds.
audio_data = pd.read_csv("E:\\Project and Dissertation in Data Science\\dataset\\archive\\updated_musiccaps-public.csv")
audio_data['audio_name'] = audio_data['ytid'] + '_' + audio_data['start_s'].astype(str) + '-' + audio_data['end_s'].astype(str)

# Missing captions are replaced by an empty string so they produce an empty
# word list instead of breaking the text pipeline.
audio_text = audio_data['caption'].fillna('')

# `extract_emotional_words` returns a set; sorting before joining makes the
# stored string identical from run to run.
emotional_words_column = [
    ", ".join(sorted(extract_emotional_words(text_preprocess(text))))
    for text in audio_text
]

audio_data['emotional_words'] = emotional_words_column

audio_data.to_csv("E:\\Project and Dissertation in Data Science\\dataset\\archive\\processed_audio_data.csv", index=False)

In [None]:
# Reload the artwork table from the processed CSV file on disk.
processed_artwork_csv = "E:\\Project and Dissertation in Data Science\\dataset\\artemis_official_data\\processed_artwork_data.csv"
artwork_data = pd.read_csv(processed_artwork_csv)
# The processed audio table is deliberately not loaded here:
# audio_data = pd.read_csv("E:\\Project and Dissertation in Data Science\\dataset\\archive\\processed_audio_data.csv")

In [None]:
# Fixed seed so that the LDA topics are the same on every run.
LDA_RANDOM_STATE = 42

# Group the rows by painting
grouped_data = artwork_data.groupby('painting')

# One record per painting: its name and its topic words
painting_topics = []

# Fit a separate small LDA model on the comments of each painting
for name, group in grouped_data:
    # Tokenize / clean every comment of this painting
    texts = group['utterance'].apply(text_preprocess).tolist()

    # Build the vocabulary
    dictionary = corpora.Dictionary(texts)

    # If pre-processing removed every token there is no vocabulary and
    # LdaModel cannot be trained; record the painting with no topic words.
    if len(dictionary) == 0:
        painting_topics.append({'painting': name, 'main_topic_words': ''})
        continue

    # Bag-of-words representation of every comment
    corpus = [dictionary.doc2bow(text) for text in texts]

    # Train the LDA model
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=3,
        passes=10,
        random_state=LDA_RANDOM_STATE,
    )

    # Collect the key words of every topic (a word may appear once per topic)
    topic_words = [
        word
        for topic_id in range(lda_model.num_topics)
        for word, _ in lda_model.show_topic(topic_id)
    ]

    # Add this painting's topic words to the result
    painting_topics.append({'painting': name, 'main_topic_words': ', '.join(topic_words)})

# Convert to a DataFrame and save it as a CSV file.
# NOTE(review): unlike the other exports in this notebook this one also writes
# the row index as an unnamed first column - confirm whether that is wanted.
result_df = pd.DataFrame(painting_topics)
result_df.to_csv("E:\\Project and Dissertation in Data Science\\dataset\\artemis_official_data\\processed_artwork_data2.csv")


MusicCaps Dataset

In [None]:
# Read the MusicCaps table.
audio_data = pd.read_csv("E:\\Project and Dissertation in Data Science\\dataset\\archive\\updated_musiccaps-public.csv")

# Build the clip name as "<ytid>_<start_s>-<end_s>".
clip_start = audio_data['start_s'].astype(str)
clip_end = audio_data['end_s'].astype(str)
audio_data['audio_name'] = audio_data['ytid'] + '_' + clip_start + '-' + clip_end

# Export only the clip name and its caption.
audio_name_and_caption = audio_data[['audio_name', 'caption']]
audio_name_and_caption.to_csv("E:\\Project and Dissertation in Data Science\\dataset\\archive\\audio_data.csv", index=False)

In [None]:
# Attach the MusicCaps `aspect_list` column to the processed audio table as
# `topic_words`. The two files are matched purely by row position.
df1 = pd.read_csv("E:\\Project and Dissertation in Data Science\\dataset\\archive\\processed_audio_data.csv")
df2 = pd.read_csv("E:\\Project and Dissertation in Data Science\\dataset\\archive\\updated_musiccaps-public.csv")

# A row-count mismatch would otherwise pair captions with the wrong aspects
# (or silently fill NaN), so fail loudly instead.
if len(df1) != len(df2):
    raise ValueError(
        f"Row count mismatch: processed audio data has {len(df1)} rows, "
        f"MusicCaps source has {len(df2)} rows"
    )

column = df2['aspect_list']

# `.to_numpy()` makes the assignment explicitly positional.
df1['topic_words'] = column.to_numpy()

df1.to_csv("E:\\Project and Dissertation in Data Science\\dataset\\archive\\processed_audio_data2.csv", index=False)

Use TF-IDF and TinyBERT to fuse the two independent datasets based on their text content. The idea is inspired by content-based recommendation algorithms.

Compare the two methods and choose the better one to perform the dataset fusion.

In [None]:
# Load both processed tables that the matching cells below work on.
processed_artwork_csv = "E:\\Project and Dissertation in Data Science\\dataset\\artemis_official_data\\processed_artwork_data.csv"
processed_audio_csv = "E:\\Project and Dissertation in Data Science\\dataset\\archive\\processed_audio_data2.csv"

artwork_data = pd.read_csv(processed_artwork_csv)
audio_data = pd.read_csv(processed_audio_csv)

## TF-IDF

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Build the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# The vocabulary is fitted on the artwork utterances only; the music captions
# are then projected into that same vocabulary.
artwork_comments_matrix = tfidf_vectorizer.fit_transform(artwork_data['utterance'])
music_comments_matrix = tfidf_vectorizer.transform(audio_data['caption'])

# Number of artwork rows compared against all captions at once; keeps the
# dense similarity matrix small enough to hold in memory.
batch_size = 1000

num_artworks = artwork_comments_matrix.shape[0]

# Best caption (row position in audio_data) and its score for every artwork row
best_match_index_chunks = []
best_match_score_chunks = []

for start_idx in range(0, num_artworks, batch_size):
    end_idx = min(start_idx + batch_size, num_artworks)

    # Similarity of the current artwork batch against every music caption
    similarity_matrix_batch = cosine_similarity(
        artwork_comments_matrix[start_idx:end_idx], music_comments_matrix
    )

    best_match_index_chunks.append(similarity_matrix_batch.argmax(axis=1))
    best_match_score_chunks.append(similarity_matrix_batch.max(axis=1))

best_match_indices = np.concatenate(best_match_index_chunks)
best_match_scores = np.concatenate(best_match_score_chunks)

# Assemble the result in one step with positional lookups instead of four
# `.loc` calls per artwork row.
matched_df = pd.DataFrame({
    'Artwork': artwork_data['painting'].to_numpy(),
    'Art_Utterance': artwork_data['utterance'].to_numpy(),
    'Music_Name': audio_data['audio_name'].to_numpy()[best_match_indices],
    'Music_Comment': audio_data['caption'].to_numpy()[best_match_indices],
    'Similarity_Score': best_match_scores,
})

# Save the matches to a new CSV file
matched_df.to_csv("E:\\Project and Dissertation in Data Science\\dataset\\matched_data_TF_IDF.csv", index=False)


# Model metrics
# `total_matches` was previously never assigned, so this cell failed with a
# NameError on a fresh kernel; it is the number of produced matches.
total_matches = len(matched_df)

# Coverage: share of artworks / music clips that appear in the matches.
# NOTE(review): the artwork numerator counts distinct paintings while the
# denominator counts rows of `artwork_data`; with several rows per painting
# this stays below 1 even though every row is matched - confirm this is intended.
total_artworks_matched = len(matched_df['Artwork'].unique())
total_music_matched = len(matched_df['Music_Name'].unique())
coverage_artwork = total_artworks_matched / len(artwork_data)
coverage_music = total_music_matched / len(audio_data)
coverage = (coverage_artwork + coverage_music) / 2

# Diversity: distinct matched music clips per produced match
diversity = total_music_matched / total_matches if total_matches > 0 else 0

# Mean similarity of the best matches
similarity_mean = matched_df['Similarity_Score'].mean()

# Print the model metrics
print("Model Metrics:")
print("Coverage:", coverage)
print("Diversity:", diversity)
print("Similarity:", similarity_mean)

Model Metrics:
Coverage: 0.49776216517837696
Diversity: 0.008711586748470683
Similarity: 0.1324532315258462


## TINYBERT

In [None]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

tokenizer = BertTokenizer.from_pretrained('prajjwal1/bert-tiny')
model = BertModel.from_pretrained('prajjwal1/bert-tiny')
model.eval()  # inference only: make sure dropout is disabled

# Number of artwork utterances encoded per forward pass
batch_size = 500

# `truncation=True` has no effect unless a maximum length is given (the
# tokenizer then warns and does not truncate), so use the model's own limit.
max_length = model.config.max_position_embeddings


def _cls_embeddings(texts):
    """Encode `texts` and return the last-layer [CLS] vector of each as a NumPy array."""
    tokens = tokenizer(
        list(texts),
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )
    with torch.no_grad():
        outputs = model(**tokens)
    return outputs.last_hidden_state[:, 0, :].numpy()


# The music captions are the same for every artwork batch, so they are
# encoded once here instead of once per batch.
music_embeddings = _cls_embeddings(audio_data['caption'])

num_artworks = len(artwork_data)

# Best caption (row position in audio_data) and its score for every artwork row
best_match_index_chunks = []
best_match_score_chunks = []

# Process the artwork utterances batch by batch
for start_idx in range(0, num_artworks, batch_size):
    end_idx = min(start_idx + batch_size, num_artworks)

    artwork_embeddings = _cls_embeddings(artwork_data['utterance'][start_idx:end_idx])

    # Similarity of the current artwork batch against every music caption
    similarity_matrix = cosine_similarity(artwork_embeddings, music_embeddings)

    best_match_index_chunks.append(similarity_matrix.argmax(axis=1))
    best_match_score_chunks.append(similarity_matrix.max(axis=1))

best_match_indices = np.concatenate(best_match_index_chunks)
best_match_scores = np.concatenate(best_match_score_chunks)

# Assemble the result in one step with positional lookups instead of four
# `.loc` calls per artwork row.
matched_df = pd.DataFrame({
    'Artwork': artwork_data['painting'].to_numpy(),
    'Art_Utterance': artwork_data['utterance'].to_numpy(),
    'Music_Name': audio_data['audio_name'].to_numpy()[best_match_indices],
    'Music_Comment': audio_data['caption'].to_numpy()[best_match_indices],
    'Similarity_Score': best_match_scores,
})

# Save the matches to a new CSV file
matched_df.to_csv("E:\\Project and Dissertation in Data Science\\dataset\\matched_data_TINYBERT.csv", index=False)


# Model metrics
# `total_matches` was previously never assigned, so this cell failed with a
# NameError on a fresh kernel; it is the number of produced matches.
total_matches = len(matched_df)

# Coverage: share of artworks / music clips that appear in the matches.
# NOTE(review): the artwork numerator counts distinct paintings while the
# denominator counts rows of `artwork_data`; with several rows per painting
# this stays below 1 even though every row is matched - confirm this is intended.
total_artworks_matched = len(matched_df['Artwork'].unique())
total_music_matched = len(matched_df['Music_Name'].unique())
coverage_artwork = total_artworks_matched / len(artwork_data)
coverage_music = total_music_matched / len(audio_data)
coverage = (coverage_artwork + coverage_music) / 2

# Diversity: distinct matched music clips per produced match
diversity = total_music_matched / total_matches if total_matches > 0 else 0

# Mean similarity of the best matches
similarity_mean = matched_df['Similarity_Score'].mean()

# Print the model metrics
print("Model Metrics:")
print("Coverage:", coverage)
print("Diversity:", diversity)
print("Similarity:", similarity_mean)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Model Metrics:
Coverage: 0.20477933710968926
Diversity: 0.0024816188900559435
Similarity: 0.77331626
