In [None]:
!pip install pandas numpy matplotlib nltk tqdm scikit-learn seaborn

In [None]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from tqdm import tqdm
nltk.download('vader_lexicon')
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import seaborn as sns

Load preprocessed files

In [None]:
def _load_pickle(path):
    """Return the object deserialized from the pickle file at `path`."""
    # NOTE(review): pickle can execute arbitrary code while loading; only use
    # this on artifacts produced by a trusted pipeline.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Artifacts loaded from the working directory; presumably written by an
# earlier preprocessing / LDA-training step — verify against that notebook.
preprocessed_docs = _load_pickle('preprocessed_docs.pkl')
bow_corpus = _load_pickle('bow_corpus.pkl')
dictionary = _load_pickle('dictionary.dict')
lda_model = _load_pickle('lda_model.pkl')
tfidf_model = _load_pickle('tfidf_model.pkl')

In [None]:
# Apply the TF-IDF model to the BOW corpus
tfidf_corpus = tfidf_model[bow_corpus]

# Load the dataset
file_name = '/content/drive/My Drive/Unsupervised aging literature/LDA model docs/LDA model 4 with cleaned data/cleaned_dataset.csv'
df = pd.read_csv(file_name, low_memory=False)

# Filter the dataset: unparseable years become NaN and are dropped by the
# range comparison, leaving only publications from 1975 to 2023 (inclusive).
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')
df = df[(df['Year'] >= 1975) & (df['Year'] <= 2023)].reset_index(drop=True)

# Align df with preprocessed_docs by keeping the first len(preprocessed_docs) rows.
# NOTE(review): this assumes row i of the filtered frame corresponds to
# preprocessed_docs[i] — confirm against the preprocessing step.
# `.copy()` makes df an independent frame, so the columns added by later cells
# are not written to a slice of another frame (SettingWithCopyWarning).
df = df.iloc[:len(preprocessed_docs)].copy()

# Print lengths to verify
print("Length of preprocessed_docs:", len(preprocessed_docs))
print("Length of bow_corpus:", len(bow_corpus))
print("Length of df:", len(df))

Run sentiment analysis on each document and store the sentiment scores for each keyword

In [None]:
# Initialize the sentiment analyzer
sid = SentimentIntensityAnalyzer()

# Perform sentiment analysis on each document (VADER 'compound' score).
# Non-string abstracts (e.g. NaN from missing CSV cells) get NaN instead of
# raising, so they are ignored by later means.
df['Sentiment'] = df['Abstract'].apply(
    lambda text: sid.polarity_scores(text)['compound'] if isinstance(text, str) else float('nan')
)

# Extract keywords using TF-IDF (vocabulary capped at 500 terms).
# Missing abstracts are vectorized as empty documents rather than raising.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Abstract'].fillna(''))
feature_names = tfidf_vectorizer.get_feature_names_out()

# Create a dictionary to store keyword sentiment scores
keyword_sentiment = {word: [] for word in feature_names}

# Assign sentiment scores to keywords.
# Tokenize with the vectorizer's own analyzer so tokens are normalized exactly
# like the keywords (lowercased, punctuation stripped). A plain `str.split()`
# yields tokens such as "Aging" or "cancer," that never match a keyword.
tokenize = tfidf_vectorizer.build_analyzer()
for abstract, score in zip(df['Abstract'], df['Sentiment']):
    if not isinstance(abstract, str):
        continue
    # One score is appended per occurrence, so repeated keywords weigh more.
    for token in tokenize(abstract):
        if token in keyword_sentiment:
            keyword_sentiment[token].append(score)

Generate a df with the average sentiment score for each keyword, abstract and year of publication

In [None]:
# Calculate the average sentiment score for each keyword and convert to a df
# (keywords that never matched an abstract get 0).
average_keyword_sentiment = {
    word: (sum(scores) / len(scores)) if scores else 0
    for word, scores in keyword_sentiment.items()
}
keyword_sentiment_df = pd.DataFrame(list(average_keyword_sentiment.items()), columns=['Keyword', 'Average Sentiment'])
years = df['Year'].tolist()
data = {'Year': years, 'Abstract': df['Abstract'], 'Sentiment': df['Sentiment']}
df_sentiment = pd.DataFrame(data)

# Group by year and calculate average sentiment for each keyword per year.
# Years are processed in sorted order so that position k in every keyword's
# list matches column k of the DataFrame built below.
sorted_years = sorted(df_sentiment['Year'].unique())

# Single pass over the abstracts, reusing the sentiment already stored in
# df_sentiment instead of re-running VADER for every keyword/year pair.
# Tokens come from the vectorizer's analyzer so they are normalized the same
# way as the keywords (lowercased, punctuation stripped).
tokenize = tfidf_vectorizer.build_analyzer()
keyword_set = set(feature_names)
yearly_sums = {}
yearly_counts = {}
for year, abstract, sentiment in zip(df_sentiment['Year'], df_sentiment['Abstract'], df_sentiment['Sentiment']):
    if not isinstance(abstract, str):
        continue
    # Each abstract contributes at most once per keyword (presence, not count).
    for word in keyword_set.intersection(tokenize(abstract)):
        key = (word, year)
        yearly_sums[key] = yearly_sums.get(key, 0) + sentiment
        yearly_counts[key] = yearly_counts.get(key, 0) + 1

# Each keyword maps to a list of (year, average sentiment) tuples; a year in
# which the keyword does not occur gets an average of 0.
keyword_yearly_sentiment = {}
for word in feature_names:
    yearly_values = []
    for year in sorted_years:
        key = (word, year)
        if key in yearly_counts:
            average_yearly_sentiment = yearly_sums[key] / yearly_counts[key]
        else:
            average_yearly_sentiment = 0
        yearly_values.append((year, average_yearly_sentiment))
    keyword_yearly_sentiment[word] = yearly_values

# Rows are keywords, columns are years; every cell is a (year, average) tuple.
keyword_yearly_sentiment_df = pd.DataFrame.from_dict(keyword_yearly_sentiment, orient='index', columns=sorted_years)

Plot the 10 keywords with the highest and lowest average sentiment scores

In [None]:
# keyword_yearly_sentiment_df stores (year, average) tuples in its cells.
# Build the numeric table this cell needs by keeping only the average; it was
# previously referenced without ever being defined (NameError on a fresh run).
numeric_keyword_yearly_sentiment_df = keyword_yearly_sentiment_df.apply(
    lambda column: column.map(lambda cell: cell[1] if isinstance(cell, tuple) else cell)
).astype(float)

# Mean over all years for each keyword.
# NOTE: this rebinds `average_keyword_sentiment` (previously a dict) to a
# pandas Series indexed by keyword.
average_keyword_sentiment = numeric_keyword_yearly_sentiment_df.mean(axis=1)
top_positive_keywords = average_keyword_sentiment.nlargest(10)
top_negative_keywords = average_keyword_sentiment.nsmallest(10)

# Capitalize the first letter of each keyword
top_positive_keywords.index = top_positive_keywords.index.str.capitalize()
top_negative_keywords.index = top_negative_keywords.index.str.capitalize()

fig, (ax_positive, ax_negative) = plt.subplots(1, 2, figsize=(14, 7))

# Plot top positive keywords
sns.barplot(x=top_positive_keywords.values, y=top_positive_keywords.index,
            palette=sns.color_palette("Reds", 10)[::-1], ax=ax_positive)
ax_positive.set_xlabel('Average Sentiment Score')
ax_positive.set_title('Top 10 Positive Keywords')

# Plot top negative keywords
sns.barplot(x=top_negative_keywords.values, y=top_negative_keywords.index,
            palette=sns.color_palette("Blues", 10)[::-1], ax=ax_negative)
ax_negative.set_xlabel('Average Sentiment Score')
ax_negative.set_title('Top 10 Negative Keywords')

fig.tight_layout()
fig.savefig('Top 10 positive and negative keywords.pdf')
plt.show()

Plot the evolution of sentiment score for all documents (10-year moving avg)

In [None]:
# Mean sentiment of all abstracts published in each year
average_sentiment_per_year = df_sentiment.groupby('Year')['Sentiment'].mean()

# Rolling mean over 10 yearly values; min_periods=1 lets the first years
# average over however many values are available so far.
moving_avg_sentiment = average_sentiment_per_year.rolling(window=10, min_periods=1).mean()

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    moving_avg_sentiment.index,
    moving_avg_sentiment,
    linestyle='-',
    color='darkgreen',
    label='10-Year Moving Average',
)
ax.set_xlabel('Year')
ax.set_ylabel('Average Sentiment Score')
ax.set_title('10-Year Moving Average of Sentiment Score Across All Documents')
ax.legend()
ax.grid(False)
fig.savefig(
    '/content/drive/MyDrive/Unsupervised aging literature/LDA model docs/LDA model 4 with cleaned data/Figuras paper/Sentiment topics/10-year moving average of sentiment score across all documents.pdf',
    dpi=600,
)
plt.show()

Plot the evolution of proportions of documents containing a specific word (10-year moving avg)

In [None]:
# Word whose prevalence is tracked; change this one value to analyse another term.
target_word = 'cancer'

# 1 if the abstract contains the word (case-insensitive literal substring
# match), else 0; missing abstracts count as 0.
df['word_presence'] = df['Abstract'].str.contains(target_word, case=False, na=False, regex=False).astype(int)
yearly_document_count = df.groupby('Year').size()
yearly_word_count = df.groupby('Year')['word_presence'].sum()
yearly_word_proportion = yearly_word_count / yearly_document_count
# Previously referenced the undefined name `yearly_cancer_proportion`.
word_moving_avg = yearly_word_proportion.rolling(window=10, min_periods=1).mean()

plt.figure(figsize=(10, 6))
plt.plot(word_moving_avg.index, word_moving_avg, linestyle='-', color='b', label='10-Year Moving Average')
plt.xlabel('Year')
plt.ylabel('Proportion of Documents')
plt.title(f'10-Year Moving Average of the Proportion of Documents containing "{target_word.capitalize()}"')
plt.legend()
plt.grid(False)
plt.savefig(f'10-year moving average of proportion of documents containing {target_word}.pdf', dpi=600)
plt.show()

Average sentiment score per LDA topic

In [None]:
# Define topic names (the position in this list is used as the topic id)
topic_names = [
    "Cell signaling", "Development", "CNS diseases", "Cardiovascular", "Age-related decline", "Risk factors",
    "Cell biology", "Gender", "Muscle", "Oxidative stress", "Bone", "Therapeutics", "Metabolism", "Neural tissue",
    "Clinics", "Healthcare", "General terms", "Brain structure", "Psychosocial", "Rodent studies", "Cancer",
    "Physical activity", "Demography", "Liver and kidney", "Genetics", "Analytics", "Cognition", "Physics", "Skin",
    "Clinical tests"
]

# Map each keyword to the topic with the highest score returned by
# lda_model.get_term_topics; keywords missing from the dictionary, or with no
# topics returned, are left unmapped.
keyword_to_topic = {}
for keyword in feature_names:
    if keyword not in dictionary.token2id:
        continue
    term_topics = lda_model.get_term_topics(dictionary.token2id[keyword])
    if not term_topics:
        continue
    keyword_to_topic[keyword] = max(term_topics, key=lambda pair: pair[1])[0]

# Collect, per topic, the average sentiment of every keyword assigned to it
topic_sentiment = {topic_id: [] for topic_id in range(len(topic_names))}
for keyword, score in average_keyword_sentiment.items():
    if keyword in keyword_to_topic:
        topic_sentiment[keyword_to_topic[keyword]].append(score)

# Topics without any assigned keyword get an average of 0
average_sentiment_per_topic = {}
for topic_id, scores in topic_sentiment.items():
    average_sentiment_per_topic[topic_id] = (sum(scores) / len(scores)) if scores else 0

average_sentiment_per_topic_df = pd.DataFrame(
    list(average_sentiment_per_topic.items()),
    columns=['Topic', 'Average Sentiment'],
)
average_sentiment_per_topic_df['Topic_Name'] = [
    topic_names[topic_id] for topic_id in average_sentiment_per_topic_df['Topic']
]
average_sentiment_per_topic_df = average_sentiment_per_topic_df.sort_values(by='Average Sentiment', ascending=False)

# One colour per bar, in plotted (sorted) order: red shades for positive
# averages, blue shades otherwise; the shade scales with the value's magnitude.
reds = sns.color_palette("Reds", as_cmap=True)
blues = sns.color_palette("Blues", as_cmap=True)
colors = [
    reds(0.4 + value / 2) if value > 0 else blues(0.4 - value / 2)
    for value in average_sentiment_per_topic_df['Average Sentiment']
]

fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='Average Sentiment', y='Topic_Name', data=average_sentiment_per_topic_df, palette=colors, ax=ax)
ax.set_xlabel('Average Sentiment Score')
ax.set_title('Average Sentiment per Topic')
fig.tight_layout()
fig.savefig('Average_sentiment_per_topic.pdf')
plt.show()