# Keyword analysis

This script uses a smoothed TF-IDF algorithm to extract the ten most used words (lemmas) for each year in the input dataset.

**Note:**
In acknowledgment of the contributions made, portions of this code were developed with the guidance and assistance of ChatGPT.

In [1]:
import ast
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from tqdm import tqdm
tqdm.pandas()

from google.colab import drive
drive.mount('/content/drive')

# Input corpus (preprocessed CSV stored on the mounted Drive)
aca_df_path = '/content/drive/MyDrive/Colab Notebooks/Masters_Thesis/0_corpus/preprocessed_for_semantic_shift_tfidf/aca/preprocessed_aca_df.csv'
aca_df = pd.read_csv(aca_df_path)

# `lemmatized_tokens` holds each list as its string repr; parse it back into a
# real Python list and attach it as `tokens` on a copy, leaving aca_df untouched.
new_df = aca_df.assign(
    tokens=aca_df['lemmatized_tokens'].progress_apply(ast.literal_eval)
)

# Group data from the same year
def aggregate_tokens(x):
    """Concatenate an iterable of token lists into one flat list.

    Parameters
    ----------
    x : iterable of lists
        The per-document token lists of one group (e.g. the `tokens` values
        that pandas passes in for a single year).

    Returns
    -------
    list
        All tokens from every inner list, in their original order. An empty
        input yields an empty list.
    """
    # Local import so the function is usable without touching the import cell.
    from itertools import chain

    # `sum(x, [])` re-copies the accumulated list on every addition, which is
    # quadratic in the total number of tokens; chaining is linear and returns
    # the same flat list.
    return list(chain.from_iterable(x))

# Merge the token lists of all rows that share the same year
grouped = new_df.groupby('year')['tokens'].progress_apply(aggregate_tokens)
tokens_by_year_df = grouped.reset_index()

# Grouped data on Drive. The original cell only *read* this file, so a fresh
# environment without it failed and the aggregation above was never persisted.
# Now the file is written when it is missing and then read back, so `df` has
# the same on-disk-derived shape on every run. An existing file is never
# overwritten.
grouped_data_path = '/content/drive/MyDrive/Colab Notebooks/Masters_Thesis/0_corpus/preprocessed_for_semantic_shift_tfidf/aca/grouped_preprocessed_aca_df.csv'
try:
    df = pd.read_csv(grouped_data_path)
except FileNotFoundError:
    tokens_by_year_df.to_csv(grouped_data_path)
    df = pd.read_csv(grouped_data_path)

# The CSV stores each token list as its string repr; parse it back into a list
df['tokens'] = df['tokens'].progress_apply(ast.literal_eval)

Mounted at /content/drive


100%|██████████| 10/10 [00:07<00:00,  1.36it/s]


In [None]:
# Preview the first rows of df (head() shows five rows by default)
df.head()

Unnamed: 0,year,tokens
0,2013,"[evaluation, urban, citizens, awareness, clima..."
1,2014,"[public, perceptionclimate, risk, adaptation, ..."
2,2015,"[comprehensive, local, climatepolicy, role, ur..."
3,2016,"[world, regionalization, climatechange, 1961, ..."
4,2017,"[florida, puerto, rico, secondary, science, te..."


In [2]:
# step 1: turn each token list into a single space-separated string,
# the input format CountVectorizer expects
df['tokens_joined'] = [' '.join(token_list) for token_list in df['tokens']]
df.head()

Output hidden; open in https://colab.research.google.com to view.

In [3]:
# Step 2: use CountVectorizer to compute the frequency of words
vectorizer = CountVectorizer()
X_counts = vectorizer.fit_transform(df['tokens_joined'])

# Step 3: use TfidfTransformer to convert word frequency into smoothed TF-IDF
transformer = TfidfTransformer(smooth_idf=True)
X_tfidf = transformer.fit_transform(X_counts)

# Vocabulary (one entry per matrix column) and year labels (one per matrix row)
feature_names = vectorizer.get_feature_names_out()
years = df['year'].values

# Step 4: build the long table with one row per (token, year) pair.
# The earlier approach created a wide frame whose columns were the tokens and
# then assigned a 'year' column onto it. A token literally named "year" was
# therefore overwritten and dropped from the results, and melt() would raise
# for a token named "tfidf". Building the rows from a MultiIndex keeps
# vocabulary words out of the column namespace entirely.
# Row order is token-major (all years of token 0, then token 1, ...), which is
# the order the transposed matrix is flattened in below.
long_index = pd.MultiIndex.from_product([feature_names, years], names=['token', 'year'])
df_tfidf = (
    pd.Series(X_tfidf.toarray().T.ravel(), index=long_index, name='tfidf')
    .reset_index()[['year', 'token', 'tfidf']]
)

# Raw frequency of each token summed over ALL years (not per year):
# one row per token, taken directly as the column sums of the count matrix.
df_freq = pd.DataFrame({
    'token': feature_names,
    'raw_frequency': X_counts.toarray().sum(axis=0),
})

# Attach the corpus-wide raw frequency to every (year, token) row
df_final = pd.merge(df_tfidf, df_freq, on='token', how='left')

# Fix the column order of the final table
df_final = df_final[['year', 'token', 'tfidf', 'raw_frequency']]

  df_tfidf = df_tfidf.melt(id_vars=['year'], var_name='token', value_name='tfidf')


In [13]:
# Year to inspect
selected_year = 2022

# Keep only that year's rows and rank them from highest to lowest TF-IDF score
year_mask = df_final['year'] == selected_year
df_selected_year = df_final.loc[year_mask].sort_values(by='tfidf', ascending=False)

# The 50 best-scoring tokens for the year
top_50 = df_selected_year.iloc[:50]

# Show the ranking
print(top_50)

# Write the ranking to Drive (file name carries the selected year)
top_50.to_csv(f'/content/drive/MyDrive/Colab Notebooks/Masters_Thesis/5_results/smoothed_tfidf/wos/wos_{selected_year}_top50.csv')

        year          token     tfidf  raw_frequency
71649   2022  climatechange  0.644745          58704
66559   2022         change  0.366677          35870
71589   2022        climate  0.239513          19419
301699  2022          study  0.147217          11520
163029  2022       increase  0.118392          11013
160749  2022         impact  0.113011          10889
208069  2022          model  0.108662          11039
270999  2022         result  0.096645           8810
269679  2022       research  0.084556           6451
246649  2022         policy  0.075119           6929
135019  2022         future  0.072760           6978
46099   2022           base  0.072466           6140
23649   2022     adaptation  0.069738           7551
273789  2022           risk  0.069075           6039
111129  2022         effect  0.067379           6172
37779   2022           area  0.064357           5594
32339   2022       analysis  0.064283           5289
151369  2022           high  0.064062         