# Keyword analysis

This script uses a smoothed TF-IDF algorithm to extract the ten most used words (lemmas) for each year in the input dataset.

**Note:**
In acknowledgment of the contributions made, portions of this code were developed with the guidance and assistance of ChatGPT.

In [1]:
import ast
import os
from itertools import chain

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from tqdm import tqdm
# Register tqdm's progress_apply on pandas objects (used by the cells below)
tqdm.pandas()

# Colab-only: mount Google Drive so the /content/drive paths below resolve
from google.colab import drive
drive.mount('/content/drive')

# Location of the preprocessed corpus on the mounted Drive
red_df_path = '/content/drive/MyDrive/Colab Notebooks/Masters_Thesis/0_corpus/preprocessed_for_semantic_shift_tfidf/red/preprocessed_red_df.csv'
red_df = pd.read_csv(red_df_path)

# The CSV stores each token list as its string repr; parse it back into a real
# list and keep the result in a new 'tokens' column on a copy of the raw frame.
new_df = red_df.assign(
    tokens=red_df['lemmatized_tokens'].progress_apply(ast.literal_eval)
)

# Group data from the same year
def aggregate_tokens(x):
    """Concatenate an iterable of token lists into one flat list.

    Parameters
    ----------
    x : iterable of list
        The token lists to merge (the groupby call below passes the
        'tokens' values of one year).

    Returns
    -------
    list
        All tokens of all sub-lists, in their original order. The inputs
        are not modified.
    """
    # chain visits every sub-list exactly once; sum(x, []) would re-copy the
    # growing accumulator for each sub-list, which is quadratic in corpus size.
    return list(chain.from_iterable(x))

# One row per year, holding the concatenation of all token lists of that year
grouped = new_df.groupby('year')['tokens'].progress_apply(aggregate_tokens)
tokens_by_year_df = grouped.reset_index()

# Output data
grouped_data_path = '/content/drive/MyDrive/Colab Notebooks/Masters_Thesis/0_corpus/preprocessed_for_semantic_shift_tfidf/red/grouped_preprocessed_red_df.csv'

# Write the grouped table when no saved copy exists yet. Previously this cell
# only read the file, so it failed on a fresh environment because nothing in
# the notebook ever created it. An existing file is left untouched.
if not os.path.exists(grouped_data_path):
    tokens_by_year_df.to_csv(grouped_data_path)

# Work from the saved copy; the CSV round trip turns the token lists into
# strings, so parse them back into lists.
df = pd.read_csv(grouped_data_path)
df['tokens'] = df['tokens'].progress_apply(ast.literal_eval)

Mounted at /content/drive


100%|██████████| 10/10 [00:23<00:00,  2.32s/it]


In [None]:
# Preview the first rows of df (year and its token list)
df.head()

Unnamed: 0,year,tokens
0,2013,"[discuss, climatechange, skeptic, site, pull, ..."
1,2014,"[destroy, polar, vortex, earth, stop, spin, vo..."
2,2015,"[denier, rely, tired, mislead, analysis, unsou..."
3,2016,"[sure, talk, antarctica, contribute, sealevel,..."
4,2017,"[glad, hear, millennial, real, estate, mean, t..."


In [2]:
# Step 1: CountVectorizer takes one string per document, so glue each year's
# tokens together with single spaces
df['tokens_joined'] = df['tokens'].apply(' '.join)
df.head()

Output hidden; open in https://colab.research.google.com to view.

In [3]:
# Step 2: use CountVectorizer to compute the frequency of words
# (one row per year, one column per vocabulary token)
vectorizer = CountVectorizer()
X_counts = vectorizer.fit_transform(df['tokens_joined'])

# Step 3: use TfidfTransformer to convert word frequency into smoothed TF-IDF
transformer = TfidfTransformer(smooth_idf=True)
X_tfidf = transformer.fit_transform(X_counts)

# Step 4: reshape the (year x token) matrix into long format, one row per
# (token, year) pair.
# The year is attached through a MultiIndex instead of being added as a column
# to the wide token table: assigning a 'year' column there overwrites the
# scores of the vocabulary token "year" and silently drops it from the results.
feature_names = vectorizer.get_feature_names_out()
long_index = pd.MultiIndex.from_product(
    [feature_names, df['year'].values], names=['token', 'year']
)
# Transposing before ravel() lays the values out token by token, which is the
# order long_index iterates in (token outermost, year innermost).
df_tfidf = pd.DataFrame(
    {'tfidf': X_tfidf.toarray().T.ravel()}, index=long_index
).reset_index()[['year', 'token', 'tfidf']]

# compute the raw frequency: the count of each token summed over all years
df_freq = pd.DataFrame({
    'token': feature_names,
    'raw_frequency': X_counts.toarray().sum(axis=0),
})

# attach that corpus-wide raw frequency to every (year, token) row of df_tfidf
df_final = pd.merge(df_tfidf, df_freq, on='token', how='left')

# choose the columns
df_final = df_final[['year', 'token', 'tfidf', 'raw_frequency']]

In [13]:
# Year to inspect
selected_year = 2022

# Keep only that year's rows and order them from highest to lowest TF-IDF
df_selected_year = (
    df_final
    .loc[df_final['year'] == selected_year]
    .sort_values('tfidf', ascending=False)
)

# The 50 best-scoring tokens of the year
top_50 = df_selected_year.head(50)

# Show the table in the cell output
print(top_50)

# Store the same table as a CSV on Drive
top_50.to_csv(f'/content/drive/MyDrive/Colab Notebooks/Masters_Thesis/5_results/smoothed_tfidf/red/red_{selected_year}_top50.csv')

        year          token     tfidf  raw_frequency
76369   2022  carbondioxide  0.240719          32819
395999  2022         people  0.213062          30979
92589   2022  climatechange  0.202714          31858
83619   2022         change  0.181525          23695
92369   2022        climate  0.156578          24447
539959  2022           time  0.155192          22301
266059  2022       increase  0.151435          18907
358149  2022           need  0.136005          18717
171269  2022         energy  0.128829          15854
225239  2022           good  0.121345          18231
530269  2022    temperature  0.111890          16773
79829   2022          cause  0.106192          14477
329419  2022           mean  0.103143          15277
159599  2022          earth  0.100464          13799
168539  2022       emission  0.099632          14564
253209  2022          human  0.094550          13472
76229   2022         carbon  0.093349          12505
598469  2022          world  0.092887         