In [63]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()
import ast
import re
import spacy
from nltk import WordNetLemmatizer

In [3]:
df_epo_rel_on_science = pd.read_json('/mnt/hdd01/PATSTAT Working Directory/Reliance on Science/cleantech_epo_rel_on_science_abstract_yake.json')

In [4]:
# Iterate over rows in dataframe
for index, row in tqdm(df_epo_rel_on_science.iterrows()):
    try:
        # Cast row 'keywords_yake' column to string
        row['keywords_yake_abstract'] = str(row['keywords_yake_abstract']).lower()
        # Check if keywords_yake column starts with "[[[" and ends with "]]]""
        if row['keywords_yake_abstract'].startswith('[[[') and row['keywords_yake_abstract'].endswith(']]]'):
            # Remove first "[" and last "]" from keywords_yake_claims column
            row['keywords_yake_abstract'] = row['keywords_yake_abstract'][1:-1]
        # Cast row 'keywords_yake_claims' column to list
        row['keywords_yake_abstract'] = ast.literal_eval(row['keywords_yake_abstract'])
        # Assign modified 'keywords_yake_claims' list to temporary variable
        keywords_yake_temp = row['keywords_yake_abstract']
        # Assign temporary variable to 'keywords_yake_claims' column
        df_epo_rel_on_science.at[index, 'keywords_yake_abstract'] = keywords_yake_temp
    except:
        print(f'Error on line {index}')

102980it [00:14, 7341.92it/s]


In [5]:
keywords_list_rel = []
yake_conf_score_list = []
oaid_list = []
min_yake_conf = 1

# Iterate over rows in dataframe
for index, row in tqdm(df_epo_rel_on_science.iterrows()):
    # Check if 'keywords_yake' column is not a list
    if not isinstance(row['keywords_yake_abstract'], list):
        continue
    # Check if 'keywords_yake' column is an empty list or contains only empty lists
    if not any(keyword for keyword in row['keywords_yake_abstract']):
        continue
    # Iterate over keywords in 'keywords_yake' column and append to keywords_list_rel
    else:
        for keyword in row['keywords_yake_abstract']:
            if keyword[1] <= min_yake_conf:
                keywords_list_rel.append(keyword[0])
                yake_conf_score_list.append(keyword[1])
                oaid_list.append(row['oaid'])

# Create new dataframe
df_keywords_list_rel = pd.DataFrame({
    'keyword_yake': keywords_list_rel,
    'yake_conf_score': yake_conf_score_list,
    'oaid': oaid_list,
    'abs_frequency': 1
})

102980it [00:05, 19116.68it/s]


In [6]:
# Filter out non-alphanumeric keywords
df_keywords_list_rel = df_keywords_list_rel[
    df_keywords_list_rel['keyword_yake'].progress_apply(lambda x: all(word.isalnum() for word in x.split()))
]

100%|██████████| 1018283/1018283 [00:01<00:00, 945897.54it/s]


In [7]:
# Filter out all keywords shorter than 3 characters
df_keywords_list_rel = df_keywords_list_rel[
    df_keywords_list_rel['keyword_yake'].progress_apply(lambda x: len(x) > 2)
]

100%|██████████| 929827/929827 [00:00<00:00, 2011305.20it/s]


In [8]:
# Define a function to check if a string is an abbreviation
def is_abbreviation(keyword):
    # Regular expression to identify abbreviations (typically all uppercase and periods)
    # and check for all-uppercase abbreviations with 3 or fewer characters
    pattern = re.compile(r'\b(?:[A-Z]{1,}\.){2,}\b|\b[A-Z]{1,3}\b')
    return pattern.match(keyword) is not None

# Apply the function to filter out abbreviations
df_keywords_list_rel = df_keywords_list_rel[
    df_keywords_list_rel['keyword_yake'].progress_apply(lambda x: not is_abbreviation(x))
]

100%|██████████| 929827/929827 [00:00<00:00, 1011639.66it/s]


In [9]:
import nltk

stopwords = set(nltk.corpus.stopwords.words('english'))

# Function to remove keywords that are only one stopword or start/end with a stopword
def remove_stopwords(keyword):
    words = keyword.split()
    
    # If the keyword is a single stopword, remove it
    if len(words) == 1 and words[0] in stopwords:
        return ''
    
    # If the keyword starts or ends with a stopword, remove line
    if words[0] in stopwords:
        return ''
    if words and words[-1] in stopwords:
        return ''
    
    return ' '.join(words)

# Apply the function to remove stopwords
df_keywords_list_rel['keyword_yake'] = df_keywords_list_rel['keyword_yake'].progress_apply(remove_stopwords)

# Remove empty keywords
df_keywords_list_rel = df_keywords_list_rel[
    df_keywords_list_rel['keyword_yake'].progress_apply(lambda x: len(x) > 0)
]

100%|██████████| 929827/929827 [00:00<00:00, 1246341.12it/s]
100%|██████████| 929827/929827 [00:00<00:00, 2102413.53it/s]


In [None]:
# Download spacy model
nlp = spacy.load('en_core_web_sm')

# Filter out all keywords that do not contain NOUN, PRON, or PROPN in their POS tags
df_keywords_list_rel = df_keywords_list_rel[df_keywords_list_rel['keyword_yake'].progress_apply(
    lambda keyword: any(token.pos_ in ['NOUN', 'PRON', 'PROPN'] for token in nlp(keyword))
)]

# Reset index
df_keywords_list_rel.reset_index(drop=True, inplace=True)

In [11]:
lemmatizer = WordNetLemmatizer()
# Function to lemmatize each word in a keyword string
def lemmatize_keyword(keyword):
    words = keyword.split()  # Split the keyword into individual words
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)  # Join the lemmatized words back into a string

# Lemmatize each keyword string in the 'keyword_yake' column
df_keywords_list_rel['keyword_yake_lemma'] = df_keywords_list_rel['keyword_yake'].progress_apply(lemmatize_keyword)

100%|██████████| 929797/929797 [00:05<00:00, 162837.45it/s]


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Lemmatize the 'abstract' column in df_epo_rel_on_science
def lemmatize_text(text):
    return " ".join([token.lemma_ for token in nlp(text)])

# Applying the lemmatization function to each abstract
df_epo_rel_on_science['lemmatized_abstract'] = df_epo_rel_on_science['abstract'].progress_apply(lemmatize_text)

In [65]:
# Extracting the unique keywords from 'keyword_yake_lemma'
unique_keywords = df_keywords_list_rel['keyword_yake_lemma'].unique().tolist()

# Create a TfidfVectorizer object
vectorizer = TfidfVectorizer(vocabulary=unique_keywords)

# Assuming tfidf_matrix is a scipy sparse matrix
tfidf_matrix = vectorizer.fit_transform(df_epo_rel_on_science['lemmatized_abstract'])

# Calculate max and mean values for each term in the TF-IDF matrix
max_tfidf = tfidf_matrix.max(axis=0).toarray().ravel()

# Directly use ravel() as mean() returns a numpy matrix
mean_tfidf = np.squeeze(np.asarray(tfidf_matrix.mean(axis=0)))

In [79]:
df_keywords_list_rel.head()

Unnamed: 0,keyword_yake,yake_conf_score,oaid,abs_frequency,keyword_yake_lemma,max_tfidf,mean_tfidf
0,flora,0.063503,2092669321,1,flora,0.523987,0.000257
1,preterm,0.109574,2092669321,1,preterm,0.663905,7.5e-05
2,flora included bifidobacterium,0.111464,2092669321,1,flora included bifidobacterium,0.0,0.0
3,environment,0.112817,2092669321,1,environment,0.453096,0.001909
4,quails,0.144876,2092669321,1,quail,0.673007,3.9e-05


In [78]:
# Create a DataFrame with these statistics
tfidf_stats_df = pd.DataFrame({
    'keyword_yake_lemma': vectorizer.get_feature_names_out(),
    'max_tfidf': max_tfidf,
    'mean_tfidf': mean_tfidf
})

# Merge with your existing DataFrame
df_keywords_list_rel = df_keywords_list_rel.merge(tfidf_stats_df, on='keyword_yake_lemma', how='left')

In [80]:
# print the mean of the max_tfidf column
print(df_keywords_list_rel['max_tfidf'].mean())

0.21700028754397543


In [82]:
# Aggregate df_keywords_list_ep by 'keyword'
df_keywords_list_rel_agg = df_keywords_list_rel.groupby(['keyword_yake_lemma']).agg({
    'yake_conf_score': 'mean',
    'oaid': list,
    'abs_frequency': 'count',
    'max_tfidf': 'max',
    'mean_tfidf': 'mean'
}).reset_index()