In [1]:
pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl.metadata (11 kB)
Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-2.7.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install pygraphviz

Collecting pygraphviz
  Downloading pygraphviz-1.13.tar.gz (104 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.6/104.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hBuilding wheels for collected packages: pygraphviz
  Building wheel for pygraphviz (pyproject.toml) ... [?25ldone
[?25h  Created wheel for pygraphviz: filename=pygraphviz-1.13-cp310-cp310-linux_x86_64.whl size=97929 sha256=c7934e83450c10c4e2225c6e68c9ebec14da59b0c925c5a4bbf422a01bd73944
  Stored in directory: /root/.cache/pip/wheels/c5/96/10/6c25add1fffc368b1927252bf73b63fcb938de8f4486e23691
Successfully built pygraphviz
Installing collected packages: pygraphviz
Successfully installed pygraphviz-1.13
Note: you may need to restart the kernel to use updated packages.


In [3]:
import numpy as np
from sentence_transformers import SentenceTransformer
import pandas as pd
from nltk.corpus import stopwords
import nltk
import re
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
nltk.download("stopwords")
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import community as community_louvain
import pickle 
import os
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Pretrained sentence-embedding model; its encode() method is used further
# down to turn each cleaned abstract into a vector.
model = SentenceTransformer('WhereIsAI/UAE-Large-V1')

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/171 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/65.3k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

In [5]:
# Load the full paper table. Later cells read its `year`, `month` and
# `abstract` columns.
# NOTE(review): input is read from ../data while the export cell writes under
# ../../data -- confirm that the two locations are intentional.
df = pd.read_csv("../data/main_file.csv")

In [7]:
# One frame per publication year (2018-2023), keeping only rows that have an
# abstract. The df_<year> names are looked up by the export cell below.
df_2018, df_2019, df_2020, df_2021, df_2022, df_2023 = (
    df[df['year'] == publication_year].dropna(subset=['abstract'])
    for publication_year in range(2018, 2024)
)

In [8]:
# Cache of removable stop-word sets, keyed by (language, kept words).
_STOPWORD_CACHE = {}


def _removable_stopwords(language='english', keep=('not', 'can')):
    """Return the stop words to drop for ``language``, excluding ``keep``.

    The word list is fetched from ``stopwords.words`` once per
    (language, keep) pair and cached as a frozenset, so membership tests are
    constant time and the corpus is not queried again on later calls.
    """
    cache_key = (language, tuple(keep))
    if cache_key not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[cache_key] = frozenset(stopwords.words(language)) - set(keep)
    return _STOPWORD_CACHE[cache_key]


def text_preprocessing(s):
    """
    Normalise one piece of text and return the cleaned string.

    Steps, in order:
    - Lowercase the text
    - Change "'t" to " not"
    - Remove "@name" when it is followed by whitespace
    - Pad ' " ! ? \\ / , with spaces, then replace every character that is
      not a word character, whitespace, "?", "." or "'" with a space
    - Remove digits
    - Replace ; : | • « and newlines with a space
    - Remove English stop words except "not" and "can"
    - Collapse runs of whitespace and strip both ends
    """
    s = s.lower()
    # Change 't to 'not'
    s = re.sub(r"\'t", " not", s)
    # Remove @name
    s = re.sub(r'(@.*?)[\s]', ' ', s)
    # Isolate punctuation, then drop everything except word chars, '?', '.', "'"
    s = re.sub(r'([\'\"\!\?\\/\,])', r' \1 ', s)
    s = re.sub(r'[^\w\s\?\.\']', ' ', s)
    # Remove number
    s = re.sub(r'[0-9]', '', s)
    # Remove some special characters
    s = re.sub(r'([\;\:\|•«\n])', ' ', s)
    # Remove stopwords except 'not' and 'can'. The set is built once and
    # reused; previously stopwords.words('english') was called for every token
    # and the returned list was scanned linearly.
    removable = _removable_stopwords()
    s = " ".join(word for word in s.split() if word not in removable)
    # Collapse whitespace and strip both ends
    s = re.sub(r'\s+', ' ', s).strip()

    return s

In [9]:
# Columns whose text is cleaned before embedding.
clean_cols = ['abstract']


def _clean_text_frame(frame):
    """Apply ``text_preprocessing`` to every cell of ``frame``.

    ``DataFrame.applymap`` is deprecated since pandas 2.1 in favour of
    ``DataFrame.map``; use the new name when the installed pandas has it and
    fall back to ``applymap`` otherwise.
    """
    elementwise = frame.map if hasattr(pd.DataFrame, 'map') else frame.applymap
    return elementwise(text_preprocessing)


# A named function (rather than a lambda) keeps the transformer picklable.
txt_pipe = Pipeline([('clean_text', FunctionTransformer(_clean_text_frame))])

# Only the columns in clean_cols are passed through the text pipeline; the
# transformer's output therefore contains just those cleaned columns.
col_trans = ColumnTransformer(transformers=[
    ('txt_pipe', txt_pipe, clean_cols),
    ], )

In [None]:
# Export, for every year and every month with data, a similarity graph of
# paper abstracts plus its layout, communities, per-community keywords and
# centrality scores.

SIM_THRESHOLD = 0.65   # minimum cosine similarity for two papers to be linked
TOP_N_KEYWORDS = 5     # keywords kept per community
RANDOM_SEED = 123      # shared seed for the layout and the community detection

# Explicit mapping instead of eval(f'df_{y}'): same frames, no string evaluation.
yearly_frames = {
    '2018': df_2018,
    '2019': df_2019,
    '2020': df_2020,
    '2021': df_2021,
    '2022': df_2022,
    '2023': df_2023,
}

# NOTE(review): nothing below writes into 'graphs_info'; kept so the cell's
# filesystem side effects are unchanged.
os.makedirs('graphs_info', exist_ok=True)
for y, year_df in yearly_frames.items():
    year_dir = f'../../data/{y}'
    os.makedirs(year_dir, exist_ok=True)

    # Clean the abstracts in place, then embed them (one vector per row).
    year_df[clean_cols] = col_trans.fit_transform(year_df)
    words = year_df.abstract.values
    vectors = model.encode(words)
    year_df['abs_vector'] = vectors.tolist()

    year_df.to_csv(os.path.join(year_dir, f'{y}_paper_info.csv'))

    for i in range(1, 13):
        rows_month_i = year_df.loc[year_df.month == i]
        # An empty month must not stop the remaining months from being
        # processed, so skip it instead of breaking out of the loop.
        if rows_month_i.empty:
            continue
        index_vector = rows_month_i.index
        month_vectors = np.array(rows_month_i['abs_vector'].tolist())

        # Keep only the strictly lower triangle so each pair is considered
        # once and self-similarity is ignored.
        sims = np.tril(cosine_similarity(month_vectors), k=-1)
        indices = np.argwhere(sims > SIM_THRESHOLD)
        print(f'indices shape = {indices.shape}')

        # Collect edges as records and build the frame once; concatenating
        # inside the loop is quadratic. The frame's index is now 0..n-1
        # (it used to be all zeros as an artefact of the repeated concat).
        edge_records = []
        G = nx.Graph()
        for row, col in indices:
            target = index_vector[row]
            source = index_vector[col]
            weight = sims[row, col]
            edge_records.append({'target': target, 'source': source, 'weight': weight})
            G.add_edge(target, source, weight=weight)
        tmp_df = pd.DataFrame(edge_records, columns=['target', 'source', 'weight'])

        pos = nx.spring_layout(G, dim=3, seed=RANDOM_SEED)
        # Seeded so repeated runs produce the same communities.
        partition = community_louvain.best_partition(G, random_state=RANDOM_SEED)

        # Graph nodes are DataFrame index *labels*, so the abstracts have to
        # be looked up by label; indexing the positional `words` array with
        # them picks the wrong rows or raises IndexError.
        node_labels = list(partition.keys())
        node_texts = rows_month_i.loc[node_labels, 'abstract'].to_numpy()
        community_ids = np.array(list(partition.values()))

        keywords_group = dict()
        tf_idf = TfidfVectorizer()
        for pid in set(partition.values()):
            output = tf_idf.fit_transform(node_texts[community_ids == pid])
            feature_names = tf_idf.get_feature_names_out()
            # Highest TF-IDF score each term reaches in any document of the community.
            tfidf_scores = output.max(0).toarray()[0]
            ranked = sorted(zip(feature_names, tfidf_scores),
                            key=lambda pair: pair[1], reverse=True)
            keywords_group[pid] = "/ ".join(word for word, _ in ranked[:TOP_N_KEYWORDS])

        dc = nx.degree_centrality(G)
        bc = nx.betweenness_centrality(G)

        month_dir = f'../../data/{y}/{i}_month'
        os.makedirs(month_dir, exist_ok=True)
        artifacts = {
            'topics.pkl': keywords_group,
            'pos.pkl': pos,
            'partition.pkl': partition,
            'degree_centrality.pkl': dc,
            'between_centrality.pkl': bc,
        }
        for file_name, payload in artifacts.items():
            with open(os.path.join(month_dir, file_name), 'wb') as f:
                pickle.dump(payload, f)
        tmp_df.to_csv(os.path.join(month_dir, 'graph.csv'))
    print(f'Done year {y}')

print('All Done!!')