# Imports

In [1]:
import os, random, re
from typing import Tuple, List, Dict, NewType

import pandas as pd

# File reading

In [2]:
# !unzip -q /content/meta.zip

In [3]:
META_ARTICLES_DIRECTORY_PATH = '/content/meta/articles/'

## Testing file reading

In [4]:
from typing import Tuple
import random
import os
def test_file_opening(chosen_file_to_open: str = None) -> bool:
    if not chosen_file_to_open:
        chosen_file_to_open = random.choice(os.listdir(META_ARTICLES_DIRECTORY_PATH))
    chosen_file_to_open_path = os.path.join(META_ARTICLES_DIRECTORY_PATH, chosen_file_to_open)
    chosen_file = open(chosen_file_to_open_path, "r")

    try:
        print(f"Chosen file ({chosen_file_to_open}):")
        print(chosen_file.read(),end="")
        print("-"*80)
        print("Everything seems to work just fine!")
        return True
    except Exception as e:
        print("Something went wrong!")
        print(f"Error: {e}")
        return False

In [5]:
# %%capture
import random

chosen_file_to_open = "0001001.abs"
chosen_result = test_file_opening(chosen_file_to_open)
print()

random_file_to_open = random.choice(os.listdir(META_ARTICLES_DIRECTORY_PATH))
random_result = test_file_opening()
print()

if chosen_result and random_result:
    print("-"*80)
    print(f"Both Chosen file and Random file opened just fine!")
else:
    print(f"Something went wrong with {'chosen file' if not chosen_result else ('random file' if not random_result else 'both chosen and random files')}!")

Chosen file (0001001.abs):
------------------------------------------------------------------------------
\\
Paper: hep-th/0001001
From: Paul S. Aspinwall <psa@math.duke.edu>
Date: Sat, 1 Jan 2000 00:02:31 GMT   (84kb)
Date (revised v2): Mon, 17 Jan 2000 14:52:43 GMT   (85kb)

Title: Compactification, Geometry and Duality: N=2
Authors: Paul S. Aspinwall
Comments: 82 pages, 8 figures, LaTeX2e, TASI99, refs added and some typos fixed
Report-no: DUKE-CGTP-00-01
\\
  These are notes based on lectures given at TASI99. We review the geometry of
the moduli space of N=2 theories in four dimensions from the point of view of
superstring compactification. The cases of a type IIA or type IIB string
compactified on a Calabi-Yau threefold and the heterotic string compactified on
K3xT2 are each considered in detail. We pay specific attention to the
differences between N=2 theories and N>2 theories. The moduli spaces of vector
multiplets and the moduli spaces of hypermultiplets are reviewed. In the ca

## Reading files and saving in the memory

In [6]:
# Reading files in the directory
from typing import Tuple, List, Dict, NewType
Filename = NewType("Filename", str)
FileContent = NewType("FileContent", str)
FileDict = Dict[Filename, FileContent]
def read_files(directory: str) -> Tuple[int, FileDict]:
    file_counter = 0
    files = {}
    for filename in os.listdir(META_ARTICLES_DIRECTORY_PATH):
        file_counter += 1
        try:
            f = open(META_ARTICLES_DIRECTORY_PATH + filename , "r")
            lines = f.read()
            files[filename.replace(".abs", "")] = lines
        except Exception as e:
            continue
    return file_counter, files

In [7]:
%%capture
file_counter, files = read_files(META_ARTICLES_DIRECTORY_PATH)

In [8]:
print(f"Amount of files read: {file_counter}")
print(f"Amount of resulted files: {len(files)}")
print("Everything adds up!") if file_counter == len(files) else print(f"There seems to be {file_counter - len(files)} missing files!")

Amount of files read: 29555
Amount of resulted files: 29555
Everything adds up!


---

# Article Generation

## Classes and Functions

### Article Class

In [9]:
categories = ["Paper_ID", "Authors", "Title", "Abstract", "Subjects", "Comments"]
class Article:
    def __init__(self):
        self.paper_id = 0
        self.authors = []
        self.title = ""
        self.abstract = ""
        self.subjects = []
        self.comments = 0
    def __dict__(self):
        return {"paper_id": self.paper_id, "authors": self.authors, "title": self.title, "abstract": self.abstract, "subjects": self.subjects, "comments": self.comments}

### Preprocessing

#### File's Meta Preprocessing

In [10]:
from typing import List
def preprocess_file_meta(file_content: str) -> List[str]:
    file_meta = []
    continous_line = True
    file_meta_lines = list(filter(lambda x: x, file_content.split("\n")))
    file_meta.append(file_meta_lines[0])
    for file_meta_line_index in range(1, len(file_meta_lines)):
        meta_line = file_meta_lines[file_meta_line_index]
        if (meta_line.startswith("Author")
            or meta_line.startswith("Title")
            or meta_line.startswith("Subj-class")
            or meta_line.startswith("Comments")
            ):
            file_meta.append(meta_line)
            continous_line = True
        elif meta_line.startswith(r"  ") and continous_line:
            file_meta[-1] = file_meta[-1] + meta_line
        else:
            continous_line = False
    return file_meta

#### Author Name Preprocessing

In [11]:
import re
def preprocess_name_concatenations(name : str) -> str:
    while True:
        search = re.search(r'(?![A-Z])([a-z]|[A-Z]|\.)[A-Z](?![A-Z])', name)
        if search:
            name = name[0:search.start()+1] + " " + name[search.end()-1:]
        else:
            break
    return name

In [12]:
import re
def preprocess_name_initials(name : str) -> str:
    while True:
        search = re.search(r'[A-Z]\b(?!\.)', name)
        if search:
            name = name[0:search.start()+1] + "." + name[search.end():]
        else:
            break
    return name

In [13]:
import re
def preprocess_author_name_pipeline(author_name: str) -> str:
    author_name = author_name.replace("\n", " ")
    
    author_name = re.sub(r'\(', ' ( ', author_name) 
    author_name = re.sub(r'\)', ' ) ', author_name)  

    while True:
        if not re.search(r'\((?!\().*?\)', author_name):
            break
        author_name = re.sub(r'\((?!\().*?\)', ',', author_name)    
    
    author_name = re.sub(r'\s{2,}', ' ', author_name)
    author_name = re.sub(r'[^a-zA-Z., ]+', '', author_name)
    
    author_name = preprocess_name_concatenations(author_name)
    author_name = preprocess_name_initials(author_name)

    author_name = author_name.strip()

    return author_name

#### Default Preprocessing
Intended for `Title`, `Abstract` and `Subjects`

In [14]:
import re
def default_preprocess_pipeline(string: str) -> str:
    string = string.replace("\n", " ")
    string = re.sub(r'\s{2,}', ' ', string)
    
    string = string.strip()

    return string

### Processing Files

In [15]:
from typing import List
def process_files(files: List[str]) -> List[Article]:
    articles = []
    for filename, file_content in files.items():

        new_article = Article()
        new_article.paper_id = filename
        file_content = file_content.split(r"\\")
        file_meta = file_content[1]
        file_meta = preprocess_file_meta(file_meta)

        for meta in file_meta:
            if meta.startswith("Author"):

                postfix_string = None
                if meta.startswith("Authors: "):
                    postfix_string = meta[len("Authors: "):]
                elif meta.startswith("Author: "):
                    postfix_string = meta[len("Author: "):]

                postfix_string = preprocess_author_name_pipeline(postfix_string)
                new_authors = re.split(r",| and ", postfix_string)
                new_authors = list(map(preprocess_author_name_pipeline, new_authors))
                new_authors = list(filter(lambda x: len(x)>3, new_authors))
                new_article.authors = new_authors

            elif meta.startswith("Title:"):
                postfix_string = meta[len("Title: "):]
                title = default_preprocess_pipeline(postfix_string)
                new_article.title = title

            elif meta.startswith("Subj-class:"):
                postfix_string = meta[len("Subj-class: "):]
                subjects = default_preprocess_pipeline(postfix_string)
                subjects = re.split(r',|;', subjects)
                subjects = list(map(default_preprocess_pipeline, subjects))
                subjects = list(filter(lambda x: len(x)>3, subjects))
                new_article.subjects = subjects

            elif meta.startswith("Comments:"):
                postfix_string = meta[len("Comments: "):]
                search = re.search(r"(\d+)(?=.*p)", postfix_string)
                if search:
                    new_article.comments = int(search.group(0))

        file_abstract = file_content[2]
        file_abstract = default_preprocess_pipeline(file_abstract)
        new_article.abstract = file_abstract

        articles.append(new_article)
        
    return articles

## Article Generation from files

In [16]:
# Process articles
articles = process_files(files)
# Make dictionaries out of class
articles = [article.__dict__() for article in articles]

## Testing Article Generation

In [17]:
# Checking, if all files were converted
print(f"Files in dataset: {len(files)}")
print(f"Converted articles: {len(articles)}")
print(f"Everything converted just fine!") if len(files) == len(articles) \
else print(f"{len(files) - len(articles)} (around {(100 - (len(articles)/len(files))*100):.3f}%) were lost during conversion!")

Files in dataset: 29555
Converted articles: 29555
Everything converted just fine!


In [18]:
# Getting random article to check
import random
paper_id = "9907150"
chosen_article = next(filter(lambda a: a['paper_id'] == paper_id, articles))
random_article = random.choice(articles)
print("Chosen article:", chosen_article)
print()
print("Random article:", random_article)

Chosen article: {'paper_id': '9907150', 'authors': ['Fabio Scardigli'], 'title': 'Gravity coupling from micro-black holes', 'abstract': 'Recently much work has been done in lowering the Planck threshold of quantum gravitational effects (sub-millimeter dimension(s), Horava-Witten fifth dimension, strings or branes low energy effects, etc.). Working in the framework of 4-dim gravity, with semi-classical considerations based on Hawking evaporation of planckian micro-black holes, I shall show here as quantum gravity effects could occur also near GUT energies.', 'subjects': [], 'comments': 5}

Random article: {'paper_id': '9401071', 'authors': ['Mikhail S. Volkov'], 'title': 'Fermion number non-conservation and gravity', 'abstract': 'It is shown that in the Einstein-Yang-Mills (EYM) theory, as well as in the pure flat space Yang-Mills (YM) theory, there always exists an opportunity to pass over the potential barrier separating homotopically distinct vacuum sectors, because the barrier heigh

# Articles further preprocessing (TBD)


# pandas DataFrame Pipeline

In [19]:
# Create dataframe from articles list
df = pd.DataFrame(data=articles)

In [20]:
df

Unnamed: 0,paper_id,authors,title,abstract,subjects,comments
0,9810034,"[Anastasia Doikou, Rafael I. Nepomechie]",Parity and Charge Conjugation Symmetries and S...,We formulate the notion of parity for the peri...,"[High Energy Physics - Theory, Exactly Solvabl...",20
1,0201111,[G. Papadopoulos],KT and HKT Geometries in Strings and in Black ...,Some selected applications of KT and HKT geome...,"[High Energy Physics - Theory, Differential Ge...",26
2,9210111,[A. P. Balachandran],"Gauge Symmetries,Topology and Quantisation",The following two loosely connected sets of to...,[],74
3,9303104,[P. Berglund],Dimensionally Reduced Landau-Ginzburg Orbifold...,"It is observed that a large class of $(2,2)$ s...",[],10
4,9509068,"[Tae Seong Kim, Won Ho Kye, Jae Kwan Kim]",The Dynamical Behaviors in (2+1)-Dimensional G...,We analyze (2+1)-dimensional Gross-Neveu model...,[],19
...,...,...,...,...,...,...
29550,0210293,[Dmitri Antonov],Finite-temperature properties of the supersymm...,The finite-temperature properties of supersymm...,[],8
29551,9310065,[Aurelian Isar],Wigner distribution function for the harmonic ...,Time evolution of the expectation values of va...,[],17
29552,0103002,[Kazuto Oshima],Critical Coupling in (1+1)-Dimensional Light-F...,Spontaneous symmetry breaking in (1+1)-dimensi...,[],21
29553,9406137,"[Cesar Gomez, Henri Ruegg, Philippe Zaugg]",Lattice Poincare as a quantum deformed algebra,We propose a definition of a Poincar\'e algebr...,[],10


## DataFrame quick analyse

In [21]:
def flat_list(data):
    return [item for sublist in data for item in sublist]

In [22]:
# Amount of unique authors
len(set(flat_list(df["authors"])))

13191

In [23]:
#Unique authors
set(flat_list(df["authors"]))

{'Domenec Espriu',
 'M. Maruyama',
 'V. V. Kuratov',
 'R. Rajaraman',
 'John Uglum',
 'B. Wehefritz Kaufmann',
 'B. Schroer',
 'Alexander G. Abanov',
 'V. S. Alves',
 'Chong Oh Lee',
 'G. T. Ter Kazarian',
 'Jeroen Wijnhout',
 'N. Sanchez',
 'Floriana Gargiulo',
 'Antonio J. Segui Santonja',
 'Richard Shurtleff',
 'Takahiro Tanaka',
 'Piotr Kosinski',
 'Anirvan M. Sengupta',
 'C. Itoi',
 'G. L. Huang',
 'A. Giaquinto',
 'M. Sato',
 'A. Mondragon',
 'Michel Rausch de Traubenberg',
 'W. S. l Yi',
 'Graziano Vernizzi',
 'Luigi Tedesco',
 'Robin Horan',
 'Alejandro Rivero',
 'A. P. Flitney',
 'V. E. Markushin',
 'Benjamin Grinstein',
 'Matthew Lippert',
 'P. Hajicek',
 'Karim A. Malik',
 'P. Sundell',
 'Igor N. Nikitin',
 'Harunobu Kubo',
 'Brent Nelson',
 'Andrei Micu',
 'G. K. Savvidy',
 'R. A. Leo',
 'Takesi Suzuki',
 'W. Bietenholz',
 'J. Santiago',
 'Antero Hietamaki',
 'M. Oleszczuk',
 'Ingo Runkel',
 'T. Haugset',
 'F. Aigner',
 'G. H. Lee',
 'Barry Mc Coy',
 'J. Saavedra',
 'Konsta

## Representation fixes

In [52]:
# Make Authors in one string with comma as sepparator
df["authors"] = df["authors"].map(lambda x: str(x).replace("[", "")).map(lambda x: str(x).replace("]", "")).map(lambda x: str(x).replace("'", ""))

In [53]:
# Make Subjects in one string with comma as sepparator
df["subjects"] = df["subjects"].map(lambda x: str(x).replace("[", "")).map(lambda x: str(x).replace("]", "")).map(lambda x: str(x).replace("'", ""))

In [54]:
df

Unnamed: 0,paper_id,authors,title,abstract,subjects,comments
0,9810034,"Anastasia Doikou, Rafael I. Nepomechie",Parity and Charge Conjugation Symmetries and S...,We formulate the notion of parity for the peri...,"High Energy Physics - Theory, Exactly Solvable...",20
1,0201111,G. Papadopoulos,KT and HKT Geometries in Strings and in Black ...,Some selected applications of KT and HKT geome...,"High Energy Physics - Theory, Differential Geo...",26
2,9210111,A. P. Balachandran,"Gauge Symmetries,Topology and Quantisation",The following two loosely connected sets of to...,,74
3,9303104,P. Berglund,Dimensionally Reduced Landau-Ginzburg Orbifold...,"It is observed that a large class of $(2,2)$ s...",,10
4,9509068,"Tae Seong Kim, Won Ho Kye, Jae Kwan Kim",The Dynamical Behaviors in (2+1)-Dimensional G...,We analyze (2+1)-dimensional Gross-Neveu model...,,19
...,...,...,...,...,...,...
29550,0210293,Dmitri Antonov,Finite-temperature properties of the supersymm...,The finite-temperature properties of supersymm...,,8
29551,9310065,Aurelian Isar,Wigner distribution function for the harmonic ...,Time evolution of the expectation values of va...,,17
29552,0103002,Kazuto Oshima,Critical Coupling in (1+1)-Dimensional Light-F...,Spontaneous symmetry breaking in (1+1)-dimensi...,,21
29553,9406137,"Cesar Gomez, Henri Ruegg, Philippe Zaugg",Lattice Poincare as a quantum deformed algebra,We propose a definition of a Poincar\'e algebr...,,10


In [55]:
df.to_csv("clear_citation_metadata.csv")

In [63]:
cit = pd.read_csv("clear_citation_metadata.csv", index_col=0)

In [64]:
cit

Unnamed: 0,paper_id,authors,title,abstract,subjects,comments
0,9810034,"Anastasia Doikou, Rafael I. Nepomechie",Parity and Charge Conjugation Symmetries and S...,We formulate the notion of parity for the peri...,"High Energy Physics - Theory, Exactly Solvable...",20
1,201111,G. Papadopoulos,KT and HKT Geometries in Strings and in Black ...,Some selected applications of KT and HKT geome...,"High Energy Physics - Theory, Differential Geo...",26
2,9210111,A. P. Balachandran,"Gauge Symmetries,Topology and Quantisation",The following two loosely connected sets of to...,,74
3,9303104,P. Berglund,Dimensionally Reduced Landau-Ginzburg Orbifold...,"It is observed that a large class of $(2,2)$ s...",,10
4,9509068,"Tae Seong Kim, Won Ho Kye, Jae Kwan Kim",The Dynamical Behaviors in (2+1)-Dimensional G...,We analyze (2+1)-dimensional Gross-Neveu model...,,19
...,...,...,...,...,...,...
29550,210293,Dmitri Antonov,Finite-temperature properties of the supersymm...,The finite-temperature properties of supersymm...,,8
29551,9310065,Aurelian Isar,Wigner distribution function for the harmonic ...,Time evolution of the expectation values of va...,,17
29552,103002,Kazuto Oshima,Critical Coupling in (1+1)-Dimensional Light-F...,Spontaneous symmetry breaking in (1+1)-dimensi...,,21
29553,9406137,"Cesar Gomez, Henri Ruegg, Philippe Zaugg",Lattice Poincare as a quantum deformed algebra,We propose a definition of a Poincar\'e algebr...,,10


---

# Testing

## Name Concatenations Testing

In [33]:
def test_preprocess_name_concatenations(names : List[str]) -> List[str]:
    new_names = []
    for name in names:
        while True:
            search = re.search(r'([a-z]|[A-Z])[A-Z]', name)
            if search:
                name = (name[0:search.start()+1] + " " + name[search.end()-1:])
            else:
                break
        new_names.append(name)
    return new_names

In [34]:
old_names = ["KarlLie", "KarlLieBern", "Karl Lie", "Karllie", "KL Bern"]
new_names = test_preprocess_name_concatenations(old_names)

In [35]:
print(old_names)
print(new_names)

['KarlLie', 'KarlLieBern', 'Karl Lie', 'Karllie', 'KL Bern']
['Karl Lie', 'Karl Lie Bern', 'Karl Lie', 'Karllie', 'K L Bern']


In [36]:
new_names

['Karl Lie', 'Karl Lie Bern', 'Karl Lie', 'Karllie', 'K L Bern']

## Name Initials Testing

In [37]:
def test_preprocess_name_initials(names : List[str]) -> List[str]:
    new_names = []
    for name in names:
        while True:
            search = re.search(r'[A-Z]\b(?!\.)', name)
            if search:
                name = name[0:search.start()+1] + "." + name[search.end():]
            else:
                break
        new_names.append(name)
    return new_names

In [38]:
old_names = ["M J Fox", "M. J. Fox", "Ni Yo G", "Viktor T Stanford"]
new_names = test_preprocess_name_initials(old_names)

In [39]:
print(old_names)
print(new_names)

['M J Fox', 'M. J. Fox', 'Ni Yo G', 'Viktor T Stanford']
['M. J. Fox', 'M. J. Fox', 'Ni Yo G.', 'Viktor T. Stanford']


In [40]:
new_names

['M. J. Fox', 'M. J. Fox', 'Ni Yo G.', 'Viktor T. Stanford']

## File Opening testing

In [None]:
test_file_opening("9912045.abs")

In [None]:
test_file_opening("9907137.abs")

In [None]:
test_file_opening()

## Preprocess Author Name Pipeline

In [44]:
preprocess_author_name_pipeline("""A. Rida (1), T. Sami (2) ((1) USTHB, Alger Algerie, (2) SUBATECH,  Nantes France)""")

'A. Rida , , T. Sami , , USTHB., Alger Algerie, , SUBATECH., Nantes France'

## Find articles

In [None]:
# Find article by author
author = "SUBATECH."
for article in articles:
    if author in article["authors"]:
        print(article)

In [None]:
# Is there an article with subjects?
for article in articles:
    if article["subjects"]:
        print(article)

In [47]:
# Find article with this subject
chosen_subject = "Statistical Mechanics  Paper"
for article in articles:
    for subject in article["subjects"]:
        if chosen_subject in subject:
            print(article["subjects"])

In [None]:
# Let's get all subjects
subject_set = set()
for article in articles:
    if article["subjects"]:
        subject_set.update(article["subjects"])
subject_set

## Testing file meta processing

In [49]:
testing_article = "9907137"
file_content = None
for fn, fc in files.items():
    if fn == testing_article:
        file_content = fc
        break

In [50]:
from typing import List
def file_meta_generation(file_content: str) -> List[str]:
    file_meta = []
    continous_line = True
    file_meta_lines = list(filter(lambda x: x, file_content.split("\n")))
    file_meta.append(file_meta_lines[0])
    for file_meta_line_index in range(1, len(file_meta_lines)):
        meta_line = file_meta_lines[file_meta_line_index]
        if (meta_line.startswith("Author")
            or meta_line.startswith("Title")
            or meta_line.startswith("Subj-class")
            or meta_line.startswith("Comments")
            ):
            file_meta.append(meta_line)
            continous_line = True
        elif meta_line.startswith(r"  ") and continous_line:
            file_meta[-1] = file_meta[-1] + meta_line
        else:
            continous_line = False
    return file_meta

In [51]:
# print(file_content)
splitted_file_content = file_content.split(r"\\")
# print(splitted_file_content)
# broken_file_meta = list(filter(lambda x: x, splitted_file_content[1].split("\n")))
# print(broken_file_meta)
fixed_file_meta = file_meta_generation(splitted_file_content[1])
print(fixed_file_meta)

['Paper: hep-th/9907137', 'Title: The non chiral fusion rules in rational conformal field theories', 'Authors: A. Rida (1), T. Sami (2) ((1) USTHB, Alger Algerie, (2) SUBATECH,  Nantes France)', 'Comments: 11 pages, corrected and completed version']
