In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp

Loading all the datasets

In [67]:
# Read the labelled papers and keep one (paperid, label) row per paper.
# drop_duplicates keeps the first row seen for each paperid.
labels_df = (
    pd.read_csv(r"A:\DLSA\Project_work\Datasets\negative_sampling1.csv")
    .loc[:, ["paperid", "label"]]
    .drop_duplicates(subset="paperid")
)

In [68]:
# Show how many papers fall under each label value after de-duplicating on paperid.
print(labels_df["label"].value_counts())

label
0    568954
1    190416
Name: count, dtype: int64


In [69]:
# Load the cleaned works table. The first CSV column is discarded
# (presumably a saved row index -- TODO confirm against the file).
works = pd.read_csv("A:/DLSA/Project_work/Datasets/clean_works1.csv", low_memory=False)
works = (
    works.iloc[:, 1:]
    .drop_duplicates(subset="paperid")
    # Keep only the works that have a label.
    .loc[lambda df: df["paperid"].isin(labels_df["paperid"])]
    .copy()
)

For every dataset, we keep only the rows whose paperid is already present in labels_df, so that each table is restricted to the labelled papers.
This leaves a clean dataset that contains only the labelled papers (both the relevant ones and the unknown ones).

In [70]:
# Start from the per-paper metadata columns used for feature engineering.
paper_features = (
    works[["paperid", "publication_year", "language", "type", "title", "abstract"]]
    .copy()
    .assign(
        # Missing text becomes "" so the word-count features below evaluate to 0.
        title=lambda df: df["title"].fillna(""),
        abstract=lambda df: df["abstract"].fillna(""),
        # Unparseable years become NaN and are removed by the range filter below.
        publication_year=lambda df: pd.to_numeric(df["publication_year"], errors="coerce"),
    )
)

# Restrict the study to papers published in 1990-2015 (both bounds inclusive)
# and propagate the same restriction to the label and works tables.
paper_features = paper_features[paper_features["publication_year"].between(1990, 2015)].copy()
allowed_paperids = set(paper_features["paperid"])
labels_df = labels_df.loc[labels_df["paperid"].isin(allowed_paperids)].copy()
works = works.loc[works["paperid"].isin(allowed_paperids)].copy()

# Language flag and text-length features; the raw language column is not kept.
paper_features = (
    paper_features
    .assign(
        # 1 when the normalised language code is exactly "en", else 0.
        is_eng=lambda df: (
            df["language"].fillna("unknown").astype(str).str.lower().str.strip().eq("en").astype(np.int8)
        ),
        # Lengths are whitespace-separated token counts.
        abstract_len=lambda df: df["abstract"].astype(str).str.split().str.len(),
        title_len=lambda df: df["title"].astype(str).str.split().str.len(),
    )
    .drop(columns=["language"])
)

# Group the publication types into three indicator features.
# A missing type is treated as "other", which belongs to the secondary group.
core_research = {"article", "review", "book-chapter"}
sec_research = {"preprint", "letter", "dissertation", "other", "book", "dataset", "report"}
low_novelty = {
    "paratext", "editorial", "erratum", "reference-entry",
    "libguides", "peer-review", "retraction", "supplementary-materials",
}
paper_features = (
    paper_features
    .assign(type=lambda df: df["type"].fillna("other").str.lower())
    .assign(
        core_research=lambda df: df["type"].isin(core_research).astype(np.int8),
        sec_research=lambda df: df["type"].isin(sec_research).astype(np.int8),
        low_novelty=lambda df: df["type"].isin(low_novelty).astype(np.int8),
    )
    .drop(columns=["type"])
)

In [71]:
# Load the work/author/institution links.
works_authorships = pd.read_csv("A:/DLSA/Project_work/Datasets1/works_authorships.csv", low_memory=False)

# Each id is stored with a prefix ending in ".org/"; keep only the part after it.
# `paperid` is derived from `work_id` so this table can be joined to the others.
works_authorships["paperid"] = works_authorships["work_id"].str.split(".org/").str.get(1)
works_authorships = works_authorships.assign(
    **{
        id_column: works_authorships[id_column].str.split(".org/").str.get(1)
        for id_column in ("author_id", "institution_id", "work_id")
    }
)

# Keep only authorships of the papers that survived the year filter.
works_authorships = works_authorships.loc[works_authorships["paperid"].isin(allowed_paperids)].copy()

In [72]:
# Load the per-author yearly statistics, strip the id prefix and coerce the
# numeric columns (values that cannot be parsed become NaN).
authors_counts = pd.read_csv("A:/DLSA/Project_work/Datasets1/authors_counts_by_year.csv")
authors_counts = authors_counts.assign(
    author_id=lambda df: df["author_id"].str.split(".org/").str.get(1),
    year=lambda df: pd.to_numeric(df["year"], errors="coerce"),
    works_count=lambda df: pd.to_numeric(df["works_count"], errors="coerce"),
    cited_by_count=lambda df: pd.to_numeric(df["cited_by_count"], errors="coerce"),
)

In [73]:
# Attach each paper's publication year to its authorship rows.
paper_year = paper_features[["paperid", "publication_year"]].assign(
    publication_year=lambda df: pd.to_numeric(df["publication_year"], errors="coerce")
)
pa = (
    pd.merge(works_authorships, paper_year, on="paperid", how="left")
    # -1 marks authorships whose paper has no usable year.
    .assign(publication_year=lambda df: df["publication_year"].fillna(-1).astype(int))
)

In [74]:
# Keep only the columns needed to join papers to their authors' yearly statistics.
pa = pa.loc[:, ["paperid", "author_id", "publication_year"]].copy()
authors_counts = authors_counts.loc[:, ["author_id", "year", "works_count", "cited_by_count"]].copy()

# Restrict the yearly statistics to 1990-2015 (inclusive) and to authors present in `pa`.
authors_counts = authors_counts[authors_counts["year"].between(1990, 2015)].copy()
needed_authors = pa["author_id"].dropna().unique()
authors_counts = authors_counts[authors_counts["author_id"].isin(needed_authors)].copy()

# Narrow the dtypes before the join (presumably to reduce memory -- TODO confirm);
# errors="ignore" leaves a column unchanged if the cast fails.
pa["publication_year"] = pa["publication_year"].astype(np.int16, errors="ignore")
authors_counts["year"] = authors_counts["year"].astype(np.int16, errors="ignore")
authors_counts = authors_counts.assign(
    works_count=lambda df: pd.to_numeric(df["works_count"], errors="coerce").fillna(0).astype(np.float32),
    cited_by_count=lambda df: pd.to_numeric(df["cited_by_count"], errors="coerce").fillna(0).astype(np.float32),
)

# One row per (paper, author, statistics year); authorships without any
# yearly record are dropped.
pa_ay = (
    pd.merge(pa, authors_counts, on="author_id", how="left")
    .dropna(subset=["year"])
    .astype({"year": int})
)

# Keep only statistics from years strictly before the paper's publication year,
# so that author features do not use information from the publication year or later.
pa_ay_hist = pa_ay[pa_ay["publication_year"] > pa_ay["year"]].copy()

In [75]:
# For every (paper, author) pair, average the author's pre-publication yearly
# citation and works counts.
author_hist_per_paper_author = pa_ay_hist.groupby(["paperid", "author_id"], as_index=False).agg(
    author_citations_asof=("cited_by_count", "mean"),
    author_productivity_asof=("works_count", "mean"),
)

In [76]:
# Paper-level author features.
#
# The history averages come from `author_hist_per_paper_author`, which only
# contains authors with at least one yearly record before the publication year.
# Counting authors from that table under-reports `num_authors` (and gives 0 for
# papers whose authors have no prior history), so the author count is taken
# from the full authorship table `pa` instead.
author_history_features = author_hist_per_paper_author.groupby("paperid", as_index=False).agg(
    avg_author_citations=("author_citations_asof", "mean"),
    avg_author_productivity=("author_productivity_asof", "mean"),
)

# nunique ignores missing author ids, so a paper with only missing ids gets 0.
paper_author_counts = (
    pa.groupby("paperid")["author_id"]
    .nunique()
    .rename("num_authors")
    .reset_index()
)

# Left join keeps every paper that has authorship rows; papers without author
# history get NaN averages here (downstream numeric NaNs are filled with 0).
# Column order stays: paperid, num_authors, avg_author_citations, avg_author_productivity.
author_features = paper_author_counts.merge(author_history_features, on="paperid", how="left")

In [77]:
# Load the institutions table, strip the id prefix and expose the display name
# under `institutions_name` (missing names become "").
institutions = pd.read_csv("A:/DLSA/Project_work/Datasets1/institutions.csv")
institutions = institutions.assign(
    id=lambda df: df["id"].str.split(".org/").str.get(1),
    institutions_name=lambda df: df["display_name"].fillna(""),
)

In [78]:
# Slim institutions table with numeric counts (unparseable or missing values become 0).
inst_clean = institutions[["id", "institutions_name", "works_count", "cited_by_count"]].assign(
    works_count=lambda df: pd.to_numeric(df["works_count"], errors="coerce").fillna(0),
    cited_by_count=lambda df: pd.to_numeric(df["cited_by_count"], errors="coerce").fillna(0),
)

# One row per distinct (paper, institution) link, enriched with institution attributes.
paper_inst = pd.merge(
    works_authorships[["paperid", "institution_id"]].dropna().drop_duplicates(),
    inst_clean,
    left_on="institution_id",
    right_on="id",
    how="left",
)

In [79]:
# Links whose institution id had no match in `inst_clean` get an empty name
# and zero counts; the join key `id` is no longer needed afterwards.
paper_inst = paper_inst.fillna(
    {"institutions_name": "", "works_count": 0, "cited_by_count": 0}
).drop(columns=["id"])
#inst_name_df = (paper_inst[["paperid", "institutions_name"]].dropna().drop_duplicates().groupby("paperid").first().reset_index())

In [80]:
# Pick the `tradeoff` institutions with the most distinct papers, using only
# papers published up to `training_years` so later years do not influence the
# choice.
training_years = 2010  # last publication year used to rank institutions
tradeoff = 15          # number of institutions treated as "top"

paper_inst_year = paper_inst.merge(
    paper_features[["paperid", "publication_year"]], on="paperid", how="left"
)

top_institutions = (
    paper_inst_year
    .loc[lambda df: df["publication_year"] <= training_years]
    # Missing institution names are stored as "". Without this filter the
    # empty name forms its own group and can rank among the top institutions,
    # which would flag papers with an unknown institution as "top".
    .loc[lambda df: df["institutions_name"].fillna("").astype(str).str.strip() != ""]
    .groupby("institutions_name")["paperid"]
    .nunique()
    .sort_values(ascending=False)
    .head(tradeoff)
    .index
)

In [81]:
# Mark each (paper, institution) link whose institution is in the top list,
# then collapse to one flag per paper: 1 if any of its institutions is top.
paper_inst = paper_inst.assign(
    is_top_institution=lambda df: df["institutions_name"].isin(top_institutions).astype(np.int8)
)
top_inst_flag = paper_inst.groupby("paperid", as_index=False)["is_top_institution"].max()

In [82]:
#inst_name_df = (paper_inst[["paperid", "institutions_name"]].dropna().drop_duplicates().groupby("paperid").first().reset_index())

In [83]:
# Assemble the per-paper feature table. Left joins keep every paper in
# `paper_features`, even without author or institution information.
feature_df = pd.merge(paper_features, author_features, on="paperid", how="left")
feature_df = pd.merge(feature_df, top_inst_flag, on="paperid", how="left")

In [85]:
# Replace the gaps left by the left joins: empty strings for text columns,
# 0 for every numeric column.
feature_df = feature_df.fillna({"title": "", "abstract": ""})
num_cols = feature_df.select_dtypes(include="number").columns
feature_df = feature_df.fillna({numeric_col: 0 for numeric_col in num_cols})
# The join may have upcast the flag to float; store it as a compact integer again.
feature_df = feature_df.assign(
    is_top_institution=lambda df: df["is_top_institution"].fillna(0).astype(np.int8)
)

In [86]:
# Join the labels with the features; every labelled paper is kept and
# missing text is stored as an empty string.
final_df = pd.merge(labels_df, feature_df, on="paperid", how="left").fillna(
    {"title": "", "abstract": ""}
)

In [87]:
# Persist the merged label + feature table; index=False leaves the row index out of the CSV.
final_df.to_csv("A:/DLSA/Project_work/Datasets/forecasting_dataset3.csv",index=False)

In [88]:
# Summarise the exported frame: row count, column dtypes and non-null counts.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 464672 entries, 0 to 464671
Data columns (total 15 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   paperid                  464672 non-null  object 
 1   label                    464672 non-null  int64  
 2   publication_year         464672 non-null  int64  
 3   title                    464672 non-null  object 
 4   abstract                 464672 non-null  object 
 5   is_eng                   464672 non-null  int8   
 6   abstract_len             464672 non-null  int64  
 7   title_len                464672 non-null  int64  
 8   core_research            464672 non-null  int8   
 9   sec_research             464672 non-null  int8   
 10  low_novelty              464672 non-null  int8   
 11  num_authors              464672 non-null  float64
 12  avg_author_citations     464672 non-null  float32
 13  avg_author_productivity  464672 non-null  float32
 14  is_t