In [8]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, URL, text
from tqdm import tqdm
from tqdm.auto import tqdm
tqdm.pandas()
import os.path as osp
import ast

# Preprocess Cleantech Data

In [2]:
df_cleantech = pd.read_csv('/mnt/hdd02/Projekt_EDV_TEK/edv_tek_all_cleantech_appln_ids.csv')

In [None]:
url_patstat_object = URL.create(
    drivername="",
    username="",
    password="",
    host="",
    port="",
    database=""
)
patstat_engine = create_engine(url_patstat_object)

In [5]:
# Write df_cleantech to a temporary table in the database
df_cleantech.to_sql('temp_cleantech', patstat_engine, if_exists='replace', index=False)

208

In [6]:
# Perform a join between the temporary table and tls211_pat_publn to extract publn_auth and publn_nr.
query = """
    SELECT 
        t.appln_id, 
        p.publn_auth, 
        p.publn_nr
    FROM 
        temp_cleantech t
    JOIN 
        tls211_pat_publn p 
        ON t.appln_id::text = p.appln_id::text
"""

df_cleantech_publn = pd.read_sql_query(text(query), patstat_engine)

In [7]:
# Merge the two dataframes
df_cleantech = df_cleantech.merge(df_cleantech_publn, on='appln_id', how='left')

In [8]:
df_cleantech['publn_auth'].value_counts()

publn_auth
US    2975821
EP     915352
Name: count, dtype: int64

In [9]:
df_cleantech.to_csv('/mnt/hdd02/Projekt_EDV_TEK/edv_tek_all_cleantech_appln_ids_publn_nr.csv', index=False)

# OpenAlex Works

## Preprocessing

In [37]:
df_rel_pcs = pd.read_csv("/mnt/hdd02/Projekt_EDV_TEK/Reliance_on_Science/_pcs_oa.csv")
df_rel_ppp = pd.read_csv("/mnt/hdd02/Projekt_EDV_TEK/Reliance_on_Science/_patent_paper_pairs.csv")  # ONLY US patents in the original dataset

In [38]:
df_rel_pcs = df_rel_pcs.astype(str)
df_rel_pcs['patent'] = df_rel_pcs['patent'].apply(lambda x: x.lower())
df_rel_pcs[['publn_auth', 'publn_nr', 'appln_kind']] = df_rel_pcs['patent'].str.split('-', n=2, expand=True)
df_rel_pcs['appln_kind'] = df_rel_pcs['appln_kind'].str.replace(' ', '')

In [39]:
df_rel_pcs['publn_auth'].value_counts()

publn_auth
us    34399567
ep     5671352
wo     4077885
cn     2157832
jp      354281
        ...   
bg          27
sk          11
cu           6
si           1
ec           1
Name: count, Length: 64, dtype: int64

In [40]:
df_rel_ppp = df_rel_ppp.astype(str)
df_rel_ppp['patent'] = df_rel_ppp['patent'].apply(lambda x: x.lower())
splits = df_rel_ppp['patent'].str.split('-', n=2, expand=True)
if splits.shape[1] < 3:
    splits = splits.reindex(columns=range(3))
splits.columns = ['publn_auth', 'publn_nr', 'appln_kind']
df_rel_ppp = pd.concat([df_rel_ppp, splits], axis=1)

In [14]:
df_rel_ppp['publn_auth'].value_counts()

publn_auth
us    548315
Name: count, dtype: int64

**As there are only US patents in the Patent-Paper-Pair dataset, we will focus from here on only on US Patents**

In [41]:
# Filter df_rel_pcs to only include patents where the publication authority is US
df_rel_pcs = df_rel_pcs[df_rel_pcs['publn_auth'] == 'us']

In [42]:
# Filter df_cleantech to only include patents where the publication authority is US
df_cleantech = df_cleantech[df_cleantech['publn_auth'] == 'US']

In [21]:
df_rel_pcs.head()

Unnamed: 0,reftype,confscore,oaid,patent,uspto,wherefound,self,publn_auth,publn_nr,appln_kind,patent_paper_pair
0,app,10,W1552,us-11426570-b2,1,frontonly,notself,us,11426570,b2,0
1,app,10,W1552,us-11666239-b2,1,frontonly,notself,us,11666239,b2,0
2,app,10,W1552,us-11678989-b2,1,frontonly,notself,us,11678989,b2,0
3,app,10,W1552,us-11745001-b2,1,frontonly,notself,us,11745001,b2,0
4,app,10,W1552,us-11826495-b2,1,frontonly,notself,us,11826495,b2,0


In [20]:
df_rel_ppp.head()

Unnamed: 0,oaid,patent,ppp_score,daysdiffcont,all_patents_for_the_same_paper,publn_auth,publn_nr,appln_kind,patent_paper_pair
0,W2025049717,us-10000036,1,-1360,,us,10000036,,1
1,W4234301399,us-10000103,2,-342,,us,10000103,,1
2,W2731860728,us-10000103,2,-761,,us,10000103,,1
3,W2346689143,us-10000103,2,-453,,us,10000103,,1
4,W2624698886,us-10000305,2,-1859,,us,10000305,,1


In [43]:
df_rel_pcs['oaid'] = 'W' + df_rel_pcs['oaid']
df_rel_ppp = df_rel_ppp.rename(columns={'paperid': 'oaid'})
df_rel_pcs['patent_paper_pair'] = 0
df_rel_ppp['patent_paper_pair'] = 1
df_rel_pcs = df_rel_pcs.astype(str)
df_rel_ppp = df_rel_ppp.astype(str)

df_rel = pd.concat([df_rel_pcs, df_rel_ppp], ignore_index=True)
df_rel = df_rel.reset_index(drop=True)

In [44]:
df_rel.head()

Unnamed: 0,reftype,confscore,oaid,patent,uspto,wherefound,self,publn_auth,publn_nr,appln_kind,patent_paper_pair,ppp_score,daysdiffcont,all_patents_for_the_same_paper
0,app,10,W1552,us-11426570-b2,1,frontonly,notself,us,11426570,b2,0,,,
1,app,10,W1552,us-11666239-b2,1,frontonly,notself,us,11666239,b2,0,,,
2,app,10,W1552,us-11678989-b2,1,frontonly,notself,us,11678989,b2,0,,,
3,app,10,W1552,us-11745001-b2,1,frontonly,notself,us,11745001,b2,0,,,
4,app,10,W1552,us-11826495-b2,1,frontonly,notself,us,11826495,b2,0,,,


In [45]:
# Filter df_rel to only include publn_nr which are in df_cleantech
df_rel = df_rel[df_rel['publn_nr'].isin(df_cleantech['publn_nr'])]

In [46]:
df_rel_oaid = df_rel[['oaid']]
df_rel_oaid = df_rel_oaid.drop_duplicates()

### Save some preliminary data for Deep Learning for Social Analytics Course

In [None]:
# df_rel_ppp_course = df_rel_ppp.loc[:, ['patent_id', 'oaid']]
# df_rel_ppp_course.to_csv("/home/thiesen/Documents/Projekt_EDV-TEK/Deep Learning for Social Analytics - Project Data/patent_paper_pairs.csv", index=False)

In [None]:
# df_rel_pcs_course = df_rel_pcs.loc[:, ['patent_id', 'oaid']]
# df_rel_pcs_course.to_csv("/home/thiesen/Documents/Projekt_EDV-TEK/Deep Learning for Social Analytics - Project Data/patent_paper_citations.csv", index=False)

### Extract Works from Postgres OpenAlex

In [None]:
url_openalex_object = URL.create(
    drivername="",
    username="",
    password="",
    host="",
    port="",
    database=""
)
engine_openalex = create_engine(url_openalex_object)

In [26]:
with engine_openalex.begin() as connection:
    connection.execute(text("""
        CREATE TEMPORARY TABLE temp_oaid (
            oaid VARCHAR PRIMARY KEY
        )
    """))
    oaid_prefixed = ['https://openalex.org/' + str(oaid) for oaid in df_rel_oaid['oaid']]
    for oaid in tqdm(oaid_prefixed):
        connection.execute(text("INSERT INTO temp_oaid (oaid) VALUES (:oaid)"), {'oaid': oaid})

  0%|          | 0/1696718 [00:00<?, ?it/s]

In [27]:
df_rel_postgres = pd.read_sql_query("""
    SELECT w.id, w.title, w.abstract
    FROM openalex.works AS w
    JOIN temp_oaid AS t ON w.id = t.oaid
""", con=engine_openalex)
df_rel_postgres = df_rel_postgres.drop_duplicates(subset=['id'])

In [29]:
df_rel_postgres['oaid'] = df_rel_postgres['id'].apply(lambda x: x.replace('https://openalex.org/', ''))

In [49]:
df_rel = df_rel.merge(df_rel_postgres, on='oaid', how='left')

In [50]:
# Count number of rows in df_rel where title is NaN
df_rel['title'].isna().sum()

136827

In [51]:
df_rel_postgres.to_csv("/mnt/hdd02/Projekt_EDV_TEK/edv_tek_all_cleantech_openalex_title_abstract.csv", index=False)

In [None]:
# df_rel_postgres_course = df_rel_postgres.loc[:, ['oaid', 'title', 'abstract']]
# df_rel_postgres_course.to_csv("/home/thiesen/Documents/Projekt_EDV-TEK/Deep Learning for Social Analytics - Project Data/paper.csv", index=False)

### Extract Authors from Postgres OpenAlex

In [52]:
df_rel_authors_postgres = pd.read_sql_query("""
    SELECT a.work_id, a.author_id
    FROM openalex.works_authorships as a
    JOIN temp_oaid AS t ON a.work_id = t.oaid
""", con=engine_openalex)
df_rel_authors_postgres = df_rel_authors_postgres.drop_duplicates(subset=['author_id', 'work_id'])

In [53]:
df_rel_authors_postgres_grouped = df_rel_authors_postgres.groupby('author_id')['work_id'].apply(list).reset_index()

In [54]:
with engine_openalex.begin() as connection:
    connection.execute(text("""
        CREATE TEMPORARY TABLE temp_author_id (
            author_id VARCHAR PRIMARY KEY
        )
    """))
    for author_id in tqdm(df_rel_authors_postgres_grouped['author_id']):
        connection.execute(text("INSERT INTO temp_author_id (author_id) VALUES (:author_id)"), {'author_id': author_id})

  0%|          | 0/2642635 [00:00<?, ?it/s]

In [55]:
df_rel_authors_info_postgres = pd.read_sql_query("""
    SELECT a.id, a.display_name, a.display_name_alternatives
    FROM openalex.authors as a
    JOIN temp_author_id AS t ON a.id = t.author_id
""", con=engine_openalex)
df_rel_authors_info_postgres = df_rel_authors_info_postgres.drop_duplicates(subset=['display_name'])

In [56]:
df_rel_authors_complete = pd.merge(df_rel_authors_postgres_grouped, df_rel_authors_info_postgres, left_on='author_id', right_on='id', how='inner')

In [59]:
df_rel_authors_complete['oaid'] = df_rel_authors_complete['work_id'].apply(lambda x: [i.replace("https://openalex.org/W", "") for i in x])

In [61]:
df_rel_authors_complete.to_csv("/mnt/hdd02/Projekt_EDV_TEK/edv_tek_all_cleantech_openalex_authors.csv", index=False)

### Extract Paper Citations from Postgres OpenAlex

In [63]:
df_rel_citations_postgres = pd.read_sql_query("""
    SELECT w.work_id, w.referenced_work_id
    FROM openalex.works_referenced_works as w
    JOIN temp_oaid AS t1 ON w.work_id = t1.oaid
    JOIN temp_oaid AS t2 ON w.referenced_work_id = t2.oaid
""", con=engine_openalex)
df_rel_citations_postgres = df_rel_citations_postgres.drop_duplicates(subset=['work_id', 'referenced_work_id'])

In [68]:
df_rel_citations_postgres.to_csv("/mnt/hdd02/Projekt_EDV_TEK/edv_tek_all_cleantech_openalex_citations.csv", index=False)

In [67]:
df_rel_citations_postgres['oaid'] = df_rel_citations_postgres['work_id'].str.replace("https://openalex.org/W", "")
df_rel_citations_postgres['cited_oaid'] = df_rel_citations_postgres['referenced_work_id'].str.replace("https://openalex.org/W", "")

In [None]:
# df_rel_citations_postgres_course = df_rel_citations_postgres.loc[:, ['oaid', 'cited_oaid']]
# df_rel_citations_postgres_course.to_csv("/home/thiesen/Documents/Projekt_EDV-TEK/Deep Learning for Social Analytics - Project Data/paper_paper_citations.csv", index=False)