In [1]:
# Importing the patstat client
from epo.tipdata.patstat import PatstatClient

# Initialize the PATSTAT client
patstat = PatstatClient(env='PROD')

# Access ORM
db = patstat.orm()

# Importing tables as models
from epo.tipdata.patstat.database.models import TLS201_APPLN,TLS203_APPLN_ABSTR, TLS202_APPLN_TITLE, TLS209_APPLN_IPC, TLS224_APPLN_CPC, TLS206_PERSON, TLS207_PERS_APPLN

In [2]:
from sqlalchemy import and_, or_
# Keywords for the abstract and title
keywords = [
    "rare earth element*", "light REE*", "heavy REE*", "rare earth metal*",
    "rare earth oxide*", "lanthan*", "rare earth"
]
# Step 1: Retrieve distinct docdb_family_id values corresponding to keywords
subquery_keywords = (
    db.query(TLS201_APPLN.docdb_family_id)
    .join(TLS203_APPLN_ABSTR, TLS203_APPLN_ABSTR.appln_id == TLS201_APPLN.appln_id)
    .filter(or_(*[TLS203_APPLN_ABSTR.appln_abstract.contains(kw) for kw in keywords]))
    .union(
        db.query(TLS201_APPLN.docdb_family_id)
        .join(TLS202_APPLN_TITLE, TLS202_APPLN_TITLE.appln_id == TLS201_APPLN.appln_id)
        .filter(or_(*[TLS202_APPLN_TITLE.appln_title.contains(kw) for kw in keywords]))
    ).distinct()
).all()

# Convert result to a list of docdb_family_id values
docdb_family_ids_keywords = [row.docdb_family_id for row in subquery_keywords]
# Print the number of distinct docdb_family_id values
print(f"Number of distinct docdb_family_id values: {len(docdb_family_ids_keywords)}")

Number of distinct docdb_family_id values: 84905


In [3]:
from sqlalchemy import func

# Classification codes with varying lengths

ipc_codes_11 = [
'A43B   1/12','B03B   9/06','B29B   7/66','B30B   9/32','B65D  65/46','C03B   1/02',
'C04B   7/24','C04B   7/26','C04B   7/28','C04B   7/30','C04B  11/26','C04B  18/04','C04B  18/06','C04B  18/08','C04B  18/10',
'C04B  18/12','C04B  18/14','C04B  18/16','C04B  18/18','C04B  18/20','C04B  18/22','C04B  18/24','C04B  18/26','C04B  18/28',
'C04B  18/30','C09K  11/01','C22B  19/28','C22B  19/30','C22B  25/06','D21B   1/08','D21B   1/10','D21B   1/32','D21C   5/02',
'D21H  17/01','H01B  15/00','H01J   9/52','H01M   6/52','H01M  10/54']

ipc_codes_8 = [
'B22F   8','B29B  17','B62D  67','B65H  73',
'C08J  11','C10M 175','C22B   7','D01G  11']

ipc_codes_12 = [
'C04B  33/132']

cpc_codes_11 = ['A43B   1/12','B03B   9/06','B29B   7/66','B30B   9/32','B65D  65/46','C03B   1/02',
'C04B   7/24','C04B   7/26','C04B   7/28','C04B   7/30','C04B  11/26','C04B  18/04','C04B  18/06','C04B  18/08','C04B  18/10',
'C04B  18/12','C04B  18/14','C04B  18/16','C04B  18/18','C04B  18/20','C04B  18/22','C04B  18/24','C04B  18/26','C04B  18/28',
'C04B  18/30','C09K  11/01','C22B  19/28','C22B  19/30','C22B  25/06','D21B   1/08','D21B   1/10','D21B   1/32','D21C   5/02',
'D21H  17/01','H01B  15/00','H01J   9/52','H01M   6/52','H01M  10/54','Y02W  30/50','Y02W  30/52','Y02W  30/56','Y02W  30/58',
'Y02W  30/60','Y02W  30/62','Y02W  30/64','Y02W  30/66','Y02W  30/74','Y02W  30/78','Y02W  30/80','Y02W  30/82','Y02W  30/84',
'Y02W  30/91','Y02P  10/20']

cpc_codes_8 = ['B22F   8','B29B  17','B62D  67','B65H  73',
'C08J  11','C10M 175','C22B   7','D01G  11']

cpc_codes_12 = ['C04B  18/068','C04B  33/132',
'C04B   7/243','C04B   7/246','C04B  18/049','C04B  18/061','C04B  18/062','C04B  18/064','C04B  18/065',
'C04B  18/067','C04B  18/081','C04B  18/082','C04B  18/084','C04B  18/085','C04B  18/087','C04B  18/088',
'C04B  18/101','C04B  18/103','C04B  18/105','C04B  18/106','C04B  18/108','C04B  18/125',
'C04B  18/141','C04B  18/142','C04B  18/143','C04B  18/144','C04B  18/145','C04B  18/146','C04B  18/147',
'C04B  18/148','C04B  18/149','C04B  18/162','C04B  18/165','C04B  18/167','C04B  18/241','C04B  18/243',
'C04B  18/245','C04B  18/246','C04B  18/248','C04B  18/265','C04B  18/305']


cpc_codes_13 = ['C04B  18/0409','C04B  18/0418',
'C04B  18/0427','C04B  18/0436','C04B  18/0445','C04B  18/0454','C04B  18/0463','C04B  18/0472','C04B  18/0481']





# Subquery for classification codes with varying lengths
subquery_classcodes = (
    db.query(TLS201_APPLN.docdb_family_id)
    .join(TLS209_APPLN_IPC, TLS209_APPLN_IPC.appln_id == TLS201_APPLN.appln_id)
    .filter(
        or_(
            func.substr(TLS209_APPLN_IPC.ipc_class_symbol, 1, 11).in_(ipc_codes_11),
            func.substr(TLS209_APPLN_IPC.ipc_class_symbol, 1, 8).in_(ipc_codes_8),
            func.substr(TLS209_APPLN_IPC.ipc_class_symbol, 1, 12).in_(ipc_codes_12)
        )
    )
    .union(
        db.query(TLS201_APPLN.docdb_family_id)
        .join(TLS224_APPLN_CPC, TLS224_APPLN_CPC.appln_id == TLS201_APPLN.appln_id)
        .filter(
            or_(
                func.substr(TLS224_APPLN_CPC.cpc_class_symbol, 1, 11).in_(cpc_codes_11),
                func.substr(TLS224_APPLN_CPC.cpc_class_symbol, 1, 8).in_(cpc_codes_8),
                func.substr(TLS224_APPLN_CPC.cpc_class_symbol, 1, 12).in_(cpc_codes_12),
                func.substr(TLS224_APPLN_CPC.cpc_class_symbol, 1, 12).in_(cpc_codes_13)
            )
        )
    ).distinct()
).all()

# Convert result to a list of docdb_family_id values
docdb_family_ids_classcodes = [row.docdb_family_id for row in subquery_classcodes]

# Print the number of results
print(f"Number of results in Step 2: {len(docdb_family_ids_classcodes)}")

Number of results in Step 2: 567012


In [4]:
# Step 3: Get the intersection of the two lists
intersection_docdb_family_ids = list(set(docdb_family_ids_keywords) & set(docdb_family_ids_classcodes))

In [5]:
# Step 4: Filter the data according to the timeframe 2010 - 2022
final_query = (
    db.query(func.count(func.distinct(TLS201_APPLN.docdb_family_id)).label('Famiglie'))
    .filter(TLS201_APPLN.docdb_family_id.in_(intersection_docdb_family_ids))
    .filter(TLS201_APPLN.earliest_filing_year.between(2010, 2022))
)

# Execute the query
result = final_query.all()
print(result)

[(4313,)]


In [6]:
import pandas as pd
query = db.query(
    TLS206_PERSON.psn_name,
    func.count(func.distinct(TLS201_APPLN.docdb_family_id)).label('distinct_patent_families')
).join(
    TLS207_PERS_APPLN, TLS206_PERSON.person_id == TLS207_PERS_APPLN.person_id
).join(
    TLS201_APPLN, TLS207_PERS_APPLN.appln_id == TLS201_APPLN.appln_id
).filter(
    TLS207_PERS_APPLN.applt_seq_nr != 0,
    TLS201_APPLN.docdb_family_id.in_(intersection_docdb_family_ids),
    TLS201_APPLN.earliest_filing_year.between(2010, 2022)
).group_by(
    TLS206_PERSON.psn_name
).order_by(
    func.count(func.distinct(TLS201_APPLN.docdb_family_id)).desc()
).all()

# Print the results
#for result in query:
#   print(f"Applicant: {result.psn_name}, Distinct Patent Families: {result.distinct_patent_families}")

# Convert the results to a DataFrame and display it
df = pd.DataFrame(query, columns=['Applicant', 'Distinct Patent Families'])
print(df)

                                           Applicant  Distinct Patent Families
0       JIANGXI UNIVERSITY OF SCIENCE AND TECHNOLOGY                       179
1                        CHINESE ACADEMY OF SCIENCES                       103
2                BAOTOU IRON & STEEL (GROUP) COMPANY                        68
3           BAOTOU RESEARCH INSTITUTE OF RARE EARTHS                        51
4                            NORTHEASTERN UNIVERSITY                        47
...                                              ...                       ...
2450                          CENTRAL AMERICA NICKEL                         1
2451  ZIBO CENTER FOR DISEASE CONTROL AND PREVENTION                         1
2452                            TUDOR, CATALINA OANA                         1
2453                   JIANGSU BEINAI ALLOYS COMPANY                         1
2454                                     ZHONG JIN'E                         1

[2455 rows x 2 columns]


In [7]:
import plotly.express as px


# Segue la versione più bella che mi permette di visualizzare sia la lista di dati che il grafico scatter con i più significativi

In [8]:

# Filter out applicants with only 1 or 2 patent families
df_filtered = df[df['Distinct Patent Families'] > 2]

# Create a scatter plot
fig = px.scatter(df_filtered, x='Distinct Patent Families', y='Applicant', title='Applicants by Distinct Patent Families', hover_data=['Applicant'], color='Distinct Patent Families')

# Hide the y-axis labels
fig.update_layout(yaxis=dict(showticklabels=False))

# Save the plot as an HTML file
fig_html = fig.to_html(full_html=False)

# Display the DataFrame in a scrollable format
html_table = df.to_html(classes='table table-striped', index=False)

html_content = f"""
<!DOCTYPE html>
<html>
<head>
    <title>Applicants by Distinct Patent Families</title>
    <style>
        .scrollable-table {{
            height: 400px;
            overflow-y: scroll;
            display: block;
        }}
        .table {{
            width: 100%;
            border-collapse: collapse;
        }}
        .table th, .table td {{
            border: 1px solid #ddd;
            padding: 8px;
        }}
        .table th {{
            background-color: #f2f2f2;
            text-align: left;
        }}
    </style>
</head>
<body>
    <h1>Applicants by Distinct Patent Families</h1>
    <div class="scrollable-table">
        {html_table}
    </div>
    <div>
        {fig_html}
    </div>
</body>
</html>
"""

with open("applicants_dashboard.html", "w") as file:
    file.write(html_content)

print("The combined dashboard has been saved as 'applicants_dashboard.html'. You can download it from your working directory.")

The combined dashboard has been saved as 'applicants_dashboard.html'. You can download it from your working directory.
