# Alcune verifiche dell'approccio REE:

### si tratta di riprodurre alcuni risultati già trovati con Patstat online tramite SQLAlchemy. Alcune strategie sono comunque utili per analisi di Patent Intelligence.

In [1]:
# Importing the patstat client
from epo.tipdata.patstat import PatstatClient

# Initialize the PATSTAT client
patstat = PatstatClient(env='PROD')

# Access ORM
db = patstat.orm()

# Importing tables as models
from epo.tipdata.patstat.database.models import TLS201_APPLN,TLS203_APPLN_ABSTR, TLS202_APPLN_TITLE, TLS209_APPLN_IPC, TLS224_APPLN_CPC, TLS231_INPADOC_LEGAL_EVENT, TLS211_PAT_PUBLN, TLS206_PERSON, TLS207_PERS_APPLN

# Ora trattiamo un caso reale

Step 1: Retrieve distinct docdb_family_id values corresponding to keywords

In [2]:
from sqlalchemy import and_, or_
# Keywords for the abstract and title
keywords = [
    "rare earth element*", "light REE*", "heavy REE*", "rare earth metal*",
    "rare earth oxide*", "lanthan*", "rare earth"
]
# Step 1: Retrieve distinct docdb_family_id values corresponding to keywords
subquery_keywords = (
    db.query(TLS201_APPLN.docdb_family_id)
    .join(TLS203_APPLN_ABSTR, TLS203_APPLN_ABSTR.appln_id == TLS201_APPLN.appln_id)
    .filter(or_(*[TLS203_APPLN_ABSTR.appln_abstract.contains(kw) for kw in keywords]))
    .union(
        db.query(TLS201_APPLN.docdb_family_id)
        .join(TLS202_APPLN_TITLE, TLS202_APPLN_TITLE.appln_id == TLS201_APPLN.appln_id)
        .filter(or_(*[TLS202_APPLN_TITLE.appln_title.contains(kw) for kw in keywords]))
    ).distinct()
).all()

# Convert result to a list of docdb_family_id values
docdb_family_ids_keywords = [row.docdb_family_id for row in subquery_keywords]
# Print the number of distinct docdb_family_id values
print(f"Number of distinct docdb_family_id values: {len(docdb_family_ids_keywords)}")

Number of distinct docdb_family_id values: 84905


Step 2: Retrieve distinct docdb_family_id values corresponding to classification codes

In [3]:
from sqlalchemy import func

# Classification codes with varying lengths

ipc_codes_11 = [
'A43B   1/12','B03B   9/06','B29B   7/66','B30B   9/32','B65D  65/46','C03B   1/02',
'C04B   7/24','C04B   7/26','C04B   7/28','C04B   7/30','C04B  11/26','C04B  18/04','C04B  18/06','C04B  18/08','C04B  18/10',
'C04B  18/12','C04B  18/14','C04B  18/16','C04B  18/18','C04B  18/20','C04B  18/22','C04B  18/24','C04B  18/26','C04B  18/28',
'C04B  18/30','C09K  11/01','C22B  19/28','C22B  19/30','C22B  25/06','D21B   1/08','D21B   1/10','D21B   1/32','D21C   5/02',
'D21H  17/01','H01B  15/00','H01J   9/52','H01M   6/52','H01M  10/54']

ipc_codes_8 = [
'B22F   8','B29B  17','B62D  67','B65H  73',
'C08J  11','C10M 175','C22B   7','D01G  11']

ipc_codes_12 = [
'C04B  33/132']

cpc_codes_11 = ['A43B   1/12','B03B   9/06','B29B   7/66','B30B   9/32','B65D  65/46','C03B   1/02',
'C04B   7/24','C04B   7/26','C04B   7/28','C04B   7/30','C04B  11/26','C04B  18/04','C04B  18/06','C04B  18/08','C04B  18/10',
'C04B  18/12','C04B  18/14','C04B  18/16','C04B  18/18','C04B  18/20','C04B  18/22','C04B  18/24','C04B  18/26','C04B  18/28',
'C04B  18/30','C09K  11/01','C22B  19/28','C22B  19/30','C22B  25/06','D21B   1/08','D21B   1/10','D21B   1/32','D21C   5/02',
'D21H  17/01','H01B  15/00','H01J   9/52','H01M   6/52','H01M  10/54','Y02W  30/50','Y02W  30/52','Y02W  30/56','Y02W  30/58',
'Y02W  30/60','Y02W  30/62','Y02W  30/64','Y02W  30/66','Y02W  30/74','Y02W  30/78','Y02W  30/80','Y02W  30/82','Y02W  30/84',
'Y02W  30/91','Y02P  10/20']

cpc_codes_8 = ['B22F   8','B29B  17','B62D  67','B65H  73',
'C08J  11','C10M 175','C22B   7','D01G  11']

cpc_codes_12 = ['C04B  18/068','C04B  33/132',
'C04B   7/243','C04B   7/246','C04B  18/049','C04B  18/061','C04B  18/062','C04B  18/064','C04B  18/065',
'C04B  18/067','C04B  18/081','C04B  18/082','C04B  18/084','C04B  18/085','C04B  18/087','C04B  18/088',
'C04B  18/101','C04B  18/103','C04B  18/105','C04B  18/106','C04B  18/108','C04B  18/125',
'C04B  18/141','C04B  18/142','C04B  18/143','C04B  18/144','C04B  18/145','C04B  18/146','C04B  18/147',
'C04B  18/148','C04B  18/149','C04B  18/162','C04B  18/165','C04B  18/167','C04B  18/241','C04B  18/243',
'C04B  18/245','C04B  18/246','C04B  18/248','C04B  18/265','C04B  18/305']


cpc_codes_13 = ['C04B  18/0409','C04B  18/0418',
'C04B  18/0427','C04B  18/0436','C04B  18/0445','C04B  18/0454','C04B  18/0463','C04B  18/0472','C04B  18/0481']





# Subquery for classification codes with varying lengths
subquery_classcodes = (
    db.query(TLS201_APPLN.docdb_family_id)
    .join(TLS209_APPLN_IPC, TLS209_APPLN_IPC.appln_id == TLS201_APPLN.appln_id)
    .filter(
        or_(
            func.substr(TLS209_APPLN_IPC.ipc_class_symbol, 1, 11).in_(ipc_codes_11),
            func.substr(TLS209_APPLN_IPC.ipc_class_symbol, 1, 8).in_(ipc_codes_8),
            func.substr(TLS209_APPLN_IPC.ipc_class_symbol, 1, 12).in_(ipc_codes_12)
        )
    )
    .union(
        db.query(TLS201_APPLN.docdb_family_id)
        .join(TLS224_APPLN_CPC, TLS224_APPLN_CPC.appln_id == TLS201_APPLN.appln_id)
        .filter(
            or_(
                func.substr(TLS224_APPLN_CPC.cpc_class_symbol, 1, 11).in_(cpc_codes_11),
                func.substr(TLS224_APPLN_CPC.cpc_class_symbol, 1, 8).in_(cpc_codes_8),
                func.substr(TLS224_APPLN_CPC.cpc_class_symbol, 1, 12).in_(cpc_codes_12),
                func.substr(TLS224_APPLN_CPC.cpc_class_symbol, 1, 12).in_(cpc_codes_13)
            )
        )
    ).distinct()
).all()

# Convert result to a list of docdb_family_id values
docdb_family_ids_classcodes = [row.docdb_family_id for row in subquery_classcodes]

# Print the number of results
print(f"Number of results in Step 2: {len(docdb_family_ids_classcodes)}")

Number of results in Step 2: 567012


Step 3: Get the intersection of the two lists

In [4]:
# Step 3: Get the intersection of the two lists
intersection_docdb_family_ids = list(set(docdb_family_ids_keywords) & set(docdb_family_ids_classcodes))

Step 4: Retrieve and display the list of results

In [5]:
# Step 4: Filter the data according to the timeframe 2010 - 2022
final_query = (
    db.query(func.count(func.distinct(TLS201_APPLN.docdb_family_id)).label('Famiglie'))
    .filter(TLS201_APPLN.docdb_family_id.in_(intersection_docdb_family_ids))
    .filter(TLS201_APPLN.earliest_filing_year.between(2010, 2022))
)

# Execute the query
result = final_query.all()
print(result)

[(4313,)]


In [6]:
import pandas as pd

# Step 4: Filter the data according to the timeframe 2010 - 2022 and include docdb_family_size, earliest_filing_year, and person_ctry_code
final_query = (
    db.query(
        TLS201_APPLN.docdb_family_id,
        TLS201_APPLN.docdb_family_size,
        TLS201_APPLN.earliest_filing_year,
        TLS206_PERSON.person_ctry_code
    )
    .join(TLS207_PERS_APPLN, TLS207_PERS_APPLN.appln_id == TLS201_APPLN.appln_id)
    .join(TLS206_PERSON, TLS206_PERSON.person_id == TLS207_PERS_APPLN.person_id)
    .filter(TLS201_APPLN.docdb_family_id.in_(intersection_docdb_family_ids))
    .filter(TLS201_APPLN.earliest_filing_year.between(2010, 2022))
    .distinct()
)

# Execute the query
result = final_query.all()

# Convert the result to a DataFrame
df = pd.DataFrame(result, columns=['docdb_family_id', 'docdb_family_size', 'earliest_filing_year', 'person_ctry_code'])

# Display the DataFrame
print(df)

# Save the DataFrame to a CSV file (optional)
df.to_excel('family_dim.xlsx', index=False)

      docdb_family_id  docdb_family_size  earliest_filing_year  \
0            59947557                  1                  2017   
1            62078381                  1                  2017   
2            82171931                  1                  2022   
3            86051107                  1                  2022   
4            84858887                  1                  2022   
...               ...                ...                   ...   
4679         59515233                  1                  2017   
4680         63565142                  1                  2018   
4681         83762024                  1                  2022   
4682         81173912                  1                  2021   
4683         62748784                 21                  2017   

     person_ctry_code  
0                      
1                      
2                      
3                      
4                      
...               ...  
4679                   
4680           

In [6]:
import pandas as pd

# Step 4: Filter the data according to the timeframe 2010 - 2022 and include docdb_family_size, earliest_filing_year, and person_ctry_code
final_query = (
    db.query(
        TLS201_APPLN.docdb_family_id,
        TLS201_APPLN.docdb_family_size,
        TLS201_APPLN.earliest_filing_year,
        TLS206_PERSON.person_ctry_code
    )
    .join(TLS207_PERS_APPLN, TLS207_PERS_APPLN.appln_id == TLS201_APPLN.appln_id)
    .join(TLS206_PERSON, TLS206_PERSON.person_id == TLS207_PERS_APPLN.person_id)
    .filter(TLS201_APPLN.docdb_family_id.in_(intersection_docdb_family_ids))
    .filter(TLS201_APPLN.earliest_filing_year.between(2010, 2022))
    .distinct()
)

# Execute the query
result = final_query.all()

# Convert the result to a DataFrame
df = pd.DataFrame(result, columns=['docdb_family_id', 'docdb_family_size', 'earliest_filing_year', 'person_ctry_code'])

# Calculate the average family size for each combination of person_ctry_code and earliest_filing_year
avg_family_size_df = df.groupby(['person_ctry_code', 'earliest_filing_year'])['docdb_family_size'].mean().reset_index()

# Rename the columns for clarity
avg_family_size_df.columns = ['person_ctry_code', 'earliest_filing_year', 'average_family_size']

# Display the new DataFrame
print(avg_family_size_df)

# Save the new DataFrame to a CSV file (optional)
avg_family_size_df.to_excel('average_family_sizes.xlsx', index=False)

    person_ctry_code  earliest_filing_year  average_family_size
0                                     2010             1.926829
1                                     2011             2.521008
2                                     2012             1.780142
3                                     2013             1.559524
4                                     2014             1.447761
..               ...                   ...                  ...
203               US                  2021             2.923077
204               US                  2022             1.958333
205               ZA                  2011             4.000000
206               ZA                  2016            12.000000
207               ZA                  2017            21.000000

[208 rows x 3 columns]


In [8]:
import plotly.express as px
import pandas as pd

# Creazione del grafico con modifica dell'asse Y
fig = px.bar(
    avg_family_size_df,
    y='person_ctry_code',
    x='average_family_size',
    color='earliest_filing_year',
    title='Average Family Size by Country and Filing Year',
    labels={'person_ctry_code': 'Country', 'average_family_size': 'Average Family Size'},
    barmode='stack',
    category_orders={'person_ctry_code': sorted_countries_list}
)

# Modifica della visualizzazione dell'asse y
fig.update_layout(
    height=1000,  # Aumento dell'altezza per evitare sovrapposizioni
    yaxis=dict(
        tickmode='linear',
        #tickangle=-45,  # Ruota le etichette per migliorarne la leggibilità
        automargin=True  # Assicura che le etichette abbiano spazio sufficiente
    )
)

# Salva il grafico come file HTML
fig.write_html("average_family_size_by_country.html")

# Mostra il grafico
fig.show()


NameError: name 'sorted_countries_list' is not defined