In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import json
import re

In [2]:
import utils

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/phaxssi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/phaxssi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Load the combined patents table and discard the 'Unnamed: 0' column
# (presumably the index written out when the spreadsheet was saved -- TODO confirm).
patents_info = pd.read_excel('graphs/total_patents_df.xlsx').drop(columns=['Unnamed: 0'])

# 1. Most cited

- Inventors
- Companies (Applicant Name)
- Countries
- Top 20 most cited

In [4]:
# Order the patents from highest to lowest 'input_degree'; the "most cited"
# rankings below all start from this frame.
patents_most_cited = patents_info.sort_values('input_degree', ascending=False)

### 1.1. Countries

In [6]:
# Rank countries by the summed 'input_degree' of their patents and save the top 25.
patents_most_cited_countries = patents_most_cited.copy()
# The helper is applied row by row (axis=1) and its result stored as 'country'.
patents_most_cited_countries['country'] = patents_most_cited_countries.apply(
    utils.fill_country_from_patent_number,
    axis=1,
)
# explode() yields one row per country when a cell holds a list-like of countries.
countries_data = patents_most_cited_countries.explode('country')
countries_citations = (
    countries_data
    .groupby('country')[['input_degree']]
    .sum()
    .reset_index()
)
top_countries = countries_citations.sort_values(by='input_degree', ascending=False).iloc[:25]
# One {'country': ..., 'input_degree': ...} record per ranked country.
top_countries_dict = top_countries.to_dict(orient='records')
with open('rankings/top_countries_cited.json', 'w') as f:
    json.dump(top_countries_dict, f, indent=4)

### 1.2. Inventors

In [8]:
# Rank inventors by the summed 'input_degree' of their patents (the "most cited"
# ranking) and save the top 20 as a {name: total} JSON object.
patents_most_cited_inventors = patents_most_cited.copy()
patents_most_cited_inventors['inventor_names'] = (
    patents_most_cited_inventors['inventor_names'].apply(utils.clean_and_deduplicate_names)
)
df_exploded = patents_most_cited_inventors.explode('inventor_names')
# Entries may still contain several comma-separated names, so split on ','
# and explode again to end up with one name per row.
df_exploded = df_exploded.assign(
    inventor_names=df_exploded['inventor_names'].str.split(',')
).explode('inventor_names')
df_exploded['inventor_names'] = df_exploded['inventor_names'].str.strip()
# Drop the placeholder rows so they do not show up as a ranked "inventor".
df_exploded = df_exploded[df_exploded['inventor_names'] != 'Unavailable information']
inventor_citations = df_exploded.groupby('inventor_names')['input_degree'].sum()
top_inventors = inventor_citations.sort_values(ascending=False).head(20)
top_inventors_dict = top_inventors.to_dict()
# ensure_ascii=False keeps non-ASCII names readable instead of \uXXXX escapes
# (as the citing rankings already do); the explicit UTF-8 encoding makes the
# write independent of the platform's default encoding.
with open('rankings/top_inventors_cited.json', 'w', encoding='utf-8') as f:
    json.dump(top_inventors_dict, f, indent=4, ensure_ascii=False)

### 1.3. Applicants

In [12]:
# Rank applicants by the summed 'input_degree' of their patents (the "most cited"
# ranking) and save the top 20 as a {name: total} JSON object.
patents_most_cited_applicants = patents_most_cited.copy()
patents_most_cited_applicants['applicant_names'] = (
    patents_most_cited_applicants['applicant_names'].apply(utils.clean_and_deduplicate_names)
)
df_exploded = patents_most_cited_applicants.explode('applicant_names')
# Entries may still contain several comma-separated names, so split on ','
# and explode again to end up with one name per row.
df_exploded = df_exploded.assign(
    applicant_names=df_exploded['applicant_names'].str.split(',')
).explode('applicant_names')
df_exploded['applicant_names'] = df_exploded['applicant_names'].str.strip()
# Drop the placeholder rows so they do not show up as a ranked "applicant".
df_exploded = df_exploded[df_exploded['applicant_names'] != 'Unavailable information']
applicant_citations = df_exploded.groupby('applicant_names')['input_degree'].sum()
top_applicants = applicant_citations.sort_values(ascending=False).head(20)
top_applicants_dict = top_applicants.to_dict()
# ensure_ascii=False keeps non-ASCII names readable instead of \uXXXX escapes
# (as the citing rankings already do); the explicit UTF-8 encoding makes the
# write independent of the platform's default encoding.
with open('rankings/top_applicants_cited.json', 'w', encoding='utf-8') as f:
    json.dump(top_applicants_dict, f, indent=4, ensure_ascii=False)

# 2. Most citing

In [7]:
# "Most citing" view: order patents by 'output_degree', the column that the
# citing rankings in this section aggregate (previously sorted by
# 'input_degree', a copy-paste leftover from the "most cited" section).
patents_most_citing = patents_info.sort_values(by='output_degree', ascending=False)

### 2.1. Countries

In [8]:
# Rank countries by the summed 'output_degree' of their patents (the "most
# citing" ranking) and save the top 25.
# The working frame gets its own "citing" name so this cell no longer
# overwrites the `patents_most_cited_countries` frame built in section 1.1.
patents_most_citing_countries = patents_most_citing.copy()
# The helper is applied row by row (axis=1) and its result stored as 'country'.
patents_most_citing_countries['country'] = patents_most_citing_countries.apply(
    utils.fill_country_from_patent_number, axis=1
)
# explode() yields one row per country when a cell holds a list-like of countries.
countries_data = patents_most_citing_countries.explode('country')
countries_citations = countries_data.groupby('country').agg({'output_degree': 'sum'}).reset_index()
top_countries = countries_citations.sort_values(by='output_degree', ascending=False).head(25)
# One {'country': ..., 'output_degree': ...} record per ranked country.
top_countries_dict = top_countries.to_dict(orient='records')
with open('rankings/top_countries_citing.json', 'w') as f:
    json.dump(top_countries_dict, f, indent=4)

### 2.2. Inventors

In [11]:
# Rank inventors by the summed 'output_degree' of their patents (the "most
# citing" ranking) and save the top 20 as a {name: total} JSON object.
patents_most_citing_inventors = patents_most_citing.copy()
patents_most_citing_inventors['inventor_names'] = (
    patents_most_citing_inventors['inventor_names'].apply(utils.clean_and_deduplicate_names)
)
df_exploded = patents_most_citing_inventors.explode('inventor_names')
# Entries may still contain several comma-separated names, so split on ','
# and explode again to end up with one name per row.
df_exploded = df_exploded.assign(
    inventor_names=df_exploded['inventor_names'].str.split(',')
).explode('inventor_names')
df_exploded['inventor_names'] = df_exploded['inventor_names'].str.strip()
# Drop the placeholder rows so they do not show up as a ranked "inventor".
df_exploded = df_exploded[df_exploded['inventor_names'] != 'Unavailable information']
inventor_citations = df_exploded.groupby('inventor_names')['output_degree'].sum()
top_inventors = inventor_citations.sort_values(ascending=False).head(20)
top_inventors_dict = top_inventors.to_dict()
# ensure_ascii=False writes non-ASCII names verbatim, so the file must be
# opened as UTF-8 explicitly; relying on the platform default encoding can
# raise UnicodeEncodeError or produce JSON that is not valid UTF-8.
with open('rankings/top_inventors_citing.json', 'w', encoding='utf-8') as f:
    json.dump(top_inventors_dict, f, indent=4, ensure_ascii=False)

### 2.3. Applicants

In [13]:
# Rank applicants by the summed 'output_degree' of their patents (the "most
# citing" ranking) and save the top 20 as a {name: total} JSON object.
patents_most_citing_applicants = patents_most_citing.copy()
patents_most_citing_applicants['applicant_names'] = (
    patents_most_citing_applicants['applicant_names'].apply(utils.clean_and_deduplicate_names)
)
df_exploded = patents_most_citing_applicants.explode('applicant_names')
# Entries may still contain several comma-separated names, so split on ','
# and explode again to end up with one name per row.
df_exploded = df_exploded.assign(
    applicant_names=df_exploded['applicant_names'].str.split(',')
).explode('applicant_names')
df_exploded['applicant_names'] = df_exploded['applicant_names'].str.strip()
# Drop the placeholder rows so they do not show up as a ranked "applicant".
df_exploded = df_exploded[df_exploded['applicant_names'] != 'Unavailable information']
applicant_citations = df_exploded.groupby('applicant_names')['output_degree'].sum()
top_applicants = applicant_citations.sort_values(ascending=False).head(20)
top_applicants_dict = top_applicants.to_dict()
# ensure_ascii=False writes non-ASCII names verbatim, so the file must be
# opened as UTF-8 explicitly; relying on the platform default encoding can
# raise UnicodeEncodeError or produce JSON that is not valid UTF-8.
with open('rankings/top_applicants_citing.json', 'w', encoding='utf-8') as f:
    json.dump(top_applicants_dict, f, indent=4, ensure_ascii=False)