### Loading Patent jsonl file

In [None]:
## install requred libraries
!pip install fastparquet

In [1]:
## Libraries
import glob

# - to extract gz files
import gzip
import json

# - to convert json to dataframe
import pandas as pd

In [52]:
# Load file
filename = "../data/raw/2021-us-patent.jsonl.gz"

## Open the gzip file and read the contents, apply decode to convert from bytes to string.
with gzip.open(filename, 'rb') as f:
    file_content = f.read().decode('ascii',  errors='ignore')

f.close()

In [53]:
content = file_content.splitlines()[9577]

In [None]:
content

In [None]:
patent = json.loads(content)

In [16]:
classifications_cpc = patent['biblio'].get('classifications_cpc')

In [34]:
dict = patent['biblio']['parties']['applicants'][0]
dict.get('residence')

In [38]:
dict.get('extracted_name', 'other')

{'value': 'HISEP TECH LTD'}

In [8]:
patents_data = []
patents_classifications = []
patents_applicants = []
patents_inventors = []

In [None]:
for content in file_content.splitlines():
    ## load content as a patent
    patent = json.loads(content)
    data = {
        'lens_id': patent['lens_id'],
        'jurisdiction': patent['jurisdiction'],
        'patent_id': patent['doc_key'],
        'date_published': patent['date_published'],
        'title': patent['biblio']['invention_title'][0]['text'],
        'abstract': patent['abstract'][0]['text']
    }
    patents_data.append(data)

    for applicant in patent['biblio']['parties']['applicants']:
        app_data = {
            'lens_id': patent['lens_id'],
            'patent_id': patent['doc_key'],
            'residence': applicant.get('residence', 'NA'),
            'name': applicant['extracted_name']['value']
        }
        patents_applicants.append(app_data)
    
    for inventor in patent['biblio']['parties']['inventors']:
        inv_data = {
            'lens_id': patent['lens_id'],
            'patent_id': patent['doc_key'],
            'residence': inventor.get('residence', 'NA'),
            'name': inventor['extracted_name']['value']
        }
        patents_inventors.append(inv_data)

    for classification in patent['biblio']['classifications_cpc']['classifications']:
        class_data = {
            'lens_id': patent['lens_id'],
            'patent_id': patent['doc_key'],
            'classification': classification['symbol']
        }
        patents_classifications.append(class_data)

del(content)    ## clear variable from memory

In [11]:
df = pd.DataFrame(patents_data)

In [14]:
df.to_parquet('test.parquet')

In [55]:
def process_file(file_content):

    for content in file_content.splitlines():
        ## load content as a patent
        patent = json.loads(content)

        data = {
            'lens_id': patent['lens_id'],
            'jurisdiction': patent['jurisdiction'],
            'patent_id': patent['doc_key'],
            'date_published': patent['date_published'],
            'title': patent['biblio']['invention_title'][0]['text'],
            'abstract': patent['abstract'][0]['text']
        }
        patents_data.append(data)

        for applicant in patent['biblio']['parties']['applicants']:
            app_data = {
                'lens_id': patent['lens_id'],
                'patent_id': patent['doc_key'],
                'residence': applicant.get('residence', 'NA'),
                'name': applicant['extracted_name']['value']
            }
            patents_applicants.append(app_data)
        
        for inventor in patent['biblio']['parties']['inventors']:
            inv_data = {
                'lens_id': patent['lens_id'],
                'patent_id': patent['doc_key'],
                'residence': inventor.get('residence', 'NA'),
                'name': inventor['extracted_name']['value']
            }
            patents_inventors.append(inv_data)

        #print('process classifications')
        classifications_cpc = patent['biblio'].get('classifications_cpc')
        if classifications_cpc is not None:
            for classification in classifications_cpc['classifications']:
                #print('process classification')
                class_data = {
                    'lens_id': patent['lens_id'],
                    'patent_id': patent['doc_key'],
                    'classification': classification['symbol']
                }
                patents_classifications.append(class_data)

    del(content)    ## clear variable from memory
   

In [56]:
def process_gzip(filename):
    ## Open the gzip file and read the contents, apply decode to convert from bytes to string.
    with gzip.open(filename, 'rb') as f:
        file_content = f.read().decode('ascii',  errors='ignore')       ##handling special characters: ignore, skip characters.
        process_file(file_content)
    f.close()

In [None]:
path = r'..\data\raw\*.gz'
files = glob.glob(path)

## init dicts:
patents_data = []
patents_classifications = []
patents_applicants = []
patents_inventors = []


for f in files:
    print("processing: ", f)
    process_gzip(f)
    

In [38]:
# Load file
filename = "../data/raw/poor_text"

## Open the gzip file and read the contents, apply decode to convert from bytes to string.
with open(filename, 'rb') as f:
    file_content = f.read().decode('utf8')

f.close()

In [40]:
test = json.loads(file_content)

In [41]:
test

{'test': 'SIGGRAPH 99, ACM �\x85 - NEW YORK,'}

## Journal parsing

In [54]:
filename = '../data/raw/journals/journals_2020-10-10_to_2020-10-20_from_0.json'
json_data = open(filename)
 
data = json.load(json_data)

In [55]:
content = data['data']

In [None]:
content

In [69]:
df = pd.DataFrame.from_dict(content)

In [57]:
df.columns

Index(['lens_id', 'title', 'date_published', 'authors', 'fields_of_study',
       'keywords', 'references', 'abstract', 'funding'],
      dtype='object')

In [6]:
df.head()

Unnamed: 0,lens_id,title,date_published,authors,fields_of_study,keywords,references,abstract,funding
0,001-026-600-745-033,Impact of the local care environment and socia...,2020-10-10T00:00:00.000000+00:00,"[{'first_name': 'Jean-David', 'last_name': 'Ze...","[Health care, Mortality rate, Case fatality ra...","[Aggregated fatality rate, COVID-19, Care envi...","[{'lens_id': '007-453-228-852-354'}, {'lens_id...",Abstract Objectives We aimed to investigate po...,
1,000-448-627-934-890,Public health prevention and emergency prepare...,2020-10-10T00:00:00.000000+00:00,"[{'first_name': 'Brendon', 'last_name': 'Sen-C...","[Public health, Preparedness, Healthcare syste...","[Center for disease, Control and prevention, E...",,•The CDC's cumulative funding for Public Healt...,"[{'org': 'CDC'}, {'org': 'Department of Agricu..."
2,001-381-066-203-176,Early longitudinal community pharmacy placemen...,2020-10-10T00:00:00.000000+00:00,"[{'first_name': 'Aisling', 'last_name': 'Kerr'...","[Pharmacy, Psychology, Response rate (survey),...","[Curriculum integration, Experiential learning...","[{'lens_id': '000-117-756-475-399'}, {'lens_id...",Abstract Background Longitudinal placements ar...,[{'org': 'RCSI'}]
3,005-816-533-307-691,Coccoloba alnifolia Leaf Extract as a Potentia...,2020-10-10T00:00:00.000000+00:00,"[{'first_name': 'Luciana Fentanes Moura', 'las...","[Chemistry, In vitro, In vivo, Phenols, Antiox...",,"[{'lens_id': '000-323-618-510-827'}, {'lens_id...",The genus Coccoloba is widely used in traditio...,"[{'org': 'Ministério da Ciência, Tecnologia e ..."
4,005-824-664-490-748,Mining miRNAs' Expressions in Glioma Based on ...,2020-10-10T00:00:00.000000+00:00,"[{'first_name': 'Ke', 'last_name': 'Li', 'init...","[microRNA, Apoptosis, Glioma, MTT assay, Cance...",,"[{'lens_id': '001-262-738-141-077'}, {'lens_id...",Purpose. To mine miR expression in glioma base...,


In [7]:
df.to_csv("/Users/kritika/Desktop/MDSI/iLab2/journal.csv")


In [70]:
# Convert and format the date_published column
df['date_published'] = pd.to_datetime(df['date_published']).dt.strftime('%y-%m-%d')

In [59]:
df['date_published']

0     20-10-10
1     20-10-10
2     20-10-10
3     20-10-10
4     20-10-10
        ...   
95    20-10-10
96    20-10-10
97    20-10-10
98    20-10-10
99    20-10-10
Name: date_published, Length: 100, dtype: object

In [None]:
# Define a function to extract and format the author name
def extract_author_info(row):
    authors = row['authors']
    
    if isinstance(authors, list) and len(authors) > 0:
        last_author = authors[-1]
        first_name = last_author.get('first_name', 'NA')
        last_name = last_author.get('last_name', 'NA')
        affiliations = last_author.get('affiliations', [])
        
        if affiliations:
            institution = affiliations[0].get('name', 'NA')
            country_code = affiliations[0].get('country_code', 'NA')
            return f"{first_name} {last_name}", institution, country_code
        
    return '', '', ''

# Apply the function to create the "author", "institution", and "country" columns
df[['author', 'institution', 'country']] = df.apply(extract_author_info, axis=1).apply(pd.Series)

# drop authors column
df.drop(['authors'], axis=1, inplace=True)

# Convert the list of strings into a single string, separated by commas
df['fields_of_study'] = df['fields_of_study'].apply(lambda x: ', '.join(x) if isinstance(x, list) else '')

In [77]:
df.head()

Unnamed: 0,lens_id,title,date_published,fields_of_study,keywords,references,abstract,funding,author,institution,country
0,001-026-600-745-033,Impact of the local care environment and socia...,20-10-10,"Health care, Mortality rate, Case fatality rat...","[Aggregated fatality rate, COVID-19, Care envi...","[{'lens_id': '007-453-228-852-354'}, {'lens_id...",Abstract Objectives We aimed to investigate po...,,Jérémie H. Lefevre,University of Paris,FR
1,000-448-627-934-890,Public health prevention and emergency prepare...,20-10-10,"Public health, Preparedness, Healthcare system...","[Center for disease, Control and prevention, E...",,•The CDC's cumulative funding for Public Healt...,"[{'org': 'CDC'}, {'org': 'Department of Agricu...",Adel Elkbuli,"Department of Surgery, Division of Trauma and ...",
2,001-381-066-203-176,Early longitudinal community pharmacy placemen...,20-10-10,"Pharmacy, Psychology, Response rate (survey), ...","[Curriculum integration, Experiential learning...","[{'lens_id': '000-117-756-475-399'}, {'lens_id...",Abstract Background Longitudinal placements ar...,[{'org': 'RCSI'}],Judith Strawbridge,Royal College of Surgeons in Ireland,IE
3,005-816-533-307-691,Coccoloba alnifolia Leaf Extract as a Potentia...,20-10-10,"Chemistry, In vitro, In vivo, Phenols, Antioxi...",,"[{'lens_id': '000-323-618-510-827'}, {'lens_id...",The genus Coccoloba is widely used in traditio...,"[{'org': 'Ministério da Ciência, Tecnologia e ...",Katia Castanho Scortecci,Federal University of Rio Grande do Norte,BR
4,005-824-664-490-748,Mining miRNAs' Expressions in Glioma Based on ...,20-10-10,"microRNA, Apoptosis, Glioma, MTT assay, Cancer...",,"[{'lens_id': '001-262-738-141-077'}, {'lens_id...",Purpose. To mine miR expression in glioma base...,,Hailong Xing,Binzhou University,CN


### Mapping fields of study to tech groups

In [33]:
# Concatenate all the lists of fields, ignoring float values
all_fields = []
for fields_list in df['fields_of_study']:
    if isinstance(fields_list, list):
        all_fields.extend(fields_list)

# Convert the concatenated list into a set to get unique values
unique_fields = set(all_fields)

# Print the unique fields
print(unique_fields)

{'Extension (predicate logic)', 'Flutter', 'Astronomical interferometer', 'Identification (biology)', 'Product (mathematics)', 'Family medicine', 'Causality', 'Groundwater', 'Residual', 'Lime Juice', 'Natural gas field', 'Cell biology', 'Wearable computer', 'Neurostimulation', 'Cardiology', 'Lung involvement', 'Index (economics)', 'Geriatrics', 'Anxiety', 'Test (assessment)', 'Small data', 'Central nervous system', 'Capital (economics)', 'Benign Esophageal Neoplasm', 'Oxidative stress', 'Diffusely Adherent Escherichia coli', 'Metronomic Chemotherapy', 'Safe haven', 'Sociological imagination', 'Vasodilation', 'Homosexuality', 'Mood', 'Telemedicine', 'Cavitation', 'Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2)', 'In vitro', 'GNSS applications', 'Rapeseed', 'Agreement', 'Distributed lag', 'Greenhouse gas', 'Urea', 'Dissipative system', 'Acute care', 'Consumption (economics)', 'Esophagectomy', 'Signal transduction', 'Superoxide dismutase', 'Applied mathematics', 'Intraocular

In [37]:
import regex as re
# Define the regular expressions and corresponding technology clusters
technology_clusters = {
    r'(Health|Medicine|Clinical|Pharmaceutical)': 'Medical and Healthcare',
    r'(Environment|Sustainability|Ecology|Conservation|Climate|Environmental)': 'Environmental Science and Sustainability',
    r'(Technology|Engineering|Computer|Software|Hardware|Information|Data|Artificial Intelligence)': 'Technology and Engineering',
    r'(Economics|Economic|Finance|Business|Management|Marketing|Sociology|Policy)': 'Economics and Social Sciences',
    r'(Physics|Mathematics|Math|Statistics|Quantitative|Algebra|Calculus)': 'Physics and Mathematics'
}

# Function to map fields to technology clusters
def map_technology(fields_list):
    if isinstance(fields_list, list):
        for field in fields_list: 
            for pattern, cluster in technology_clusters.items():
                if pd.notna(field) and re.search(pattern, field):
                    return cluster
    return 'Miscellaneous'  # If no match is found, you can assign it to an "Miscellaneous" category or handle it as needed

# Apply the mapping function to create the "technology" column
df['technology'] = df['fields_of_study'].apply(map_technology)

# Display the resulting DataFrame
print(df['technology'])


0            Medical and Healthcare
1            Medical and Healthcare
2                     Miscellaneous
3                     Miscellaneous
4                     Miscellaneous
                  ...              
95    Economics and Social Sciences
96                    Miscellaneous
97          Physics and Mathematics
98          Physics and Mathematics
99       Technology and Engineering
Name: technology, Length: 100, dtype: object


In [38]:
df['technology'].value_counts()

Medical and Healthcare                      41
Miscellaneous                               28
Economics and Social Sciences               13
Physics and Mathematics                      9
Technology and Engineering                   5
Environmental Science and Sustainability     4
Name: technology, dtype: int64

In [45]:
all_texts = list(technology_clusters.values()) + list(unique_fields)

In [50]:
fields_of_study = list(unique_fields)

technology_clusters = {
    0: 'Medical and Healthcare',
    1: 'Environmental Science and Sustainability',
    2: 'Technology and Engineering',
    3: 'Economics and Social Sciences',
    4: 'Physics and Mathematics'
}


In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the vectorizer on all_texts
tfidf_matrix = vectorizer.fit_transform(all_texts)

# Calculate cosine similarities between fields of study and cluster descriptions
similarities = cosine_similarity(tfidf_matrix[len(technology_clusters):], tfidf_matrix[:len(technology_clusters)])

# Find the most similar cluster for each field of study
field_to_cluster_mapping = {}
for i, field in enumerate(fields_of_study):
    most_similar_cluster = similarities[i].argmax()
    field_to_cluster_mapping[field] = technology_clusters[most_similar_cluster]

# Print the mapping of fields to clusters
for field, cluster in field_to_cluster_mapping.items():
    print(f"{field} -> {cluster}")

Extension (predicate logic) -> Medical and Healthcare
Flutter -> Medical and Healthcare
Astronomical interferometer -> Medical and Healthcare
Identification (biology) -> Medical and Healthcare
Product (mathematics) -> Physics and Mathematics
Family medicine -> Medical and Healthcare
Causality -> Medical and Healthcare
Groundwater -> Medical and Healthcare
Residual -> Medical and Healthcare
Lime Juice -> Medical and Healthcare
Natural gas field -> Medical and Healthcare
Cell biology -> Medical and Healthcare
Wearable computer -> Medical and Healthcare
Neurostimulation -> Medical and Healthcare
Cardiology -> Medical and Healthcare
Lung involvement -> Medical and Healthcare
Index (economics) -> Economics and Social Sciences
Geriatrics -> Medical and Healthcare
Anxiety -> Medical and Healthcare
Test (assessment) -> Medical and Healthcare
Small data -> Medical and Healthcare
Central nervous system -> Medical and Healthcare
Capital (economics) -> Economics and Social Sciences
Benign Esophage