In [None]:
import pandas as pd
import zipfile

In [None]:
# Read the CPC classification table straight out of its zip archive:
# both the archive and the member stream are closed when the block exits.
with zipfile.ZipFile('g_cpc_current.tsv.zip', 'r') as cpc_archive, \
        cpc_archive.open('g_cpc_current.tsv') as cpc_member:
    # Tab-separated file whose first row holds the column names.
    df = pd.read_csv(cpc_member, header=0, sep='\t')

In [None]:
# Keep only the rows whose CPC group is exactly 'C12Q1/686'.
in_pcr_group = df['cpc_group'].eq('C12Q1/686')
pcr_patents = df[in_pcr_group]

In [None]:
# Plain Python list of the patent_id values of the selected rows.
pcr_patent_ids = pcr_patents.loc[:, 'patent_id'].to_list()

In [None]:
# Read the patent-citation table straight out of its zip archive:
# both the archive and the member stream are closed when the block exits.
with zipfile.ZipFile('g_us_patent_citation.tsv.zip', 'r') as citation_archive, \
        citation_archive.open('g_us_patent_citation.tsv') as citation_member:
    # Tab-separated file whose first row holds the column names.
    citations = pd.read_csv(citation_member, header=0, sep='\t')

In [None]:
# Normalise both id columns to strings so they compare equal to the
# string ids used for the isin() filters further down.
for id_column in ('patent_id', 'citation_patent_id'):
    citations[id_column] = citations[id_column].astype(str)

In [None]:
# Turn every id into its string form (same type as the citation id columns).
pcr_patent_ids = list(map(str, pcr_patent_ids))

In [None]:
# A citation row is kept when at least one of its two ids is in pcr_patent_ids.
patent_id_selected = citations['patent_id'].isin(pcr_patent_ids)
citation_id_selected = citations['citation_patent_id'].isin(pcr_patent_ids)
pcr_citations = citations[patent_id_selected | citation_id_selected]

In [None]:
# Persist the filtered citations as a tab-separated file without the row index.
pcr_citations.to_csv(
    'pcr_citations.tsv',
    index=False,
    sep='\t',
)

In [None]:
# Citations where BOTH ids are in pcr_patent_ids.
patent_id_selected = pcr_citations['patent_id'].isin(pcr_patent_ids)
citation_id_selected = pcr_citations['citation_patent_id'].isin(pcr_patent_ids)
pcr_citations_between = pcr_citations[patent_id_selected & citation_id_selected]

In [None]:
# Display the citation rows whose two ids are both in pcr_patent_ids
# (the notebook renders the value of the cell's last expression).
pcr_citations_between

In [None]:
# Write both tables as tab-separated files without the row index.
for frame, out_path in (
    (pcr_patents, 'pcr_patents.tsv'),
    (pcr_citations_between, 'pcr_citations_between.tsv'),
):
    frame.to_csv(out_path, sep='\t', index=False)

In [None]:
import networkx as nx

In [None]:
# Preview only the first ids: echoing the whole list writes every id into the
# notebook output, which bloats the file and buries the narrative.
pcr_patent_ids[:10]

In [None]:
# Directed graph with one edge patent_id -> citation_patent_id per row.
G = nx.from_pandas_edgelist(
    pcr_citations_between,
    source='patent_id',
    target='citation_patent_id',
    create_using=nx.DiGraph,
)

In [None]:
# main path analysis
# Sources have no incoming edge, sinks have no outgoing edge.
source_nodes = [node for node, degree in G.in_degree() if degree == 0]
sink_nodes = [node for node, degree in G.out_degree() if degree == 0]

In [None]:
# Traversal weight of an edge = number of source->sink paths that use it.
weights = {}
if nx.is_directed_acyclic_graph(G):
    # On an acyclic graph the count factorises into
    #   (#paths from any source to the edge's tail)
    # * (#paths from the edge's head to any sink),
    # so two linear sweeps replace enumerating every individual path, whose
    # number can grow exponentially with the size of the graph.
    source_set = set(source_nodes)
    sink_set = set(sink_nodes)
    topo_order = list(nx.topological_sort(G))

    # paths_in[n]: number of paths that start at a source and end at n.
    paths_in = {}
    for node in topo_order:
        paths_in[node] = int(node in source_set) + sum(
            paths_in[pred] for pred in G.predecessors(node))

    # paths_out[n]: number of paths that start at n and end at a sink.
    paths_out = {}
    for node in reversed(topo_order):
        paths_out[node] = int(node in sink_set) + sum(
            paths_out[succ] for succ in G.successors(node))

    for tail, head in G.edges():
        traversals = paths_in[tail] * paths_out[head]
        if traversals:
            # Edges on no source->sink path stay absent, as with enumeration.
            weights[(tail, head)] = traversals
else:
    # A cycle breaks the factorisation above, so enumerate simple paths.
    for source in source_nodes:
        for sink in sink_nodes:
            # all_simple_paths yields nothing when sink is unreachable, so no
            # exception handling is needed; iterate lazily instead of
            # materialising every path in a list first.
            for path in nx.all_simple_paths(G, source, sink):
                for edge in zip(path, path[1:]):
                    weights[edge] = weights.get(edge, 0) + 1

In [None]:
import pickle


def save_weights(weights, filename):
    """Pickle ``weights`` into the file ``filename + '.pkl'``.

    Args:
        weights: Object to serialise (the notebook passes the edge-weight dict).
        filename: Path without extension; '.pkl' is appended.
    """
    target_path = filename + '.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(weights, handle)


def load_weights(filename):
    """Return the object pickled in the file ``filename + '.pkl'``.

    Args:
        filename: Path without extension; '.pkl' is appended.

    Raises:
        FileNotFoundError: If the pickle file does not exist.
    """
    source_path = filename + '.pkl'
    with open(source_path, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

# Example usage:
# save_weights(weights, 'network_weights')
# weights = load_weights('network_weights')


In [None]:
# Use the cached weights when 'network_weights.pkl' exists. On a fresh run
# the cache has never been written (save_weights is only shown as a
# commented-out example), so persist the weights computed in this session
# instead of failing with FileNotFoundError.
# NOTE(review): pickle.load can execute code from the file; only load caches
# produced by this notebook.
try:
    weights = load_weights('network_weights')
except FileNotFoundError:
    save_weights(weights, 'network_weights')

In [None]:
# Preview the 10 most-traversed edges instead of echoing the whole dict,
# which has one entry per weighted edge and would flood the notebook output.
sorted(weights.items(), key=lambda item: item[1], reverse=True)[:10]

In [None]:
# Reload the saved citations. Both id columns are pinned to str: without it
# read_csv re-infers the type, and purely numeric ids would no longer compare
# equal to the string ids in pcr_patent_ids.
pcr_citations = pd.read_csv(
    'pcr_citations.tsv',
    sep='\t',
    dtype={'patent_id': str, 'citation_patent_id': str},
)

In [None]:
# Display the citations frame reloaded from 'pcr_citations.tsv'.
pcr_citations

In [None]:
# Display the rows selected for CPC group 'C12Q1/686'.
pcr_patents

In [None]:
# Preview only the first ids: echoing the whole list writes every id into the
# notebook output, which bloats the file and buries the narrative.
pcr_patent_ids[:10]

In [None]:
# Read patent_id as str: pcr_patent_ids holds strings, so an id column that
# read_csv inferred as integers would make the isin() filter that follows
# silently match nothing.
patent_measures = pd.read_csv(
    'patent_measure_combined.csv',
    index_col=0,
    dtype={'patent_id': str},
)

In [None]:
# Rebind pcr_patents to the patent_measures rows whose patent_id is in pcr_patent_ids.
has_selected_id = patent_measures['patent_id'].isin(pcr_patent_ids)
pcr_patents = patent_measures[has_selected_id]

In [None]:
# Read patent_id as str: pcr_patent_ids holds strings, and an id column whose
# type is inferred (possibly as integers, or mixed across parser chunks)
# would make the isin() filter that follows miss matching rows.
patents = pd.read_csv('g_patent.tsv', sep='\t', dtype={'patent_id': str})

In [None]:
# Rows of the patents table whose patent_id is in pcr_patent_ids.
has_selected_id = patents['patent_id'].isin(pcr_patent_ids)
pcr_patents_full = patents[has_selected_id]

In [None]:
# Largest patent_date value among the rows taken from the patents table.
pcr_patents_full['patent_date'].max()

In [None]:
# Largest patent_date value among the rows taken from patent_measures.
pcr_patents['patent_date'].max()

In [None]:
# List the column labels of pcr_patents.
pcr_patents.columns

In [None]:
# Pairwise correlation matrix of the grant_year and fcitALL columns.
pcr_patents.loc[:, ['grant_year', 'fcitALL']].corr()

In [None]:
# Preview the 10 most-traversed edges instead of echoing the whole dict,
# which has one entry per weighted edge and would flood the notebook output.
sorted(weights.items(), key=lambda item: item[1], reverse=True)[:10]

In [None]:
# Main-path edge set: the union of the cheapest source->sink routes, where an
# edge is cheaper the more often it is traversed.
# NOTE(review): pickle.load can execute code from the file; only load caches
# produced by this notebook.
weights = load_weights('network_weights')

# Scale the traversal counts so the largest becomes 1. `default=1` keeps an
# empty cache from raising, matching the empty-case handling further down.
max_weight = max(weights.values(), default=1)
weights = {k: v/max_weight for k, v in weights.items()}

# Find main path
main_path_edges = set()

# Transform weights for shortest path calculation
# (higher traversal weight = shorter path)
transformed_weights = {edge: 1 - weight
                     for edge, weight in weights.items()}


def _edge_cost(u, v, d):
    """Cost of edge (u, v) for the shortest-path search; d (edge data) is unused."""
    return transformed_weights[(u, v)]


# Find paths between all sources and sinks.
# One Dijkstra run per source returns the cheapest route to every reachable
# node, so the search is not repeated for each (source, sink) pair. When
# several routes tie on cost, the one picked may differ from a pairwise search.
for source in source_nodes:
    paths_from_source = nx.single_source_dijkstra_path(G, source, weight=_edge_cost)
    for sink in sink_nodes:
        path = paths_from_source.get(sink)
        if path is None:
            # sink is not reachable from this source
            continue
        # Add edges from path to main path
        main_path_edges.update(zip(path, path[1:]))

# Calculate path significance: mean normalised weight of the selected edges
if main_path_edges:
    path_significance = sum(weights[edge] for edge in main_path_edges) / len(main_path_edges)
else:
    path_significance = 0

with open('main_path_edges.pkl', 'wb') as f:
    pickle.dump(main_path_edges, f)

# NOTE(review): this file receives the normalised `weights`, not
# `transformed_weights` - confirm that is what the file name is meant to hold.
with open('weights_transformed.pkl','wb') as f:
    pickle.dump(weights, f)

with open('path_significance.pkl','wb') as f:
    pickle.dump(path_significance, f)


In [None]:
# Collect every distinct node that appears as an endpoint of a main-path edge.
endpoint_ids = set()
for start_id, end_id in main_path_edges:
    endpoint_ids.update((start_id, end_id))
main_path_patents = list(endpoint_ids)
print(len(main_path_patents))

In [None]:
# Single best path: the source->sink route with the largest total normalised
# traversal weight.
# NOTE(review): pickle.load can execute code from the file; only load caches
# produced by this notebook.
weights = load_weights('network_weights')

# Scale the traversal counts so the largest becomes 1; `default=1` keeps an
# empty cache from raising.
max_weight = max(weights.values(), default=1)
weights = {k: v/max_weight for k, v in weights.items()}

# Negate the weights so that the route with the LARGEST total weight becomes
# the one with the SMALLEST total cost.
# (`main_path_edges` is no longer reset here: this cell never used it, and
# resetting it discarded the result of the main-path cell.)
transformed_weights = {edge: - weight
                     for edge, weight in weights.items()}


def _negated_weight(u, v, d):
    """Negated weight of edge (u, v); d (edge data) is unused."""
    return transformed_weights[(u, v)]


# Track the cheapest route seen so far. shortest_path stays None when no sink
# is reachable from any source, so later cells do not hit a NameError.
shortest_path = None
shortest_path_length = float('inf')
for source in source_nodes:
    # Costs are negative, which Dijkstra does not support, so use
    # Bellman-Ford. One run per source gives distance and route to every
    # reachable node. A cycle reachable from the source is a negative cycle
    # here and raises nx.NetworkXUnbounded, since no best path exists then.
    distances, paths = nx.single_source_bellman_ford(G, source, weight=_negated_weight)
    for sink in sink_nodes:
        if sink not in distances:
            # sink is not reachable from this source
            continue
        if distances[sink] < shortest_path_length:
            # Record the new best cost together with the route, so later
            # candidates are compared with the best one rather than with a
            # fixed threshold.
            shortest_path_length = distances[sink]
            shortest_path = paths[sink]

In [None]:
# Pickle the selected path to 'main_path.pkl'.
with open('main_path.pkl', mode='wb') as main_path_file:
    pickle.dump(shortest_path, main_path_file)

In [None]:
# Display the path selected by the search above.
shortest_path