# Compute Shapley-Owen values

Compute Shapley-Owen values to decompose $R^2$ among sets of regressors, separately for the patent data (patentsview) and the paper data (wos_2017).

# Preliminaries

In [1]:
# load some packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import config

In [2]:
# read the raw R2 table for patents (one row per predictor subset)
patents_df = pd.read_csv(
    filepath_or_buffer=config.DATA_PATH + "analytical/patents_shapley_data_raw.csv",
    low_memory=False,
)

In [3]:
# read the raw R2 table for papers
papers_df = pd.read_csv(
    filepath_or_buffer=config.DATA_PATH + "analytical/papers_shapley_data_raw.csv",
    low_memory=False,
)

# Shapley function

In [4]:
def get_shapley(predictors, r2_dict):
  """Compute the Shapley value of each predictor's contribution to R2.

  For every predictor, the marginal contribution (R2 of a subset minus the R2
  of the same subset without the predictor) is collected for each key of
  ``r2_dict`` that contains the predictor. Contributions are averaged within
  each subset size, and the per-size means are then averaged across sizes.

  Args:
    predictors: iterable of predictor names to compute values for.
    r2_dict: mapping from a tuple of predictor names to the R2 obtained with
      that subset. For every key containing a requested predictor, the
      subset without that predictor must be present too (this includes the
      empty tuple ``()``). The reduced subset is looked up under the same
      ordering first and, failing that, under any ordering of its members.

  Returns:
    dict mapping each predictor to its Shapley value. A predictor that does
    not occur in any key of ``r2_dict`` maps to NaN.

  Raises:
    KeyError: if a required reduced subset is missing from ``r2_dict``.
  """
  # Order-insensitive view of the table, used only as a fallback when the
  # reduced subset is stored under a different predictor ordering.
  rsq_by_members = {frozenset(subset): rsq for subset, rsq in r2_dict.items()}

  results_dict = {}
  for predictor in predictors:
    contributions_by_size = {}
    for subset, subset_rsq in r2_dict.items():
      reduced = tuple(p for p in subset if p != predictor)
      if len(reduced) == len(subset):
        # predictor is not part of this subset, nothing to attribute
        continue
      if reduced in r2_dict:
        reduced_rsq = r2_dict[reduced]
      else:
        try:
          reduced_rsq = rsq_by_members[frozenset(reduced)]
        except KeyError:
          raise KeyError(
              f"r2_dict has no entry for subset {reduced!r}, which is needed "
              f"to evaluate predictor {predictor!r} within {subset!r}"
          ) from None
      contributions_by_size.setdefault(len(subset), []).append(
          subset_rsq - reduced_rsq)

    if not contributions_by_size:
      # avoid np.mean([]) and its RuntimeWarning; the result is NaN either way
      results_dict[predictor] = np.nan
      continue

    # average within each subset size first, then across sizes
    size_means = [np.mean(values) for values in contributions_by_size.values()]
    results_dict[predictor] = np.mean(size_means)
  return results_dict

# Patents

Start by cleaning up the data frame to allow for easier processing.

In [5]:
# drop every row that has a missing value in any column
patents_df = patents_df.dropna(axis="index", how="any")

In [6]:
# split the whitespace-separated predictor names and store each row's names
# as a tuple (tuples are hashable, so they can serve as dict keys later on)
patents_df = patents_df.assign(
    predictors=lambda df: df["predictors"].str.split().apply(tuple)
)
patents_df

Unnamed: 0,predictors,rsq
0,"(nsubfield_id,)",0.014086
1,"(npubyear,)",0.23762
2,"(nauthor_id,)",0.243368
3,"(nsubfield_id, npubyear)",0.243979
4,"(nsubfield_id, nauthor_id)",0.24462
5,"(nauthor_id, npubyear)",0.337844
6,"(nsubfield_id, nauthor_id, npubyear)",0.338564


In [7]:
# append a row for the empty predictor set, with an R2 of zero
patents_df = pd.concat(
    objs=[
        patents_df,
        pd.DataFrame(data=[{"predictors": (), "rsq": 0.0}]),
    ],
)

In [8]:
# collect the distinct predictor names that occur in any subset; sort them so
# the order (and therefore the row order of the results) is the same on every
# run -- plain set iteration order over strings varies between interpreter runs
predictors_patents = sorted(
    {name for subset in patents_df.predictors for name in subset}
)

In [9]:
# map each predictor tuple to its R2 value
patents_df_dict = dict(zip(patents_df["predictors"], patents_df["rsq"]))

In [10]:
# sanity check: every row must have ended up under its own dict key
# (a mismatch means two rows share the same predictor tuple)
assert len(patents_df) == len(patents_df_dict)

In [11]:
# Shapley value of every predictor for the patents R2 table
results_patents = get_shapley(
    predictors=predictors_patents,
    r2_dict=patents_df_dict,
)

# Papers

Start by cleaning up the data frame to allow for easier processing.

In [12]:
# drop every row that has a missing value in any column
papers_df = papers_df.dropna(axis="index", how="any")

In [13]:
# split the whitespace-separated predictor names and store each row's names
# as a tuple (tuples are hashable, so they can serve as dict keys later on)
papers_df = papers_df.assign(
    predictors=lambda df: df["predictors"].str.split().apply(tuple)
)

In [14]:
# append a row for the empty predictor set, with an R2 of zero
papers_df = pd.concat(
    objs=[
        papers_df,
        pd.DataFrame(data=[{"predictors": (), "rsq": 0.0}]),
    ],
)

In [15]:
# collect the distinct predictor names that occur in any subset; sort them so
# the order (and therefore the row order of the results) is the same on every
# run -- plain set iteration order over strings varies between interpreter runs
predictors_papers = sorted(
    {name for subset in papers_df.predictors for name in subset}
)

In [16]:
# map each predictor tuple to its R2 value
papers_df_dict = dict(zip(papers_df["predictors"], papers_df["rsq"]))

In [17]:
# sanity check: every row must have ended up under its own dict key
# (a mismatch means two rows share the same predictor tuple)
assert len(papers_df) == len(papers_df_dict)

In [18]:
# Shapley value of every predictor for the papers R2 table
results_papers = get_shapley(
    predictors=predictors_papers,
    r2_dict=papers_df_dict,
)

# Plot results

In [19]:
# flatten both result dicts into (type, predictor, shapley) records,
# papers first, then patents
results_df = [
    (label, predictor, shapley)
    for label, results in (("Papers", results_papers), ("Patents", results_patents))
    for predictor, shapley in results.items()
]

In [20]:
# replace the raw column names with readable labels
PREDICTORS_DICT = {
    "nsubfield_id": "Field",
    "npubyear": "Year",
    "nauthor_id": "Author",
}
results_df = [
    (kind, PREDICTORS_DICT[raw_name], value)
    for kind, raw_name, value in results_df
]

In [21]:
# turn the list of records into a long-format data frame
results_df = pd.DataFrame(
    data=results_df,
    columns=["type", "predictor", "shapley"],
)

In [22]:
# display a wide table: one row per type, one column per predictor,
# ordered by type in descending order (the result is shown, not stored)
(
    results_df
    .pivot(index="type", columns="predictor", values="shapley")
    .sort_values(by="type", ascending=False)
)

predictor,Author,Field,Year
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Patents,0.167777,0.006204,0.164583
Papers,0.200482,0.018218,0.015041


In [23]:
# write the long-format results as a gzip-compressed CSV without the index
# NOTE(review): the file name spells "decomposion"; kept unchanged because
# other code may read the file under this name -- confirm before renaming
results_df.to_csv(
    path_or_buf=config.DATA_PATH + "analytical/shapley_decomposion_results.csv.gz",
    compression="gzip",
    index=False,
)