In [None]:
# Install the required libraries if you haven't already
!pip install torch transformers



In [4]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from transformers import AutoModel, AutoTokenizer
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:


def tokenize_text_data(text, tokenizer = False):
  tokenized_text = tokenizer(text, return_tensors="pt")['input_ids']
  return tokenized_text


def fetch_embeddings_for_text_data(tokenized_text, model = False):
  with torch.no_grad():
    embeddings = model(tokenized_text)['last_hidden_state'].mean(dim=1).numpy()
  return embeddings


def calculate_cosine_similarity(text_embedding_one, text_embedding_two):
  return cosine_similarity(text_embedding_one, text_embedding_two)[0][0]



def orchestrate(df, col1, col2):

  df[col1] = df[col1].astype(str)
  df[col2] = df[col2].astype(str)
  model_name = "bert-base-uncased"  # You can change the model as needed
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModel.from_pretrained(model_name)

  tokenizer_lambda = lambda x, y = tokenizer: tokenize_text_data(x, y)
  col1_tokenized = df[col1].apply(tokenizer_lambda)
  col2_tokenized = df[col2].apply(tokenizer_lambda)

  embeddings_lambda = lambda x, y = model: fetch_embeddings_for_text_data(x, y)
  col1_tokenized_embeddings = list(col1_tokenized.apply(embeddings_lambda))
  col2_tokenized_embeddings = list(col2_tokenized.apply(embeddings_lambda))

  semantic_similarities = [calculate_cosine_similarity(col1_tokenized_embeddings[i], col2_tokenized_embeddings[i]) for i in range(len(col1_tokenized_embeddings))]
  return semantic_similarities


def correlate(df, col1, col2, name = False):
  if name:
    df = df[df['Member'] == name]
  other_df = orchestrate(df[['Member', 'disclosure_date', 'transaction_date', col1, col2]], col1, col2)
  other_df = pd.Series(other_df).to_frame().reset_index()
  df = df.reset_index().merge(other_df, how = 'inner', on = 'index')
  return df.drop_duplicates()



# Transactions Link to Committees and Subcommittees

In [5]:
transactions = pd.read_csv('stacked_df.csv')
committees = pd.read_csv('committees.csv')
membership = pd.read_csv('membership.csv')
transactions.head()

  transactions = pd.read_csv('stacked_df.csv')


Unnamed: 0,disclosure_date,transaction_date,owner,ticker,asset_description,type,amount,member_x,district,state,...,HSSM22,SSCM33,SSCM34,SSCM35,SSCM36,SSCM37,SSCM38,SSCM39,SSJU27,SSJU28
0,2020-04-28,2020-02-24,joint,BHC,Bausch Health Companies Inc.,purchase,"$1,001 - $15,000",susie lee,NV03,NV,...,,,,,,,,,,
1,2020-04-28,2020-03-06,joint,BAH,Booz Allen Hamilton Holding Corporation,sale_full,"$1,001 - $15,000",susie lee,NV03,NV,...,,,,,,,,,,
2,2020-04-28,2020-03-23,joint,BYD,Boyd Gaming Corporation,sale_full,"$1,001 - $15,000",susie lee,NV03,NV,...,,,,,,,,,,
3,2020-04-28,2020-02-04,joint,CTLT,"Catalent, Inc.",purchase,"$1,001 - $15,000",susie lee,NV03,NV,...,,,,,,,,,,
4,2020-04-28,2020-03-18,joint,CBRE,CBRE Group Inc Common Stock Class A,sale_partial,"$1,001 - $15,000",susie lee,NV03,NV,...,,,,,,,,,,


In [6]:
transactions_cols_list = ['member_x', 'disclosure_date', 'transaction_date', 'owner', 'ticker', 'asset_description', 'type', 'amount', 'industry', 'sector']
transactions = transactions[transactions_cols_list]
transactions = transactions.rename(columns = {'member_x': 'Member'})
transactions

Unnamed: 0,Member,disclosure_date,transaction_date,owner,ticker,asset_description,type,amount,industry,sector
0,susie lee,2020-04-28,2020-02-24,joint,BHC,Bausch Health Companies Inc.,purchase,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care
1,susie lee,2020-04-28,2020-03-06,joint,BAH,Booz Allen Hamilton Holding Corporation,sale_full,"$1,001 - $15,000",Professional Services,Consumer Services
2,susie lee,2020-04-28,2020-03-23,joint,BYD,Boyd Gaming Corporation,sale_full,"$1,001 - $15,000",Movies/Entertainment,Consumer Discretionary
3,susie lee,2020-04-28,2020-02-04,joint,CTLT,"Catalent, Inc.",purchase,"$1,001 - $15,000",Biotechnology: Pharmaceutical Preparations,Health Care
4,susie lee,2020-04-28,2020-03-18,joint,CBRE,CBRE Group Inc Common Stock Class A,sale_partial,"$1,001 - $15,000",Real Estate,Finance
...,...,...,...,...,...,...,...,...,...,...
15679,lois frankel,2023-04-28,2023-04-05,,CI,The Cigna Group,purchase,"$1,001 - $15,000",Medical Specialities,Health Care
15680,lois frankel,2023-04-28,2023-03-16,,ULTA,Ulta Beauty inc,sale_partial,"$1,001 - $15,000",,
15681,earl blumenauer,2023-03-09,2023-02-15,self,NWN,Northwest Natural Holding Company,purchase,"$1,001 - $15,000",Oil/Gas Transmission,Public Utilities
15682,dwight evans,2023-03-09,2023-02-28,,CSX,CSX Corporation,sale_full,"$1,001 - $15,000",Railroads,Transportation


In [7]:
committees.head()

Unnamed: 0,Committee,Comittee_code,Subcommittee,Subcommittee_code
0,House Committee on Agriculture,HSAG,Forestry,HSAG15
1,House Committee on Agriculture,HSAG,"Commodity Markets, Digital Assets, and Rural D...",HSAG22
2,House Committee on Agriculture,HSAG,"General Farm Commodities, Risk Management, and...",HSAG16
3,House Committee on Agriculture,HSAG,"Livestock, Dairy, and Poultry",HSAG29
4,House Committee on Agriculture,HSAG,"Conservation, Research, and Biotechnology",HSAG14


In [8]:
membership['Member_name'] = membership['Member_name'].str.lower()
membership.head()

Unnamed: 0,Comittee_code,Member_name
0,HSII,bruce westerman
1,HSII,raúl m. grijalva
2,HSII,doug lamborn
3,HSII,grace f. napolitano
4,HSII,robert j. wittman


In [None]:

transactions_link_committees = transactions.merge(membership, how = 'inner', left_on = 'Member', right_on = 'Member_name')
transactions_link_committees = transactions_link_committees.merge(committees, how = 'inner', on = 'Comittee_code')
transactions_link_committees.head()

Unnamed: 0,Member,disclosure_date,transaction_date,owner,ticker,asset_description,type,amount,industry,sector,Comittee_code,Member_name,Committee,Subcommittee,Subcommittee_code
0,susie lee,2020-04-28,2020-02-24,joint,BHC,Bausch Health Companies Inc.,purchase,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care,HSII,susie lee,House Committee on Natural Resources,Energy and Mineral Resources,HSII06
1,susie lee,2020-04-28,2020-02-24,joint,BHC,Bausch Health Companies Inc.,purchase,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care,HSII,susie lee,House Committee on Natural Resources,Federal Lands,HSII10
2,susie lee,2020-04-28,2020-02-24,joint,BHC,Bausch Health Companies Inc.,purchase,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care,HSII,susie lee,House Committee on Natural Resources,"Water, Wildlife and Fisheries",HSII13
3,susie lee,2020-04-28,2020-02-24,joint,BHC,Bausch Health Companies Inc.,purchase,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care,HSII,susie lee,House Committee on Natural Resources,Indian and Insular Affairs,HSII24
4,susie lee,2020-04-28,2020-02-24,joint,BHC,Bausch Health Companies Inc.,purchase,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care,HSII,susie lee,House Committee on Natural Resources,Oversight and Investigations,HSII15


## Transformations to obtain company, committee, and subcomittee descriptions to enhance the semantic contextual correlation capacity of the BERT transformer model

In [None]:
### Obtain all distinct asset descriptions

asset_descriptions = transactions_link_committees['asset_description'].value_counts().to_frame().reset_index()[['index']].drop_duplicates()
asset_descriptions.to_csv('asset_descriptions.csv')

In [None]:
### Obtain all distinct committees

commmittees = transactions_link_committees['Committee'].value_counts().to_frame().reset_index()[['index']].drop_duplicates()
commmittees.to_csv('committee.csv')

In [None]:
### Obtain all distinct subcommittees

subcommittees = transactions_link_committees['Subcommittee'].value_counts().to_frame().reset_index()[['index']].drop_duplicates()
subcommittees.to_csv('subcommittee.csv')

In [None]:
### Obtain all distinct industries

industries = transactions_link_committees['industry'].value_counts().to_frame().reset_index()[['index']].drop_duplicates()
industries.to_csv('industry.csv')

In [None]:
### Merge committees

committees_enriched = pd.read_csv('comittee_enriched.csv')
transactions_link_committees = transactions_link_committees.merge(committees_enriched, how = 'inner', on = 'Committee')
transactions_link_committees = transactions_link_committees.rename(columns = {
    'Descriptions': 'committee_description'
})
transactions_link_committees = transactions_link_committees.drop(columns = ['Unnamed: 0'])
transactions_link_committees.head()

Unnamed: 0,Member,disclosure_date,transaction_date,owner,ticker,asset_description,type,amount,industry,sector,Comittee_code,Member_name,Committee,Subcommittee,Subcommittee_code,committee_description
0,susie lee,2020-04-28,2020-02-24,joint,BHC,Bausch Health Companies Inc.,purchase,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care,HSII,susie lee,House Committee on Natural Resources,Energy and Mineral Resources,HSII06,"Manages federal lands, oversees energy and min..."
1,susie lee,2020-04-28,2020-02-24,joint,BHC,Bausch Health Companies Inc.,purchase,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care,HSII,susie lee,House Committee on Natural Resources,Federal Lands,HSII10,"Manages federal lands, oversees energy and min..."
2,susie lee,2020-04-28,2020-02-24,joint,BHC,Bausch Health Companies Inc.,purchase,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care,HSII,susie lee,House Committee on Natural Resources,"Water, Wildlife and Fisheries",HSII13,"Manages federal lands, oversees energy and min..."
3,susie lee,2020-04-28,2020-02-24,joint,BHC,Bausch Health Companies Inc.,purchase,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care,HSII,susie lee,House Committee on Natural Resources,Indian and Insular Affairs,HSII24,"Manages federal lands, oversees energy and min..."
4,susie lee,2020-04-28,2020-02-24,joint,BHC,Bausch Health Companies Inc.,purchase,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care,HSII,susie lee,House Committee on Natural Resources,Oversight and Investigations,HSII15,"Manages federal lands, oversees energy and min..."


In [None]:
### Merge subcommittees
subcommittees_enriched = pd.read_csv('subcommittee_enriched.csv')
transactions_link_committees = transactions_link_committees.merge(subcommittees_enriched, how = 'inner', left_on = 'Subcommittee', right_on = 'index')
transactions_link_committees = transactions_link_committees.rename(columns = {
    'Description': 'subcommittee_description'
})
transactions_link_committees = transactions_link_committees.drop(columns = ['Unnamed: 0', 'index'])
transactions_link_committees.head()

Unnamed: 0,Member,disclosure_date,transaction_date,owner,ticker,asset_description,type,amount,industry,sector,Comittee_code,Member_name,Committee,Subcommittee,Subcommittee_code,committee_description,subcommittee_description
0,susie lee,2020-04-28,2020-02-24,joint,BHC,Bausch Health Companies Inc.,purchase,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care,HSII,susie lee,House Committee on Natural Resources,Energy and Mineral Resources,HSII06,"Manages federal lands, oversees energy and min...",Addresses legislation concerning energy and mi...
1,susie lee,2020-04-28,2020-03-06,joint,BAH,Booz Allen Hamilton Holding Corporation,sale_full,"$1,001 - $15,000",Professional Services,Consumer Services,HSII,susie lee,House Committee on Natural Resources,Energy and Mineral Resources,HSII06,"Manages federal lands, oversees energy and min...",Addresses legislation concerning energy and mi...
2,susie lee,2020-04-28,2020-03-23,joint,BYD,Boyd Gaming Corporation,sale_full,"$1,001 - $15,000",Movies/Entertainment,Consumer Discretionary,HSII,susie lee,House Committee on Natural Resources,Energy and Mineral Resources,HSII06,"Manages federal lands, oversees energy and min...",Addresses legislation concerning energy and mi...
3,susie lee,2020-04-28,2020-02-04,joint,CTLT,"Catalent, Inc.",purchase,"$1,001 - $15,000",Biotechnology: Pharmaceutical Preparations,Health Care,HSII,susie lee,House Committee on Natural Resources,Energy and Mineral Resources,HSII06,"Manages federal lands, oversees energy and min...",Addresses legislation concerning energy and mi...
4,susie lee,2020-04-28,2020-03-18,joint,CBRE,CBRE Group Inc Common Stock Class A,sale_partial,"$1,001 - $15,000",Real Estate,Finance,HSII,susie lee,House Committee on Natural Resources,Energy and Mineral Resources,HSII06,"Manages federal lands, oversees energy and min...",Addresses legislation concerning energy and mi...


In [None]:
### Merge industries
industry_enriched = pd.read_csv('industry_enriched.csv')
transactions_link_committees = transactions_link_committees.merge(industry_enriched, how = 'inner', left_on = 'industry', right_on = 'Industry')
transactions_link_committees = transactions_link_committees.drop(columns = ['Industry'])
transactions_link_committees = transactions_link_committees.rename(columns = {
    list(transactions_link_committees.columns)[-1]: 'industry_description'
})
transactions_link_committees.head()

Unnamed: 0,Member,disclosure_date,transaction_date,owner,ticker,asset_description,type,amount,industry,sector,Comittee_code,Member_name,Committee,Subcommittee,Subcommittee_code,committee_description,subcommittee_description,industry_description
0,susie lee,2020-04-28,2020-02-24,joint,BHC,Bausch Health Companies Inc.,purchase,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care,HSII,susie lee,House Committee on Natural Resources,Energy and Mineral Resources,HSII06,"Manages federal lands, oversees energy and min...",Addresses legislation concerning energy and mi...,Companies involved in the production of pharma...
1,susie lee,2020-02-25,2020-01-06,joint,BHC,Bausch Health Companies Inc.,purchase,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care,HSII,susie lee,House Committee on Natural Resources,Energy and Mineral Resources,HSII06,"Manages federal lands, oversees energy and min...",Addresses legislation concerning energy and mi...,Companies involved in the production of pharma...
2,susie lee,2020-10-07,2020-08-27,joint,BHC,Bausch Health Companies Inc.,sale_full,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care,HSII,susie lee,House Committee on Natural Resources,Energy and Mineral Resources,HSII06,"Manages federal lands, oversees energy and min...",Addresses legislation concerning energy and mi...,Companies involved in the production of pharma...
3,susie lee,2020-04-28,2020-02-24,joint,BHC,Bausch Health Companies Inc.,purchase,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care,HSII,susie lee,House Committee on Natural Resources,Federal Lands,HSII10,"Manages federal lands, oversees energy and min...",Concerned with legislation and oversight relat...,Companies involved in the production of pharma...
4,susie lee,2020-02-25,2020-01-06,joint,BHC,Bausch Health Companies Inc.,purchase,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care,HSII,susie lee,House Committee on Natural Resources,Federal Lands,HSII10,"Manages federal lands, oversees energy and min...",Concerned with legislation and oversight relat...,Companies involved in the production of pharma...


# Now, correlate the Industry with the Committees

In [None]:
industry_plus_committees = correlate(transactions_link_committees.head(20), 'industry_description', 'committee_description')
industry_plus_committees

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col1] = df[col1].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col2] = df[col2].astype(str)


Unnamed: 0,index,Member,disclosure_date,transaction_date,owner,ticker,asset_description,type,amount,industry,sector,Comittee_code,Member_name,Committee,Subcommittee,Subcommittee_code,committee_description,subcommittee_description,industry_description,0
0,0,susie lee,2020-04-28,2020-02-24,joint,BHC,Bausch Health Companies Inc.,purchase,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care,HSII,susie lee,House Committee on Natural Resources,Energy and Mineral Resources,HSII06,"Manages federal lands, oversees energy and min...",Addresses legislation concerning energy and mi...,Companies involved in the production of pharma...,0.660035
1,1,susie lee,2020-02-25,2020-01-06,joint,BHC,Bausch Health Companies Inc.,purchase,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care,HSII,susie lee,House Committee on Natural Resources,Energy and Mineral Resources,HSII06,"Manages federal lands, oversees energy and min...",Addresses legislation concerning energy and mi...,Companies involved in the production of pharma...,0.660035
2,2,susie lee,2020-10-07,2020-08-27,joint,BHC,Bausch Health Companies Inc.,sale_full,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care,HSII,susie lee,House Committee on Natural Resources,Energy and Mineral Resources,HSII06,"Manages federal lands, oversees energy and min...",Addresses legislation concerning energy and mi...,Companies involved in the production of pharma...,0.660035
3,3,susie lee,2020-04-28,2020-02-24,joint,BHC,Bausch Health Companies Inc.,purchase,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care,HSII,susie lee,House Committee on Natural Resources,Federal Lands,HSII10,"Manages federal lands, oversees energy and min...",Concerned with legislation and oversight relat...,Companies involved in the production of pharma...,0.660035
4,4,susie lee,2020-02-25,2020-01-06,joint,BHC,Bausch Health Companies Inc.,purchase,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care,HSII,susie lee,House Committee on Natural Resources,Federal Lands,HSII10,"Manages federal lands, oversees energy and min...",Concerned with legislation and oversight relat...,Companies involved in the production of pharma...,0.660035
5,5,susie lee,2020-10-07,2020-08-27,joint,BHC,Bausch Health Companies Inc.,sale_full,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care,HSII,susie lee,House Committee on Natural Resources,Federal Lands,HSII10,"Manages federal lands, oversees energy and min...",Concerned with legislation and oversight relat...,Companies involved in the production of pharma...,0.660035
6,6,susie lee,2020-04-28,2020-02-24,joint,BHC,Bausch Health Companies Inc.,purchase,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care,HSII,susie lee,House Committee on Natural Resources,"Water, Wildlife and Fisheries",HSII13,"Manages federal lands, oversees energy and min...",Concerned with legislation and oversight relat...,Companies involved in the production of pharma...,0.660035
7,7,susie lee,2020-02-25,2020-01-06,joint,BHC,Bausch Health Companies Inc.,purchase,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care,HSII,susie lee,House Committee on Natural Resources,"Water, Wildlife and Fisheries",HSII13,"Manages federal lands, oversees energy and min...",Concerned with legislation and oversight relat...,Companies involved in the production of pharma...,0.660035
8,8,susie lee,2020-10-07,2020-08-27,joint,BHC,Bausch Health Companies Inc.,sale_full,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care,HSII,susie lee,House Committee on Natural Resources,"Water, Wildlife and Fisheries",HSII13,"Manages federal lands, oversees energy and min...",Concerned with legislation and oversight relat...,Companies involved in the production of pharma...,0.660035
9,9,susie lee,2020-04-28,2020-02-24,joint,BHC,Bausch Health Companies Inc.,purchase,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care,HSII,susie lee,House Committee on Natural Resources,Indian and Insular Affairs,HSII24,"Manages federal lands, oversees energy and min...",Focuses on legislative matters pertaining to i...,Companies involved in the production of pharma...,0.660035


In [None]:
industry_plus_committees = industry_plus_committees.rename(columns = {0: 'BERT Transformer Semantic Context Similarity Score',
                                                                        'asset_description': 'Asset Description',
                                                                        'disclosure_date': 'Disclosure Date',
                                                                        'transaction_date': 'Transaction Date',
                                                                        'industry': 'Industry'
                                                                        })
industry_plus_committees_sorted = industry_plus_committees[['Member',
                                                              'Asset Description',
                                                              'Transaction Date',
                                                              'Disclosure Date',
                                                              'Industry',
                                                              'Subcommittee',
                                                              'BERT Transformer Semantic Context Similarity Score']].drop_duplicates()
industry_plus_committees_sorted = industry_plus_committees_sorted.sort_values('BERT Transformer Semantic Context Similarity Score', ascending = False)
industry_plus_committees_sorted

Unnamed: 0,Member,Asset Description,Transaction Date,Disclosure Date,Industry,Subcommittee,BERT Transformer Semantic Context Similarity Score
0,susie lee,Bausch Health Companies Inc.,2020-02-24,2020-04-28,Pharmaceuticals and Biotechnology,Energy and Mineral Resources,0.660035
8,susie lee,Bausch Health Companies Inc.,2020-08-27,2020-10-07,Pharmaceuticals and Biotechnology,"Water, Wildlife and Fisheries",0.660035
14,susie lee,Bausch Health Companies Inc.,2020-08-27,2020-10-07,Pharmaceuticals and Biotechnology,Oversight and Investigations,0.660035
13,susie lee,Bausch Health Companies Inc.,2020-01-06,2020-02-25,Pharmaceuticals and Biotechnology,Oversight and Investigations,0.660035
12,susie lee,Bausch Health Companies Inc.,2020-02-24,2020-04-28,Pharmaceuticals and Biotechnology,Oversight and Investigations,0.660035
11,susie lee,Bausch Health Companies Inc.,2020-08-27,2020-10-07,Pharmaceuticals and Biotechnology,Indian and Insular Affairs,0.660035
1,susie lee,Bausch Health Companies Inc.,2020-01-06,2020-02-25,Pharmaceuticals and Biotechnology,Energy and Mineral Resources,0.660035
9,susie lee,Bausch Health Companies Inc.,2020-02-24,2020-04-28,Pharmaceuticals and Biotechnology,Indian and Insular Affairs,0.660035
10,susie lee,Bausch Health Companies Inc.,2020-01-06,2020-02-25,Pharmaceuticals and Biotechnology,Indian and Insular Affairs,0.660035
7,susie lee,Bausch Health Companies Inc.,2020-01-06,2020-02-25,Pharmaceuticals and Biotechnology,"Water, Wildlife and Fisheries",0.660035


### Now, correlate the Industry with the Sub-Committees

In [None]:
industry_plus_scommittees = correlate(transactions_link_committees.head(30), 'industry_description', 'subcommittee_description')
industry_plus_scommittees

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col1] = df[col1].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col2] = df[col2].astype(str)


Unnamed: 0,index,Member,disclosure_date,transaction_date,owner,ticker,asset_description,type,amount,industry,sector,Comittee_code,Member_name,Committee,Subcommittee,Subcommittee_code,committee_description,subcommittee_description,industry_description,0
0,0,susie lee,2020-04-28,2020-02-24,joint,BHC,Bausch Health Companies Inc.,purchase,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care,HSII,susie lee,House Committee on Natural Resources,Energy and Mineral Resources,HSII06,"Manages federal lands, oversees energy and min...",Addresses legislation concerning energy and mi...,Companies involved in the production of pharma...,0.699247
1,1,susie lee,2020-02-25,2020-01-06,joint,BHC,Bausch Health Companies Inc.,purchase,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care,HSII,susie lee,House Committee on Natural Resources,Energy and Mineral Resources,HSII06,"Manages federal lands, oversees energy and min...",Addresses legislation concerning energy and mi...,Companies involved in the production of pharma...,0.699247
2,2,susie lee,2020-10-07,2020-08-27,joint,BHC,Bausch Health Companies Inc.,sale_full,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care,HSII,susie lee,House Committee on Natural Resources,Energy and Mineral Resources,HSII06,"Manages federal lands, oversees energy and min...",Addresses legislation concerning energy and mi...,Companies involved in the production of pharma...,0.699247
3,3,susie lee,2020-04-28,2020-02-24,joint,BHC,Bausch Health Companies Inc.,purchase,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care,HSII,susie lee,House Committee on Natural Resources,Federal Lands,HSII10,"Manages federal lands, oversees energy and min...",Concerned with legislation and oversight relat...,Companies involved in the production of pharma...,0.648826
4,4,susie lee,2020-02-25,2020-01-06,joint,BHC,Bausch Health Companies Inc.,purchase,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care,HSII,susie lee,House Committee on Natural Resources,Federal Lands,HSII10,"Manages federal lands, oversees energy and min...",Concerned with legislation and oversight relat...,Companies involved in the production of pharma...,0.648826
5,5,susie lee,2020-10-07,2020-08-27,joint,BHC,Bausch Health Companies Inc.,sale_full,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care,HSII,susie lee,House Committee on Natural Resources,Federal Lands,HSII10,"Manages federal lands, oversees energy and min...",Concerned with legislation and oversight relat...,Companies involved in the production of pharma...,0.648826
6,6,susie lee,2020-04-28,2020-02-24,joint,BHC,Bausch Health Companies Inc.,purchase,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care,HSII,susie lee,House Committee on Natural Resources,"Water, Wildlife and Fisheries",HSII13,"Manages federal lands, oversees energy and min...",Concerned with legislation and oversight relat...,Companies involved in the production of pharma...,0.674148
7,7,susie lee,2020-02-25,2020-01-06,joint,BHC,Bausch Health Companies Inc.,purchase,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care,HSII,susie lee,House Committee on Natural Resources,"Water, Wildlife and Fisheries",HSII13,"Manages federal lands, oversees energy and min...",Concerned with legislation and oversight relat...,Companies involved in the production of pharma...,0.674148
8,8,susie lee,2020-10-07,2020-08-27,joint,BHC,Bausch Health Companies Inc.,sale_full,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care,HSII,susie lee,House Committee on Natural Resources,"Water, Wildlife and Fisheries",HSII13,"Manages federal lands, oversees energy and min...",Concerned with legislation and oversight relat...,Companies involved in the production of pharma...,0.674148
9,9,susie lee,2020-04-28,2020-02-24,joint,BHC,Bausch Health Companies Inc.,purchase,"$1,001 - $15,000",Pharmaceuticals and Biotechnology,Health Care,HSII,susie lee,House Committee on Natural Resources,Indian and Insular Affairs,HSII24,"Manages federal lands, oversees energy and min...",Focuses on legislative matters pertaining to i...,Companies involved in the production of pharma...,0.636959


In [None]:
industry_plus_scommittees = industry_plus_scommittees.rename(columns = {0: 'BERT Transformer Semantic Context Similarity Score',
                                                                        'asset_description': 'Asset Description',
                                                                        'disclosure_date': 'Disclosure Date',
                                                                        'transaction_date': 'Transaction Date',
                                                                        'industry': 'Industry'
                                                                        })
industry_plus_scommittees_sorted = industry_plus_scommittees[['Member',
                                                              'Asset Description',
                                                              'Transaction Date',
                                                              'Disclosure Date',
                                                              'Industry',
                                                              'Subcommittee',
                                                              'BERT Transformer Semantic Context Similarity Score']].drop_duplicates().sort_values('BERT Transformer Semantic Context Similarity Score', ascending = False)
industry_plus_scommittees_sorted

Unnamed: 0,Member,Asset Description,Transaction Date,Disclosure Date,Industry,Subcommittee,BERT Transformer Semantic Context Similarity Score
15,susie lee,Bausch Health Companies Inc.,2020-02-24,2020-04-28,Pharmaceuticals and Biotechnology,"Agriculture, Rural Development, Food and Drug ...",0.703555
17,susie lee,Bausch Health Companies Inc.,2020-08-27,2020-10-07,Pharmaceuticals and Biotechnology,"Agriculture, Rural Development, Food and Drug ...",0.703555
16,susie lee,Bausch Health Companies Inc.,2020-01-06,2020-02-25,Pharmaceuticals and Biotechnology,"Agriculture, Rural Development, Food and Drug ...",0.703555
1,susie lee,Bausch Health Companies Inc.,2020-01-06,2020-02-25,Pharmaceuticals and Biotechnology,Energy and Mineral Resources,0.699247
0,susie lee,Bausch Health Companies Inc.,2020-02-24,2020-04-28,Pharmaceuticals and Biotechnology,Energy and Mineral Resources,0.699247
2,susie lee,Bausch Health Companies Inc.,2020-08-27,2020-10-07,Pharmaceuticals and Biotechnology,Energy and Mineral Resources,0.699247
26,susie lee,Bausch Health Companies Inc.,2020-08-27,2020-10-07,Pharmaceuticals and Biotechnology,"Energy and Water Development, and Related Agen...",0.688917
25,susie lee,Bausch Health Companies Inc.,2020-01-06,2020-02-25,Pharmaceuticals and Biotechnology,"Energy and Water Development, and Related Agen...",0.688917
24,susie lee,Bausch Health Companies Inc.,2020-02-24,2020-04-28,Pharmaceuticals and Biotechnology,"Energy and Water Development, and Related Agen...",0.688917
7,susie lee,Bausch Health Companies Inc.,2020-01-06,2020-02-25,Pharmaceuticals and Biotechnology,"Water, Wildlife and Fisheries",0.674148


In [None]:
industry_plus_scommittees_sorted

Unnamed: 0,Member,Asset Description,Transaction Date,Disclosure Date,Industry,Subcommittee,BERT Transformer Semantic Context Similarity Score
17,susie lee,Bausch Health Companies Inc.,2020-08-27,2020-10-07,Pharmaceuticals and Biotechnology,"Agriculture, Rural Development, Food and Drug ...",0.703555
16,susie lee,Bausch Health Companies Inc.,2020-01-06,2020-02-25,Pharmaceuticals and Biotechnology,"Agriculture, Rural Development, Food and Drug ...",0.703555
15,susie lee,Bausch Health Companies Inc.,2020-02-24,2020-04-28,Pharmaceuticals and Biotechnology,"Agriculture, Rural Development, Food and Drug ...",0.703555
0,susie lee,Bausch Health Companies Inc.,2020-02-24,2020-04-28,Pharmaceuticals and Biotechnology,Energy and Mineral Resources,0.699247
1,susie lee,Bausch Health Companies Inc.,2020-01-06,2020-02-25,Pharmaceuticals and Biotechnology,Energy and Mineral Resources,0.699247
2,susie lee,Bausch Health Companies Inc.,2020-08-27,2020-10-07,Pharmaceuticals and Biotechnology,Energy and Mineral Resources,0.699247
6,susie lee,Bausch Health Companies Inc.,2020-02-24,2020-04-28,Pharmaceuticals and Biotechnology,"Water, Wildlife and Fisheries",0.674148
7,susie lee,Bausch Health Companies Inc.,2020-01-06,2020-02-25,Pharmaceuticals and Biotechnology,"Water, Wildlife and Fisheries",0.674148
8,susie lee,Bausch Health Companies Inc.,2020-08-27,2020-10-07,Pharmaceuticals and Biotechnology,"Water, Wildlife and Fisheries",0.674148
18,susie lee,Bausch Health Companies Inc.,2020-02-24,2020-04-28,Pharmaceuticals and Biotechnology,"Commerce, Justice, Science, and Related Agencies",0.652289


# Transactions Link to Bills

# Transactions Link to Statements

In [None]:
text_source_1 = "Text one is sad"
text_source_2 = "Text two is quite enlightened by the current circumstances"
text_source_3 = "Text three likes"
query = "Text one is happy"


# Load a pre-trained model and tokenizer
model_name = "bert-base-uncased"  # You can change the model as needed
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Tokenize the text data
input_ids_1 = tokenizer(text_source_1, return_tensors="pt")['input_ids']
input_ids_2 = tokenizer(text_source_2, return_tensors="pt")['input_ids']
input_ids_3 = tokenizer(text_source_3, return_tensors="pt")['input_ids']

# Get embeddings for the text data
with torch.no_grad():
    embeddings_1 = model(input_ids_1)['last_hidden_state'].mean(dim=1).numpy()
    embeddings_2 = model(input_ids_2)['last_hidden_state'].mean(dim=1).numpy()
    embeddings_3 = model(input_ids_3)['last_hidden_state'].mean(dim=1).numpy()

# Calculate cosine similarity for semantic similarity
similarity_1_2 = cosine_similarity(embeddings_1, embeddings_2)[0][0]
similarity_1_3 = cosine_similarity(embeddings_1, embeddings_3)[0][0]
similarity_2_3 = cosine_similarity(embeddings_2, embeddings_3)[0][0]

print(f"Semantic Similarity between Source 1 and Source 2: {similarity_1_2:.4f}")
print(f"Semantic Similarity between Source 1 and Source 3: {similarity_1_3:.4f}")
print(f"Semantic Similarity between Source 2 and Source 3: {similarity_2_3:.4f}")


Semantic Similarity between Source 1 and Source 2: 0.6033
Semantic Similarity between Source 1 and Source 3: 0.7766
Semantic Similarity between Source 2 and Source 3: 0.4765
