In [None]:
import pandas as pd
import numpy as np
from scipy.stats import norm
from google.colab import drive
import re
# Mount Google Drive at /content/drive; the read_csv / to_csv paths below all live under this mount point
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load both raw inputs from the mounted Drive folder:
#   df     -> 2025 communications export
#   df_119 -> 119th Congress bioguide id listing
df, df_119 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/My Drive/Colab Notebooks/app_data_communications_2025.csv',
        '/content/drive/My Drive/Colab Notebooks/119th_congress_bioguideids.csv',
    )
)

In [None]:
#helper for clean_119: reorder a "Last, First Middle" string into "First Last"
def _first_last(name):
  """Return ``"First Last"`` for a ``"Last, First ..."`` string.

  Only the first whitespace-separated token after the first comma is kept as
  the given name. Non-string values (e.g. NaN) are returned unchanged and a
  string without a comma is returned stripped, so one malformed row does not
  abort the whole cleaning step.
  """
  if not isinstance(name, str):
    return name

  last, comma, rest = name.partition(',')
  if not comma:
    return name.strip()

  #text between the first and second comma holds the given name(s)
  given = rest.split(',')[0].split()[:1]
  return ' '.join(given + [last.strip()])


#function to clean 119_congress_bioguideids file
def clean_119(df):
  """Build a bioguide_id / full_name lookup from the raw bioguide listing.

  The raw frame carries its real header in the first data row; the column
  whose header is NaN holds the bioguide ids and the ``Name`` column holds
  ``"Last, First ... - <suffix>"`` strings.

  Parameters
  ----------
  df : pandas.DataFrame
      Raw frame as read from the CSV. It is not modified, so the function can
      be re-run on the same input and give the same result.

  Returns
  -------
  pandas.DataFrame
      Columns ``bioguide_id`` and ``full_name``, one row per input data row.
  """
  #make first row the column names (on a new frame, not on the caller's df)
  header = df.iloc[0]
  members = df.iloc[1:].reset_index(drop=True)

  #rename NaN column
  members.columns = ['bioguide_id' if pd.isna(col) else col for col in header]

  #create full_name column from Name column (text before ' - ')
  name_only = members['Name'].str.split(' - ').str[0]
  members['full_name'] = name_only.apply(_first_last)

  #only include bioguide_id and full_name columns
  return members[['bioguide_id', 'full_name']]

# Build the bioguide_id / full_name lookup from the raw 119th Congress file
df_bioguide = clean_119(df_119)
# Bare expression so the notebook renders the resulting frame
df_bioguide

Unnamed: 0,bioguide_id,full_name
0,A000370,Alma Adams
1,A000055,Robert Aderholt
2,A000371,Pete Aguilar
3,A000379,Mark Alford
4,A000372,Rick Allen
...,...,...
533,W000800,Peter Welch
534,W000802,Sheldon Whitehouse
535,W000437,Roger Wicker
536,W000779,Ron Wyden


In [None]:
#function to clean app_communications_2025 file
def clean_communications_app(df):
  """Aggregate per-communication rows into one summary row per member.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain ``bioguide_id``, ``first_name``, ``last_name`` and the
      numeric columns ``attack_personal``, ``outcome_bipartisanship`` and
      ``policy``. The input frame is not modified.

  Returns
  -------
  pandas.DataFrame
      One row per (bioguide_id, full_name, communication_count) with the
      summed flag columns and a ``<flag>_pct`` column for each flag
      (sum / communication_count * 100).
  """
  flag_cols = ['attack_personal', 'outcome_bipartisanship', 'policy']

  #filter only necessary columns; .copy() so the assignments below write to
  #an independent frame instead of a slice of the caller's data
  comms = df[['bioguide_id', *flag_cols, 'first_name', 'last_name']].copy()

  #count rows per bioguide_id to get total communication count
  comms['communication_count'] = comms.groupby('bioguide_id')['bioguide_id'].transform('count')

  #create full name column by combining first_name and last_name
  comms['full_name'] = comms['first_name'] + ' ' + comms['last_name']

  #sum attack_personal, outcome_bipartisanship and policy by bioguide_id;
  #the columns are selected explicitly rather than relying on numeric_only
  summary = comms.groupby(
      ['bioguide_id', 'full_name', 'communication_count'], as_index=False
  )[flag_cols].sum()

  #create pct columns
  for col in flag_cols:
    summary[f'{col}_pct'] = summary[col] / summary['communication_count'] * 100

  return summary

#aggregate communications per member, then order rows by bioguide_id with a fresh 0..n-1 index
df_comm_app_2025 = (
    clean_communications_app(df)
    .sort_values(by='bioguide_id', ascending=True)
    .reset_index(drop=True)
)
df_comm_app_2025

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['communication_count']= df.groupby('bioguide_id')['bioguide_id'].transform('count')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['full_name']= df['first_name'] + ' ' + df['last_name']


Unnamed: 0,bioguide_id,full_name,communication_count,attack_personal,outcome_bipartisanship,policy,attack_personal_pct,outcome_bipartisanship_pct,policy_pct
0,A000055,Robert Aderholt,71,1,2,37,1.408451,2.816901,52.112676
1,A000148,Jake Auchincloss,473,26,57,337,5.496829,12.050740,71.247357
2,A000369,Mark Amodei,111,1,14,89,0.900901,12.612613,80.180180
3,A000370,Alma Adams,38,1,1,21,2.631579,2.631579,55.263158
4,A000371,Pete Aguilar,38,0,1,23,0.000000,2.631579,60.526316
...,...,...,...,...,...,...,...,...,...
486,W000825,Jennifer Wexton,3,0,0,1,0.000000,0.000000,33.333333
487,W000828,Brandon Williams,1,0,0,0,0.000000,0.000000,0.000000
488,Y000064,Todd Young,374,3,61,261,0.802139,16.310160,69.786096
489,Y000067,Rudy Yakym,339,8,11,171,2.359882,3.244838,50.442478


In [None]:
#write the per-member summary to Drive (row index omitted from the file)
df_comm_app_2025.to_csv(
    '/content/drive/My Drive/Colab Notebooks/app_communications_2025_jc.csv',
    index=False,
)

In [None]:
#merge df_bioguide and df_comm_app_2025
#Join on bioguide_id only. The two full_name columns are derived from different
#sources, so any spelling or nickname difference would make the left join miss
#that member and leave the metrics as NaN. The name from df_bioguide is kept.
source_F1 = (
    pd.merge(
        df_bioguide,
        df_comm_app_2025.drop(columns='full_name'),
        on='bioguide_id',
        how='left',
    )
    .sort_values(by='bioguide_id', ascending=True)
    .reset_index(drop=True)
)
source_F1

Unnamed: 0,bioguide_id,full_name,communication_count,attack_personal,outcome_bipartisanship,attack_personal_pct,outcome_bipartisanship_pct
0,A000055,Robert Aderholt,71.0,1.0,2.0,1.408451,2.816901
1,A000148,Jake Auchincloss,473.0,26.0,57.0,5.496829,12.050740
2,A000369,Mark Amodei,111.0,1.0,14.0,0.900901,12.612613
3,A000370,Alma Adams,38.0,1.0,1.0,2.631579,2.631579
4,A000371,Pete Aguilar,38.0,0.0,1.0,0.000000,2.631579
...,...,...,...,...,...,...,...
533,W000829,Tony Wied,,,,,
534,W000830,George Whitesides,,,,,
535,Y000064,Todd Young,374.0,3.0,61.0,0.802139,16.310160
536,Y000067,Rudy Yakym,339.0,8.0,11.0,2.359882,3.244838


In [None]:
#normalization function
def normalize(df, col_name, norm_column_name=None):
    """Add a 0-100 score for ``col_name`` based on a normal CDF.

    Steps: drop repeated ``bioguide_id`` rows (first occurrence kept), fill
    missing values in ``col_name`` with the column mean, then evaluate
    ``norm.cdf`` at each value using the column's mean and standard deviation
    and multiply by 100.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``bioguide_id`` and ``col_name``. Not modified; a new
        frame is returned.
    col_name : str
        Column to impute and score.
    norm_column_name : str, optional
        Name of the score column. Defaults to ``f'norm_{col_name}'``.

    Returns
    -------
    pandas.DataFrame
        De-duplicated copy of ``df`` with ``col_name`` imputed and the score
        column added.
    """
    if norm_column_name is None:
        norm_column_name = f'norm_{col_name}'

    # Remove duplicates based on 'bioguide_id' BEFORE computing statistics so a
    # repeated member cannot weight the mean/std; the result is a new frame, so
    # the caller's data is left untouched.
    result = df.drop_duplicates(subset='bioguide_id').copy()

    # Fill missing values with the column mean (rounded to 2 decimals)
    fill_val = round(result[col_name].mean(), 2)
    result[col_name] = result[col_name].fillna(fill_val)

    # Calculate mean and std for normalization (rounded to 2 decimals)
    mean_val = round(result[col_name].mean(), 2)
    std_val = round(result[col_name].std(), 2)

    if std_val > 0:
        # Normalize using CDF
        result[norm_column_name] = norm.cdf(result[col_name], mean_val, std_val) * 100
    else:
        # std is 0 or NaN (constant column, single row, or no data): norm.cdf
        # would return NaN for every row. With no spread every known value sits
        # at the mean, so score it 50; values that are still missing stay NaN.
        values = result[col_name]
        result[norm_column_name] = values.where(values.isna(), 50.0)

    return result

# (source column, score column) pairs; every score column is '<source>_norm'
# NOTE(review): policy / policy_pct are not in this list - confirm that is intended
cols_to_normalize = [
    (source_col, f'{source_col}_norm')
    for source_col in (
        'attack_personal',
        'outcome_bipartisanship',
        'attack_personal_pct',
        'outcome_bipartisanship_pct',
    )
]

# Start from a copy so source_F1 keeps its original values
df_comm_app_norm = source_F1.copy()

# Apply normalize once per pair, feeding each result into the next call
for col, norm_col in cols_to_normalize:
    df_comm_app_norm = normalize(df_comm_app_norm, col, norm_col)

df_comm_app_norm

Unnamed: 0,bioguide_id,full_name,communication_count,attack_personal,outcome_bipartisanship,attack_personal_pct,outcome_bipartisanship_pct,attack_personal_norm,outcome_bipartisanship_norm,attack_personal_pct_norm,outcome_bipartisanship_pct_norm
0,A000055,Robert Aderholt,71.0,1.00,2.00,1.408451,2.816901,29.884105,16.216277,27.970282,16.257793
1,A000148,Jake Auchincloss,473.0,26.00,57.00,5.496829,12.050740,65.424972,95.941353,67.317384,89.172514
2,A000369,Mark Amodei,111.0,1.00,14.00,0.900901,12.612613,29.884105,34.824075,23.826530,91.478452
3,A000370,Alma Adams,38.0,1.00,1.00,2.631579,2.631579,29.884105,15.027942,39.171428,15.186520
4,A000371,Pete Aguilar,38.0,0.00,1.00,0.000000,2.631579,28.613248,15.027942,17.376426,15.186520
...,...,...,...,...,...,...,...,...,...,...,...
533,W000829,Tony Wied,,15.27,21.86,3.720000,6.910000,50.000000,50.000000,50.000000,50.000000
534,W000830,George Whitesides,,15.27,21.86,3.720000,6.910000,50.000000,50.000000,50.000000,50.000000
535,Y000064,Todd Young,374.0,3.00,61.00,0.802139,16.310160,32.499640,97.395757,23.061178,98.807863
536,Y000067,Rudy Yakym,339.0,8.00,11.00,2.359882,3.244838,39.401835,29.495798,36.562466,18.914575


In [None]:
# Write the normalized per-member table to Drive (row index omitted from the file)
df_comm_app_norm.to_csv(
    '/content/drive/My Drive/Bridge Pledge/csv_files_outputs/app_communications.csv',
    index=False,
)