In [141]:
import json
import os
from typing import List

import pandas as pd
from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker

try:
    # pandas >= 1.0 exposes json_normalize at the top level
    from pandas import json_normalize
except ImportError:
    # historical location used by older pandas releases
    from pandas.io.json import json_normalize

# Database wiring: a SQLite engine plus an ORM session bound to it
engine = create_engine("sqlite:///classification_unfiltered.db")
session = sessionmaker(bind=engine)
s = session()

# Directory that is scanned for the per-file tweet JSON dumps
dirname = "../congresstweets/data"

# First create tweets table of unique tweets

In [74]:
# Create the destination table once; the PRIMARY KEY on id is what lets the
# later INSERT OR IGNORE de-duplicate tweets across files.
# Engine.execute() with a raw string no longer exists in SQLAlchemy 2.0, so run
# the statement on an explicit connection; engine.begin() commits on exit.
with engine.begin() as conn:
    conn.execute(
        text(
            "CREATE TABLE IF NOT EXISTS tweets(id TEXT PRIMARY KEY, text TEXT, screen_name TEXT, user_id TEXT)"
        )
    )

<sqlalchemy.engine.result.ResultProxy at 0x106896490>

In [75]:
# tweets table
# Columns copied from every dump, listed in the same order as the tweets schema.
tweet_columns = ["id", "text", "screen_name", "user_id"]
column_sql = ", ".join(tweet_columns)

# Name the columns explicitly instead of relying on SELECT * column order.
insert_into_sql = text(
    f"INSERT OR IGNORE INTO tweets ({column_sql}) "
    f"SELECT {column_sql} FROM temp_table"
)

# os.listdir() returns files in arbitrary order; sorting makes the load order
# (and therefore which duplicate id wins under INSERT OR IGNORE) reproducible.
json_files = sorted(f for f in os.listdir(dirname) if f.endswith(".json"))

for fname in json_files:
    temp_df = (
        pd.read_json(os.path.join(dirname, fname))
        .dropna()  # NOTE(review): drops rows with a null in ANY column, not just the four kept below
        .replace("\n", " ", regex=True)
    )

    # A dump with no usable rows has nothing to stage (and may lack the columns).
    if temp_df.empty:
        continue

    # Stage the current file in a scratch table that is replaced on every pass.
    temp_df[tweet_columns].to_sql(
        "temp_table",
        con=engine,
        index=False,
        if_exists="replace",
        chunksize=1000,
    )

    # engine.begin() opens a transaction and commits it on exit, so no separate
    # session commit is needed for these inserts.
    with engine.begin() as conn:
        conn.execute(insert_into_sql)

# now get the screen names from the tweets table and figure out who they belong to


In [142]:
# Tweet count per screen name.
# - ORDER BY: GROUP BY alone does not guarantee row order, and later cells
#   assume the names come back sorted.
# - fetchall(): materialise the rows so `result` can be iterated more than once
#   (a live cursor is exhausted after a single pass).
with engine.connect() as conn:
    result = conn.execute(
        text(
            "SELECT screen_name, COUNT(*) FROM tweets "
            "GROUP BY screen_name ORDER BY screen_name"
        )
    ).fetchall()

In [143]:
# Consume the query result exactly once, then split it into parallel lists:
# position i of `counts` is the tweet count for position i of `screen_names`.
rows = list(result)
screen_names = [row[0] for row in rows]
counts = [row[1] for row in rows]

In [144]:
len(screen_names)

1431

In [145]:
screen_names[:10]

['ABrindisiNY',
 'AOC',
 'ASEANCaucus',
 'Abby4Iowa',
 'AdamKinzinger',
 'AdamSchiff',
 'AdamSchiffCA',
 'AlLawsonJr',
 'AlbioSiresNJ',
 'AlmaforCongress']

# Load the metadata

In [79]:
def extract_legis_metadata(fn: str) -> pd.DataFrame:
    """Load a legislators JSON file and keep one metadata row per legislator.

    The file is expected to hold a list of legislator records, each with a
    nested ``id.bioguide`` value and a ``terms`` list of dicts. Only the LAST
    entry of ``terms`` (treated as the most recent term) is used.

    Parameters
    ----------
    fn : str
        Path to the JSON file.

    Returns
    -------
    pd.DataFrame
        Columns ``id_bioguide``, ``type``, ``state``, ``party`` with a default
        integer index, one row per record in the file. A field that is absent
        from a record (or a record with no terms at all) is filled with NaN
        instead of raising.
    """
    # JSON is UTF-8; do not depend on the platform's default encoding.
    with open(fn, "r", encoding="utf-8") as f:
        legislators = json.load(f)

    missing = float("nan")
    term_fields = ("type", "state", "party")

    records = []
    for legislator in legislators:
        # An empty/missing terms list falls back to an empty term -> all NaN.
        latest_term = (legislator.get("terms") or [{}])[-1]
        record = {
            "id_bioguide": (legislator.get("id") or {}).get("bioguide", missing)
        }
        for field in term_fields:
            record[field] = latest_term.get(field, missing)
        records.append(record)

    # Passing columns= keeps the schema stable even when the file is empty.
    return pd.DataFrame(records, columns=["id_bioguide", *term_fields])

In [80]:
# Social-media handle -> legislator mapping
with open("legislators-social-media.json", "r") as f:
    sm_data = json.load(f)

# flattened JSON column -> short column name used in the rest of the notebook
sm_column_names = {
    "id.bioguide": "id_bioguide",
    "social.twitter_id": "twitter_id",
    "social.twitter": "twitter_screenname",
}

legislator_sm_df = (
    json_normalize(sm_data)[list(sm_column_names)]
    .rename(columns=sm_column_names)
    .assign(
        # ids are kept as strings because the numbers are too large
        twitter_id=lambda frame: frame["twitter_id"].astype(str),
        # handles are lowercased so they can be matched case-insensitively
        twitter_screenname=lambda frame: frame["twitter_screenname"].str.lower(),
    )
)

In [81]:
legislator_sm_df.head()

Unnamed: 0,id_bioguide,twitter_id,twitter_screenname
0,R000600,3026622545,repamata
1,Y000064,234128524,sentoddyoung
2,E000295,2856787757,senjoniernst
3,T000476,2964174789,senthomtillis
4,Y000062,384913290,repjohnyarmuth


In [82]:
legislator_sm_df.shape

(532, 3)

In [83]:
legislator_sm_df[legislator_sm_df.isnull().any(axis=1)]

Unnamed: 0,id_bioguide,twitter_id,twitter_screenname
209,C001049,,
239,A000367,,
310,K000384,,
412,C001108,,
431,G000584,,


In [84]:
# Legislator -> chamber/state/party mapping, for sitting and former members
current_legis, historical_legis = (
    extract_legis_metadata(path)
    for path in ("legislators-current.json", "legislators-historical.json")
)

In [85]:
current_legis.tail()

Unnamed: 0,id_bioguide,type,state,party
530,G000592,rep,ME,Democrat
531,K000395,rep,PA,Republican
532,B001311,rep,NC,Republican
533,M001210,rep,NC,Republican
534,L000594,sen,GA,Republican


In [86]:
historical_legis.tail()

Unnamed: 0,id_bioguide,type,state,party
11977,C000984,rep,MD,Democrat
11978,H001087,rep,CA,Democrat
11979,I000055,sen,GA,Republican
11980,H001048,rep,CA,Republican
11981,M001187,rep,NC,Republican


In [87]:
current_legis[current_legis.isnull().any(axis=1)] # no nulls here

Unnamed: 0,id_bioguide,type,state,party


In [88]:
historical_legis[historical_legis.isnull().any(axis=1)].shape

(234, 4)

In [89]:
# Do any bioguide ids appear in both the current and the historical table?
current_ids = set(current_legis["id_bioguide"].unique().tolist())
historical_ids = set(historical_legis["id_bioguide"].unique().tolist())

current_ids & historical_ids

set()

In [90]:
# Stack current and historical legislators into a single metadata table
legislator_frames = [current_legis, historical_legis]
all_legislators_metadata_df = pd.concat(legislator_frames)

In [91]:
# Attach type/state/party to every social-media row, keyed on the bioguide id.
# A left join keeps all social-media rows even when no metadata matches.
combined_metadata = pd.merge(
    legislator_sm_df,
    all_legislators_metadata_df,
    on="id_bioguide",
    how="left",
)


In [92]:
combined_metadata.shape

(532, 6)

In [93]:
combined_metadata.tail()

Unnamed: 0,id_bioguide,twitter_id,twitter_screenname,type,state,party
527,H001089,1080960924687704064,senhawleypress,sen,MO,Republican
528,V000133,1083469084648505344,congressmanjvd,rep,NJ,Republican
529,K000395,1136060761422405633,repfredkeller,rep,PA,Republican
530,S001204,346509049,guamcongressman,rep,GU,Democrat
531,L000594,1200451909406121984,senatorloeffler,sen,GA,Republican


In [94]:
# where do we have missing data?

combined_metadata[combined_metadata.isnull().any(axis=1)]

Unnamed: 0,id_bioguide,twitter_id,twitter_screenname,type,state,party
209,C001049,,,rep,MO,Democrat
239,A000367,,,rep,MI,Independent
310,K000384,,,sen,VA,Democrat
412,C001108,,,rep,KY,Republican
431,G000584,,,rep,MT,Republican


In [95]:
combined_metadata[~combined_metadata.isnull().any(axis=1)].shape

(527, 6)

In [96]:
print(f"combined_metadata before dropping NAs {combined_metadata.shape}")

# Keep only rows where every metadata field is present
combined_metadata = combined_metadata.dropna()

print(f"combined_metadata after dropping NAs {combined_metadata.shape}")


combined_metadata before dropping NAs (532, 6)
combined_metadata after dropping NAs (527, 6)


# Match this metadata with the list of screen names from the tweets table

In [146]:
# Sort names and counts TOGETHER: sorting only the names would attach counts to
# the wrong handle whenever the query order differs from Python's sort order.
author_counts = pd.DataFrame(
    sorted(zip(screen_names, counts)), columns=["screen_name", "count"]
)

# Lowercase for matching against the (lowercased) legislator handles.
author_counts["screen_name"] = author_counts["screen_name"].str.lower()

# The tweets were grouped case-sensitively, so one account can appear under
# several casings; fold those into a single row so the merge with the metadata
# does not produce duplicate matches. sort=False keeps the existing row order.
tweet_authors = author_counts.groupby(
    "screen_name", as_index=False, sort=False
)["count"].sum()

In [153]:
# Left-join so every tweet author is kept; authors without a matching
# legislator handle end up with null metadata columns.
df = pd.merge(
    tweet_authors,
    combined_metadata,
    left_on="screen_name",
    right_on="twitter_screenname",
    how="left",
)

In [154]:
# Write out every author that did not pick up a party from the metadata join
party_missing = df["party"].isna()
df.loc[party_missing].to_csv("missing_metadata.csv")

In [155]:
# Shape of the rows that did pick up a party from the metadata join
df[df["party"].notna()].shape

(529, 8)

# then...we filled in the missing metadata manually