<a href="https://colab.research.google.com/github/hanansuk/guns_n_roses/blob/main/lme_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preprocessing Input Data for Modeling
Written by Hannah George

## Imports

In [1]:
from datetime import datetime
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## Reading in the Data

In [2]:
# Load the two raw inputs: the monthly per-state gun-death counts and the
# table of state gun laws. Paths are relative to the working directory.
deaths_csv_path = 'cdc_monthly_state_gun_deaths_imputed.csv'
laws_csv_path = 'RAND.csv'

deaths = pd.read_csv(deaths_csv_path)
laws = pd.read_csv(laws_csv_path)

In [3]:
# Preview the first five rows of the deaths table.
deaths.head(n=5)

Unnamed: 0.1,Unnamed: 0,state,year,period,monthly_gun_deaths
0,0,Alabama,2021,2021-01-01,122.0
1,1,Alabama,2021,2021-02-01,104.0
2,2,Alabama,2021,2021-03-01,103.0
3,3,Alabama,2021,2021-04-01,102.0
4,4,Alabama,2021,2021-05-01,108.0


In [4]:
# Preview the first five rows of the laws table.
laws.head(n=5)

Unnamed: 0,Law.ID,State,State.Postal.Abbreviation,FIPS.Code,Law.Class..num.,Law.Class,Law.Class.Subtype,Handguns.or.Long.Guns,Effect,Type.of.Change,...,Supersession.Date,Supersession.Date.Year,Supersession.Date.Month,Supersession.Date.Day,Controlling.Law.at.Beginning.of.Period..1979.,Age.for.Minimum.Age.Laws,Length.of.Waiting.Period..days..handguns.,Additional.Context.and.Notes,Caveats.and.Ambiguities,Exception.Code
0,AK1001,Alaska,AK,2,1,background checks,private sales,handgun,,,...,,,,,1.0,,,,,
1,AK1002,Alaska,AK,2,2,carrying a concealed weapon (ccw),prohibited,handgun,Restrictive,Implement,...,1994-10-01,1994.0,10.0,1.0,1.0,,,Prior law prohibiting concealed carry enacted ...,,
2,AK1003,Alaska,AK,2,2,carrying a concealed weapon (ccw),shall issue,handgun,Permissive,Modify,...,2003-09-09,2003.0,9.0,9.0,,,,,,
3,AK1004,Alaska,AK,2,2,carrying a concealed weapon (ccw),shall issue (permit not required),handgun,Permissive,Modify,...,,,,,,,,Permitting system maintained for residents see...,,
4,AK1005,Alaska,AK,2,3,castle doctrine,,handgun and/or long gun,Permissive,Modify,...,2006-09-13,2006.0,9.0,13.0,1.0,,,See 2006 S.B. No. 200 Ch. 68.,,


## Data Preprocessing

### Filtering Laws Dataset to Reduce Volume

In [5]:
# Keep only the laws whose 'Type.of.Change' is one of the listed values.
# reset_index() (without drop=True) renumbers the rows from 0 and keeps the
# original row label in a new 'index' column.
# NOTE(review): in the laws preview, 'Permissive' appears under 'Effect',
# not under 'Type.of.Change' — confirm this value/column is the intended one.
kept_change_types = ['Permissive', 'Implement']
change_type_mask = laws['Type.of.Change'].isin(kept_change_types)
filtered_laws = laws.loc[change_type_mask].reset_index().copy()

### Converting Dates to Datetime Objects

In [6]:
# Parse the date-like columns into pandas datetimes so they can be compared
# with each other later on.
deaths['period'] = pd.to_datetime(deaths['period'])
for date_col in ('Effective.Date', 'Supersession.Date'):
    filtered_laws[date_col] = pd.to_datetime(filtered_laws[date_col])

In [7]:
# If the law has not been superseded, treat it as still in force by filling
# the missing supersession date with a far-future sentinel. An explicit
# Timestamp keeps the fill value the same type as the datetime column instead
# of relying on implicit string-to-datetime coercion inside fillna().
not_superseded_date = pd.Timestamp('2099-12-01')
filtered_laws['Supersession.Date'] = filtered_laws['Supersession.Date'].fillna(not_superseded_date)

### Creating Lagged Monthly Gun Deaths Variable

In [8]:
# Lag each state's monthly death count by one row. The rows are ordered by
# (state, period) before shifting so that "previous row" really is the
# previous period even if the CSV is not stored chronologically. The shifted
# values are aligned back to `deaths` by index on assignment, so the row order
# of `deaths` itself is unchanged (this relies on `deaths` having a unique index).
# The first period of each state has no predecessor and gets NaN.
deaths['prior_monthly_deaths'] = (
    deaths.sort_values(['state', 'period'])
    .groupby('state')['monthly_gun_deaths']
    .shift(1)
)

### Using NMF Topic Modeling to Transform Gun Laws

In [9]:
# Turn the text of each law ('Content') into a TF-IDF row vector, ignoring
# English stop words. The fitted vectorizer is kept so that feature names can
# be looked up later.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
law_texts = filtered_laws['Content'].tolist()
tfidf = tfidf_vectorizer.fit_transform(law_texts)

  idf = np.log(n_samples / df) + 1


In [10]:
# Number of NMF components (topics) to extract from the TF-IDF matrix.
n_topics = 20

# Fixed seed so the factorisation -- and therefore the topic scores written to
# CSV further down -- is reproducible across kernel restarts. scikit-learn
# documents random_state as being used for initialisation and in the
# coordinate-descent solver.
nmf_seed = 42

nmf = NMF(
    n_components=n_topics,
    init='nndsvd',
    random_state=nmf_seed,
).fit(tfidf)

In [11]:
# One score column per NMF topic: topic_0 ... topic_{n_topics - 1}.
topic_col_names = [f'topic_{i}' for i in range(n_topics)]

# Score every law against every topic and attach the scores column-wise to
# the law rows. The concat aligns on the row index, which works because
# filtered_laws was re-indexed from 0 when it was created.
topic_scores = pd.DataFrame(data=nmf.transform(tfidf), columns=topic_col_names)
gun_law_topics = pd.concat([filtered_laws, topic_scores], axis=1)

In [12]:
# Preview the first five laws together with their topic scores.
gun_law_topics.head(n=5)

Unnamed: 0,index,Law.ID,State,State.Postal.Abbreviation,FIPS.Code,Law.Class..num.,Law.Class,Law.Class.Subtype,Handguns.or.Long.Guns,Effect,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,1,AK1002,Alaska,AK,2,2,carrying a concealed weapon (ccw),prohibited,handgun,Restrictive,...,0.014774,0.0,0.017702,0.176787,0.0,0.0,0.0,0.065197,0.0,0.0
1,8,AK1009,Alaska,AK,2,7,minimum age,youth possession,long gun,Restrictive,...,0.0,0.0,0.0,0.032529,0.187036,0.0,0.0,0.053286,0.0069,0.0
2,10,AK1011,Alaska,AK,2,7,minimum age,purchase and sale,long gun,Restrictive,...,0.0,0.0,0.0,0.092099,0.0,0.0,0.0,0.143203,0.018687,0.0
3,13,AK1015,Alaska,AK,2,7,minimum age,youth possession,handgun,Restrictive,...,0.0,0.0,0.0,0.032529,0.187036,0.0,0.0,0.053286,0.0069,0.0
4,14,AK1016,Alaska,AK,2,7,minimum age,purchase and sale,handgun,Restrictive,...,0.0,0.0,0.0,0.092099,0.0,0.0,0.0,0.143203,0.018687,0.0


In [13]:
# Print the n_words highest-weighted terms of every topic, and build a
# readable label for each topic from its three strongest terms.
n_words = 8
feature_names = tfidf_vectorizer.get_feature_names_out()

topic_list = []
for topic_idx, topic in enumerate(nmf.components_):
    # argsort is ascending, so take the last n_words indices and reverse them
    # to get the strongest term first.
    top_idx = topic.argsort()[-n_words:][::-1]
    top_n = [feature_names[i] for i in top_idx]
    top_features = ' '.join(top_n)

    print(f"Topic {topic_idx}: {top_features}")
    topic_list.append(f"topic_{'_'.join(top_n[:3])}")

Topic 0: transferee statement officer law chief date enforcement transferor
Topic 1: doctrine castle century common united states law 1920s
Topic 2: age years 18 eighteen person possess 21 handgun
Topic 3: licensee licensed transferee 103 established instant national section
Topic 4: order respondent court protection petition risk extreme petitioner
Topic 5: permit sheriff applicant application issue shall license police
Topic 6: school college university public private grounds property educational
Topic 7: theft loss report firearm stolen agency police occurred
Topic 8: application applicant card authority illness time issued identification
Topic 9: pistol revolver person 29 carry license issued shall
Topic 10: mental committed mentally institution adjudicated incompetent possess person
Topic 11: firearms ammunition ordinance ownership components transportation regulation possession
Topic 12: bequest place acquire procured inheritance firearm business owner
Topic 13: weapon concealed 

In [16]:
# Build the per-law topic-score table: identifier, state and law text under
# snake_case names, followed by the topic score columns.
topic_csv_renames = {
    'Law.ID': 'law_id',
    'State': 'state',
    'Content': 'content',
}
topic_csv_columns = ['law_id', 'state', 'content'] + topic_col_names
topic_csv = gun_law_topics.rename(columns=topic_csv_renames)[topic_csv_columns]

Unnamed: 0,law_id,state,content,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,AK1002,Alaska,It is unlawful for a person to carry concealed...,0.0,0.0,0.004575,0.0,0.0,0.0,0.0,...,0.014774,0.0,0.017702,0.176787,0.0,0.0,0.0,0.065197,0.0,0.0
1,AK1009,Alaska,Alaska Stat. § 11.61.220(a)(3): A person commi...,0.0,0.0,0.033256,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.032529,0.187036,0.0,0.0,0.053286,0.0069,0.0
2,AK1011,Alaska,A person commits the crime of misconduct invol...,0.0,0.0,0.085952,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.092099,0.0,0.0,0.0,0.143203,0.018687,0.0
3,AK1015,Alaska,Alaska Stat. § 11.61.220(a)(3): A person commi...,0.0,0.0,0.033256,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.032529,0.187036,0.0,0.0,0.053286,0.0069,0.0
4,AK1016,Alaska,A person commits the crime of misconduct invol...,0.0,0.0,0.085952,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.092099,0.0,0.0,0.0,0.143203,0.018687,0.0


### Creating a Row for Each Law Active During the Time Period

In [None]:
def find_active_laws(row: pd.Series):
    """Return the laws in force in ``row.state`` during ``row.period``.

    A law counts as in force when its 'Effective.Date' is on or before the
    period and its 'Supersession.Date' is strictly after it. Laws are looked
    up in the notebook-level ``gun_law_topics`` frame.

    Parameters
    ----------
    row : pd.Series
        One row of ``deaths``; must provide ``state``, ``period``,
        ``monthly_gun_deaths`` and ``prior_monthly_deaths``.

    Returns
    -------
    pd.DataFrame or None
        The matching law rows with the four death-row values attached as
        constant columns, or ``None`` when no law matches.
    """
    state_laws = gun_law_topics.loc[gun_law_topics['State'] == row.state]
    in_force = (
        (state_laws['Effective.Date'] <= row.period)
        & (state_laws['Supersession.Date'] > row.period)
    )
    active_laws = state_laws.loc[in_force]

    if active_laws.empty:
        return None

    # assign() returns a new frame, so the laws table itself is never modified.
    return active_laws.assign(
        state=row.state,
        period=row.period,
        monthly_gun_deaths=row.monthly_gun_deaths,
        prior_monthly_deaths=row.prior_monthly_deaths,
    )

# One frame of active laws per (state, period) row of deaths, stacked together.
# pd.concat silently skips the None results from rows with no active law.
res = pd.concat([find_active_laws(death_row) for _, death_row in deaths.iterrows()])

In [None]:
# Preview the first five (law, period) rows.
res.head(n=5)

### Filtering to Necessary Columns

In [None]:
# Columns required downstream versus optional descriptive columns.
needed_columns = ['Law.ID',  'state', 'period', 'monthly_gun_deaths',
                  'prior_monthly_deaths'] + topic_col_names
maybe_columns = ['Law.Class', 'Law.Class.Subtype', 'Handguns.or.Long.Guns', 'Effect']
all_columns = needed_columns + maybe_columns

# snake_case names for the columns that still carry the source file's dotted names.
output_renames = {
    'Law.ID': 'law_id',
    'Law.Class': 'law_class',
    'Law.Class.Subtype': 'law_class_subtype',
    'Handguns.or.Long.Guns': 'gun_type',
    'Effect': 'effect',
}

# NOTE(review): dropna() looks at every selected column, so a row is removed
# when any of the optional maybe_columns is missing as well (not only when a
# needed column is) — confirm that this is intended.
output = res[all_columns].dropna().rename(columns=output_renames)

In [None]:
# Preview the first five rows of the model-ready table.
output.head(n=5)

## Saving Results to CSV

In [None]:
# Write the model-ready table. The row index is written as the first column
# (pandas' default, spelled out here).
# NOTE(review): the topic-score export below passes index=False; confirm
# whether the index column is actually wanted in this file.
output.to_csv('model_ready.csv', index=True)

In [18]:
# Write the per-law topic scores without the row index.
topic_csv.to_csv(path_or_buf='gun_law_topic_scores.csv', index=False)