# Clean the data into a usable format
There are presently a few issues with the data. First, names are of little use in the analysis we plan to do; therefore, all information besides the number of Republicans and Democrats in each chamber will be removed. Once the names are removed, I will tally the number of members of each party in each chamber where necessary and separate the columns into usable formats.

In [1]:
import pandas as pd
import numpy as np
import re, os

## directory that holds the input and output CSV files; set the
## POLICY_DIFFUSION_DATA environment variable to point somewhere else
data_path = os.environ.get(
    "POLICY_DIFFUSION_DATA",
    "C:/Users/SpiffyApple/Documents/USC/RaphaelBostic/policy_diffusion",
)

In [99]:
## read the raw party-strength table (decoded as ISO-8859-1) and preview it
df = pd.read_csv(data_path + "/states_party_strength.csv", encoding="ISO-8859-1")
df.head()

Unnamed: 0.1,Unnamed: 0,year,congress house,congress sen class 1,congress sen class 2,congress sen class 3,electoral,governor,legislature house,legislature sen
0,Alabama,1980,"4D, 3R",,Howell Heflin (D),Donald W. Stewart (D),Ronald Reagan and George H. W. Bush (R) Y,Fob James (D),"101D, 4R",35D
1,Alabama,1981,"4D, 3R",,Howell Heflin (D),Jeremiah Denton (R),Ronald Reagan and George H. W. Bush (R) Y,Fob James (D),"101D, 4R",35D
2,Alabama,1982,"4D, 3R",,Howell Heflin (D),Jeremiah Denton (R),Ronald Reagan and George H. W. Bush (R) Y,Fob James (D),"101D, 4R",35D
3,Alabama,1983,"5D, 2R",,Howell Heflin (D),Jeremiah Denton (R),Ronald Reagan and George H. W. Bush (R) Y,George Wallace (D),"97D, 8R","32D, 3R"
4,Alabama,1984,"5D, 2R",,Howell Heflin (D),Jeremiah Denton (R),Ronald Reagan and George H. W. Bush (R) Y,George Wallace (D),"87D, 18R","29D, 3R, 3I"


In [100]:
## the state names were read in under the header "Unnamed: 0"; give that column a real name
df.rename({"Unnamed: 0": "state"}, axis="columns", inplace=True)

In [101]:
## index the rows by (lower-cased state name, year), sort on that pair, and
## drop the two source columns, which the index now duplicates
mlt_idx = pd.MultiIndex.from_arrays([df['state'].str.lower(), df['year']])
df.index = mlt_idx
df.sort_index(inplace=True)
df.drop(columns=['state', 'year'], inplace=True)

In [102]:
## preview the first rows of the frame
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,congress house,congress sen class 1,congress sen class 2,congress sen class 3,electoral,governor,legislature house,legislature sen
state,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
alabama,1980,"4D, 3R",,Howell Heflin (D),Donald W. Stewart (D),Ronald Reagan and George H. W. Bush (R) Y,Fob James (D),"101D, 4R",35D
alabama,1981,"4D, 3R",,Howell Heflin (D),Jeremiah Denton (R),Ronald Reagan and George H. W. Bush (R) Y,Fob James (D),"101D, 4R",35D
alabama,1982,"4D, 3R",,Howell Heflin (D),Jeremiah Denton (R),Ronald Reagan and George H. W. Bush (R) Y,Fob James (D),"101D, 4R",35D
alabama,1983,"5D, 2R",,Howell Heflin (D),Jeremiah Denton (R),Ronald Reagan and George H. W. Bush (R) Y,George Wallace (D),"97D, 8R","32D, 3R"
alabama,1984,"5D, 2R",,Howell Heflin (D),Jeremiah Denton (R),Ronald Reagan and George H. W. Bush (R) Y,George Wallace (D),"87D, 18R","29D, 3R, 3I"


In [113]:
#functions to count the number of parties.
def party_counter(s, party='D'):
    """Count how often one party label occurs in a string.

    Parameters
    ----------
    s : object
        Cell value. Strings are counted; any other value (e.g. a missing
        value) is returned unchanged.
    party : str
        Label to count. It is matched literally, not as a regular expression.

    Returns
    -------
    For a string, the count followed by the label (e.g. ``"2D"``);
    otherwise ``s`` itself.
    """
    if not isinstance(s, str):
        return s
    # str.count counts literal, non-overlapping occurrences, so labels that
    # happen to be regex metacharacters are not misinterpreted.
    return str(s.strip().count(party)) + party
    
def party_finder(s):
    """Return the distinct word characters that occur in a string.

    Every letter, digit or underscore in ``s`` is treated as a candidate
    party label, so the input should already be reduced to party codes.

    Parameters
    ----------
    s : object
        Cell value. Non-string values are returned unchanged.

    Returns
    -------
    A ``set`` of single-character strings for string input; otherwise ``s``.
    """
    if not isinstance(s, str):
        return s
    return set(re.findall(r"\w", s))

def remove_citations(s):
    """Strip bracketed numeric citation markers such as ``[12]`` from a string.

    Parameters
    ----------
    s : object
        Cell value. Non-string values are returned unchanged.

    Returns
    -------
    The string with every ``[<digits>]`` marker removed, or ``s`` itself
    when it is not a string.
    """
    if not isinstance(s, str):
        return s
    return re.sub(r"\[\d+\]", "", s)
    
def name_remover(s):
    """Find and eliminate names, keeping only single-letter party codes.

    Citation markers are removed first. If the remaining text contains a run
    of two or more word characters and at least one parenthesised
    single-character code such as ``(D)``, the codes are returned joined by
    commas. In every other case the citation-free text is returned as is.

    Parameters
    ----------
    s : object
        Cell value. Non-string values are returned unchanged.

    Returns
    -------
    A comma-separated string of party codes, the citation-free input, or
    ``s`` itself when it is not a string.
    """
    if not isinstance(s, str):
        return s
    s = remove_citations(s)
    # text without any run of 2+ word characters is left alone
    if not re.search(r"\w{2,}", s):
        return s
    codes = re.findall(r"\((\w)\)", s)
    return ",".join(codes) if codes else s
    
def parties_counter(s):
    """Tally each party label found in a string.

    Combines the helper functions: citations are removed, names are reduced
    to party codes, the distinct labels are determined, and each is counted.
    Strings that already contain a digit are treated as already counted and
    are returned with citations removed only.

    NOTE: a name that carries no parenthesised party code is not removed, so
    every distinct character of it ends up being counted as a party.

    Parameters
    ----------
    s : object
        Cell value. Non-string values are returned unchanged.

    Returns
    -------
    A comma-separated string of ``<count><label>`` items ordered by label
    (e.g. ``"1D,1R"``), the citation-free input when it already holds
    counts, or ``s`` itself when it is not a string.
    """
    if not isinstance(s, str):
        return s
    s = remove_citations(s)
    if re.search(r"\d", s):  # already counted, e.g. "4D, 3R"
        return s
    s = name_remover(s)
    # Sets iterate in an arbitrary order, so sort the labels to make the
    # result reproducible from one run to the next.
    return ",".join(party_counter(s, label) for label in sorted(party_finder(s)))

def party_count_parser(s, party='D'):
    """Placeholder for turning count strings like ``"10D,5R"`` into numbers.

    The intent is to make the data numerically usable (``10`` or ``5`` rather
    than ``"10D,5R"``). For now nothing is parsed: ``s`` is handed back
    unchanged and ``party`` is ignored.
    """
    return s

In [104]:
## run every "congress house" cell through parties_counter so the column holds counts only
df['congress house'] = df['congress house'].map(parties_counter)

In [117]:
## strip names from every column whose label contains "electoral", "sen"
## or "governor" by applying name_remover cell by cell
for col in df.columns:
    if re.search("electoral|sen|governor", col):
        df.loc[:, col] = df.loc[:, col].apply(name_remover)

In [118]:
## merge the three senate-class columns into one comma-separated string per
## row (missing entries become empty strings), then tally the parties in it
df['congress sen'] = (
    df[['congress sen class 1', 'congress sen class 2', 'congress sen class 3']]
    .fillna('')
    .apply(",".join, axis=1)
    .apply(parties_counter)
)

In [119]:
## final clean-up step: drop citation markers from the "legislature house" column
df['legislature house'] = df['legislature house'].map(remove_citations)

In [120]:
## preview the first rows of the frame
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,congress house,congress sen class 1,congress sen class 2,congress sen class 3,electoral,governor,legislature house,legislature sen,congress sen
state,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
alabama,1980,"4D, 3R",,D,D,R,D,"101D, 4R",35D,2D
alabama,1981,"4D, 3R",,D,R,R,D,"101D, 4R",35D,"1D,1R"
alabama,1982,"4D, 3R",,D,R,R,D,"101D, 4R",35D,"1D,1R"
alabama,1983,"5D, 2R",,D,R,R,D,"97D, 8R","32D, 3R","1D,1R"
alabama,1984,"5D, 2R",,D,R,R,D,"87D, 18R","29D, 3R, 3I","1D,1R"


In [121]:
## write the cleaned table to the data directory as UTF-8
df.to_csv("/".join([data_path, "states_party_strength_cleaned.csv"]), encoding="utf-8")