# Census Variables (Variable Types)

In [26]:
import pandas as pd

# Read census_variables.csv into a DataFrame and print its first five rows.
csv_path = 'census_variables.csv'
data = pd.read_csv(csv_path)
preview = data.head()
print(preview)

   Unnamed: 0 first_name  last_name birth_year  voted  num_children  \
0           0     Denise      Ratke       2005  False             0   
1           1       Hali  Cummerata       1987  False             0   
2           2    Salomon        Orn       1992   True             2   
3           3     Sarina   Schiller       1965  False             2   
4           4       Gust  Abernathy       1945  False             2   

   income_year      higher_tax marital_status  
0     92129.41        disagree         single  
1     75649.17         neutral       divorced  
2    166313.45           agree         single  
3     71704.81  strongly agree        married  
4    143316.08           agree        married  


### Checking for types

In [27]:
# Show the dtype pandas inferred for every column.
column_dtypes = data.dtypes
print(column_dtypes)

Unnamed: 0          int64
first_name         object
last_name          object
birth_year         object
voted                bool
num_children        int64
income_year       float64
higher_tax         object
marital_status     object
dtype: object


### Unique year values

In [28]:
# List the distinct birth_year values to spot non-numeric entries.
distinct_years = data['birth_year'].unique()
print(distinct_years)

['2005' '1987' '1992' '1965' '1945' '1951' '1963' '1949' '1950' '1971'
 '2007' '1944' '1995' '1973' '1946' '1954' '1994' '1989' '1947' '1993'
 '1976' '1984' 'missing' '1966' '1941' '2000' '1953' '1956' '1960' '2001'
 '1980' '1955' '1985' '1996' '1968' '1979' '2006' '1962' '1981' '1959'
 '1977' '1978' '1983' '1957' '1961' '1982' '2002' '1998' '1999' '1952'
 '1940' '1986' '1958']


### Replacing missing values for birth year

In [29]:
# Swap the 'missing' placeholder for the year 1967. The column still holds a
# mix of strings and this one integer afterwards; the dtype is converted in a
# later step.
placeholder_map = {'missing': 1967}
data['birth_year'] = data['birth_year'].replace(placeholder_map)
print(data['birth_year'].unique())

['2005' '1987' '1992' '1965' '1945' '1951' '1963' '1949' '1950' '1971'
 '2007' '1944' '1995' '1973' '1946' '1954' '1994' '1989' '1947' '1993'
 '1976' '1984' 1967 '1966' '1941' '2000' '1953' '1956' '1960' '2001'
 '1980' '1955' '1985' '1996' '1968' '1979' '2006' '1962' '1981' '1959'
 '1977' '1978' '1983' '1957' '1961' '1982' '2002' '1998' '1999' '1952'
 '1940' '1986' '1958']


### Changing the data type for year

In [30]:
# Cast birth_year to an integer dtype, then re-check every column's dtype.
birth_year_int = data['birth_year'].astype('int')
data['birth_year'] = birth_year_int
print(data.dtypes)

Unnamed: 0          int64
first_name         object
last_name          object
birth_year          int32
voted                bool
num_children        int64
income_year       float64
higher_tax         object
marital_status     object
dtype: object


### Year Average

In [31]:
# Average birth year across all rows.
mean_birth_year = data['birth_year'].mean()
print(mean_birth_year)

1973.4


### Higher_tax to Categorical type

In [32]:
# Turn higher_tax into an ordered categorical; the list below fixes the
# ordering from 'strongly disagree' (lowest) to 'strongly agree' (highest).
agreement_levels = ['strongly disagree', 'disagree', 'neutral', 'agree', 'strongly agree']
data['higher_tax'] = pd.Categorical(
    data['higher_tax'],
    categories=agreement_levels,
    ordered=True,
)
print(data['higher_tax'].unique())

['disagree', 'neutral', 'agree', 'strongly agree', 'strongly disagree']
Categories (5, object): ['strongly disagree' < 'disagree' < 'neutral' < 'agree' < 'strongly agree']


### Median of higher_tax

In [33]:
# Overwrite each label with its integer category code (its position in the
# ordered category list) so that a numeric median can be taken.
tax_codes = data['higher_tax'].cat.codes
data['higher_tax'] = tax_codes
print(data['higher_tax'])
print(data['higher_tax'].median())

0     1
1     2
2     3
3     4
4     3
     ..
95    3
96    0
97    4
98    1
99    4
Name: higher_tax, Length: 100, dtype: int8
2.0


### One-Hot Encoding the marital_status variable

In [35]:
# One-hot encode marital_status into one indicator column per status.
# dtype is pinned to uint8 so the indicators are 0/1 on every pandas version;
# pandas >= 2.0 would otherwise default to True/False booleans.
data = pd.get_dummies(data, columns=['marital_status'], dtype='uint8')
print(data.head())

   Unnamed: 0 first_name  last_name  birth_year  voted  num_children  \
0           0     Denise      Ratke        2005  False             0   
1           1       Hali  Cummerata        1987  False             0   
2           2    Salomon        Orn        1992   True             2   
3           3     Sarina   Schiller        1965  False             2   
4           4       Gust  Abernathy        1945  False             2   

   income_year  higher_tax  marital_status_divorced  marital_status_married  \
0     92129.41           1                        0                       0   
1     75649.17           2                        1                       0   
2    166313.45           3                        0                       0   
3     71704.81           4                        0                       1   
4    143316.08           3                        0                       1   

   marital_status_single  marital_status_widowed  
0                      1                 

### Creating the age_group variable

In [65]:
# Age as of 2021, stored in 'age_group' for now; the labelling step further
# down this cell overwrites these numbers with age-band names.
reference_year = 2021
data['age_group'] = reference_year - data['birth_year']

def groups(item):
    """Map an age in years to its age-band label.

    Bands: 20 and under 'too young', 21-30 'young', 31-40 'young adult',
    41-50 'adult', 51-60 'old', 61 and over 'too old'.

    Boundaries are tested with ordered comparisons instead of ``range``
    membership, so a fractional age (e.g. 30.5) is placed in the band of its
    whole-year part rather than matching no branch at all. Results for
    integer ages are unchanged.

    Returns None when ``item`` does not compare against the bounds (NaN).
    """
    # (exclusive upper bound, label), in ascending order.
    bands = (
        (21, 'too young'),
        (31, 'young'),
        (41, 'young adult'),
        (51, 'adult'),
        (61, 'old'),
    )
    for upper, label in bands:
        if item < upper:
            return label
    if item >= 61:
        return 'too old'
    # NaN compares False against every bound, so no band applies.
    return None
# Convert every numeric age into its band label, then preview the frame.
age_labels = data['age_group'].apply(groups)
data['age_group'] = age_labels
print(data.head())
        
                                                    
                        

   Unnamed: 0 first_name  last_name  birth_year  voted  num_children  \
0           0     Denise      Ratke        2005  False             0   
1           1       Hali  Cummerata        1987  False             0   
2           2    Salomon        Orn        1992   True             2   
3           3     Sarina   Schiller        1965  False             2   
4           4       Gust  Abernathy        1945  False             2   

   income_year  higher_tax  marital_status_divorced  marital_status_married  \
0     92129.41           1                        0                       0   
1     75649.17           2                        1                       0   
2    166313.45           3                        0                       0   
3     71704.81           4                        0                       1   
4    143316.08           3                        0                       1   

   marital_status_single  marital_status_widowed    age_group  
0                      1    

### Categorizing age-group

In [67]:
# Turn age_group into an ordered categorical; the list below fixes the
# ordering from 'too young' (lowest) to 'too old' (highest).
age_bands = ['too young', 'young', 'young adult', 'adult', 'old', 'too old']
data['age_group'] = pd.Categorical(
    data['age_group'],
    categories=age_bands,
    ordered=True,
)
print(data['age_group'].unique())

['too young', 'young adult', 'young', 'old', 'too old', 'adult']
Categories (6, object): ['too young' < 'young' < 'young adult' < 'adult' < 'old' < 'too old']


### One-Hot Encoding the age_group variable

In [69]:
# One-hot encode age_group into one indicator column per age band.
# dtype is pinned to uint8 so the indicators are 0/1 on every pandas version;
# pandas >= 2.0 would otherwise default to True/False booleans.
data = pd.get_dummies(data, columns=['age_group'], dtype='uint8')
print(data.head())

   Unnamed: 0 first_name  last_name  birth_year  voted  num_children  \
0           0     Denise      Ratke        2005  False             0   
1           1       Hali  Cummerata        1987  False             0   
2           2    Salomon        Orn        1992   True             2   
3           3     Sarina   Schiller        1965  False             2   
4           4       Gust  Abernathy        1945  False             2   

   income_year  higher_tax  marital_status_divorced  marital_status_married  \
0     92129.41           1                        0                       0   
1     75649.17           2                        1                       0   
2    166313.45           3                        0                       0   
3     71704.81           4                        0                       1   
4    143316.08           3                        0                       1   

   marital_status_single  marital_status_widowed  age_group_too young  \
0                  