# Predicting Income with Random Forests

![title](image/sharon-mccutcheon-665638-unsplash.jpg)

Photo by Sharon McCutcheon

## UCI Data Set

In [1]:
# Data source: UCI Machine Learning Repository, Census Income data set
# https://archive.ics.uci.edu/ml/datasets/census+income

## Import Libraries

In [2]:
import pandas as pd

## Define Column Names and Load the Data Set

In [10]:
# Column names for the census file, listed in the order the fields appear in
# each row. The final entry, 'income', is the target the model will predict.
index = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income',
]

In [27]:
# Load the raw census file; column names are supplied via `index`.
# Fields are separated by a comma followed by a space. A single-character
# separator combined with skipinitialspace=True strips that leading space
# while staying on pandas' fast C parser. (A multi-character delimiter such
# as ", " is interpreted as a regex, which forces the slower Python engine
# and emits a ParserWarning.)
df_adult_data = pd.read_csv("adult_data.csv", sep=",", names=index, skipinitialspace=True)

  """Entry point for launching an IPython kernel.


## Investigate the Data - Exploratory Data Analysis

In [37]:
# Summarize the frame: row count, per-column non-null counts, dtypes and memory usage.
df_adult_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education-num     32561 non-null int64
marital-status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital-gain      32561 non-null int64
capital-loss      32561 non-null int64
hours-per-week    32561 non-null int64
native-country    32561 non-null object
income            32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [5]:
# Preview the first rows to sanity-check that the columns were parsed as expected.
df_adult_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [38]:
# Show the first record vertically so every column/value pair is visible.
print(df_adult_data.iloc[0])

age                          39
workclass             State-gov
fnlwgt                    77516
education             Bachelors
education-num                13
marital-status    Never-married
occupation         Adm-clerical
relationship      Not-in-family
race                      White
sex                        Male
capital-gain               2174
capital-loss                  0
hours-per-week               40
native-country    United-States
income                    <=50K
Name: 0, dtype: object


## Changing Column Types

In [46]:
# Add a "sex-int" column: 0 for "Male", 1 for any other value (i.e. "Female").
# A vectorized comparison replaces the per-element apply/lambda; the boolean
# result is cast to int64 so the column holds plain 0/1 integers.
df_adult_data["sex-int"] = (df_adult_data["sex"] != "Male").astype("int64")

In [47]:
# Re-print the first record to confirm the new "sex-int" column is present.
print(df_adult_data.iloc[0])

age                          39
workclass             State-gov
fnlwgt                    77516
education             Bachelors
education-num                13
marital-status    Never-married
occupation         Adm-clerical
relationship      Not-in-family
race                      White
sex                        Male
capital-gain               2174
capital-loss                  0
hours-per-week               40
native-country    United-States
income                    <=50K
sex-int                       0
Name: 0, dtype: object


In [52]:
# Country: count how many rows fall under each native-country value.
df_adult_data["native-country"].value_counts()

United-States                 29170
Mexico                          643
?                               583
Philippines                     198
Germany                         137
Canada                          121
Puerto-Rico                     114
El-Salvador                     106
India                           100
Cuba                             95
England                          90
Jamaica                          81
South                            80
China                            75
Italy                            73
Dominican-Republic               70
Vietnam                          67
Guatemala                        64
Japan                            62
Poland                           60
Columbia                         59
Taiwan                           51
Haiti                            44
Iran                             43
Portugal                         37
Nicaragua                        34
Peru                             31
Greece                      

Since the majority of the data comes from "United-States", it might make sense to make a column where every row that contains "United-States" becomes a 0 and any other country becomes a 1. Note that rows whose country is unknown ("?") are also encoded as 1 under this scheme.

In [53]:
# Add a "country-int" column: 0 for "United-States", 1 for every other value
# (including the "?" placeholder). A vectorized comparison replaces the
# per-element apply/lambda; the boolean result is cast to int64.
df_adult_data["country-int"] = (df_adult_data["native-country"] != "United-States").astype("int64")

In [54]:
# Re-print the first record to confirm the new "country-int" column is present.
print(df_adult_data.iloc[0])

age                          39
workclass             State-gov
fnlwgt                    77516
education             Bachelors
education-num                13
marital-status    Never-married
occupation         Adm-clerical
relationship      Not-in-family
race                      White
sex                        Male
capital-gain               2174
capital-loss                  0
hours-per-week               40
native-country    United-States
income                    <=50K
sex-int                       0
country-int                   0
Name: 0, dtype: object


## Scikit-learn

### Income Labels

In [58]:
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

In [59]:
# The prediction target lives in the "income" column. Selecting it with a
# one-element list keeps the result as a single-column DataFrame.
labels = df_adult_data.loc[:, ['income']]

In [60]:
# Peek at the first few labels to check the target column was selected.
print(labels.head())

  income
0  <=50K
1  <=50K
2  <=50K
3  <=50K
4  <=50K


### Features

In [61]:
# Model inputs: the numeric columns plus the two 0/1 columns derived above
# ("sex-int" and "country-int").
features = df_adult_data.loc[
    :,
    [
        'age',
        'capital-gain',
        'capital-loss',
        'hours-per-week',
        'sex-int',
        'country-int',
    ],
]

### Split our data and labels into a training set and a test set

In [63]:
# Hold out part of the rows for evaluation. The split proportion is left at
# train_test_split's default; random_state pins the shuffle so the split is
# repeatable between runs.
(
    train_data,
    test_data,
    train_labels,
    test_labels,
) = train_test_split(features, labels, random_state=1)

## Create The Random Forest

In [65]:
# Create the model; a fixed random_state keeps the result reproducible.
forest = RandomForestClassifier(random_state = 1)
# Fit the model. `train_labels` is a one-column DataFrame, so flatten it to a
# 1-D array first: passing a column vector as y makes scikit-learn emit a
# DataConversionWarning. `.values.ravel()` also works if the labels are a Series.
forest.fit(train_data, train_labels.values.ravel())
# Report the model's accuracy on the held-out test set.
print("Score of the Random Forest:", forest.score(test_data, test_labels))

  after removing the cwd from sys.path.


Score of the Random Forest: 0.823731728288908
