In [1]:
import pandas as pd
import requests
import io
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import csv
import requests
import io


from numpy import array
from numpy import argmax
from io import TextIOWrapper
from zipfile import ZipFile

In [2]:
####################### STEP 1: DATA INTRODUCTION ######################

# The dataset was downloaded from the UCI Machine Learning Repository and the
# raw file was uploaded to this project's repository ('Census-Data').

url = "https://raw.githubusercontent.com/jasmultani5391/Census-Data/master/rawdatatest.txt"

# Use a timeout so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of parsing an error page as CSV.
response = requests.get(url, timeout=30)
response.raise_for_status()
download = response.content

# Read the downloaded content and turn it into a pandas dataframe.
# Only ONE separator argument is passed: `sep` and `delimiter` are aliases,
# and pandas raises "Specified a sep and a delimiter; you can only specify
# one" when both are given. The file is comma separated.
# NOTE(review): the raw fields look like they are separated by ", ", so the
# text values may keep a leading space. `skipinitialspace` is deliberately
# not enabled here so the values downstream cells see are unchanged - confirm.
df1 = pd.read_csv(io.StringIO(download.decode('utf-8')),
                 sep=",",
                 names=['age',
                        'workclass',
                        'fnlwgt',
                        'education',
                        'educationnum',
                        'maritalstatus',
                        'occupation',
                        'relationship',
                        'race',
                        'sex',
                        'capitalgain',
                        'capitalloss',
                        'hoursperweek',
                        'ogcountry',
                        'salaryrange'
                       ]
                )

# Print to get a sample of the dataset you'll be working with.
print(df1.head(5))

   age   workclass  fnlwgt      education  educationnum        maritalstatus  \
0   25     Private  226802           11th             7        Never-married   
1   38     Private   89814        HS-grad             9   Married-civ-spouse   
2   28   Local-gov  336951     Assoc-acdm            12   Married-civ-spouse   
3   44     Private  160323   Some-college            10   Married-civ-spouse   
4   18           ?  103497   Some-college            10        Never-married   

           occupation relationship    race      sex  capitalgain  capitalloss  \
0   Machine-op-inspct    Own-child   Black     Male            0            0   
1     Farming-fishing      Husband   White     Male            0            0   
2     Protective-serv      Husband   White     Male            0            0   
3   Machine-op-inspct      Husband   Black     Male         7688            0   
4                   ?    Own-child   White   Female            0            0   

   hoursperweek       ogcountry 

In [3]:
class QualToQuant():
    """Data clean-up helpers that turn qualitative columns into numbers.

    The class holds no state; every method takes the data it works on and
    returns a new object (a DataFrame or a plain list).
    """

    def onehotenc_vals(self, columns, data=None):
        """One-hot encode several columns and join the results in one frame.

        Parameters
        ----------
        columns : iterable of str
            Names of the qualitative columns to encode.
        data : pandas.DataFrame, optional
            Frame to read the columns from. When omitted, the notebook-level
            frame ``df1`` is used, which is what this method always read.

        Returns
        -------
        pandas.DataFrame
            The indicator columns of every requested column, side by side,
            each prefixed with its source column name. An empty DataFrame
            is returned when ``columns`` is empty.
        """
        if data is None:
            data = df1
        # Collect every encoding first and concatenate once. The previous
        # implementation restarted from the source frame on each iteration,
        # so only the last column's indicators survived.
        encoded = [pd.get_dummies(data[name], prefix=name) for name in columns]
        if not encoded:
            return pd.DataFrame(columns=[])
        return pd.concat(encoded, axis=1)

    def normalizer(self, data):
        """Min-max scale ``data`` to the range 0..1.

        Parameters
        ----------
        data : iterable
            Values convertible with ``float()``.

        Returns
        -------
        list of float
            ``(value - min) / (max - min)`` for each value, in input order.
            An empty input gives an empty list. When all values are equal
            there is no spread to scale by, so every value maps to ``0.0``
            instead of raising ``ZeroDivisionError``.
        """
        values = [float(item) for item in data]
        if not values:
            return []
        minimum = min(values)
        spread = max(values) - minimum
        if spread == 0:
            return [0.0 for _ in values]
        return [(value - minimum) / spread for value in values]

    def gender_labeler(self, data):
        """Convert the male/female text into a binary flag.

        Returns a list with ``1`` where the entry contains ``'Female'`` and
        ``0`` otherwise.
        """
        return [1 if 'Female' in entry else 0 for entry in data]

    def ogcountry_labeler(self, data):
        """Flag whether a person was born in the U.S.

        Used to compare U.S. natives against immigrants. Returns a list with
        ``1`` where the entry contains ``'United-States'`` and ``0``
        otherwise.
        """
        return [1 if 'United-States' in entry else 0 for entry in data]

    def salary_labeler(self, data):
        """Build the binary salary label used to train the classifiers.

        Returns a list with ``0`` where the entry contains ``'<=50K'`` (at
        most $50,000 per year) and ``1`` for every other entry.
        """
        return [0 if '<=50K' in entry else 1 for entry in data]

In [4]:
#################### STEP 2: DATA CLEANUP - EXPLORE ####################

# In order to understand how qualitative info can be switched to
# quantitative info, I first want to know the unique values in each
# column.

workclass_count = df1['workclass'].value_counts()
education_count = df1['education'].value_counts()
maritalstatus_count = df1['maritalstatus'].value_counts()
occupation_count = df1['occupation'].value_counts()
race_count = df1['race'].value_counts()

# Uncomment to print unique values.
#print(workclass_count,
#      education_count,
#      maritalstatus_count,
#      occupation_count,
#      race_count
#      )

# The value counts show a handful of individuals who are without pay or
# have never worked. Because the goal is to understand the socioeconomic
# factors of those who earn a salary, these rows are dropped.
# With regex=True and a non-string replacement, any cell that matches the
# pattern is replaced as a whole, so surrounding whitespace does not matter.
for unpaid_label in ('Without-pay', 'Never-worked'):
    df1['workclass'] = df1['workclass'].replace(unpaid_label,
                                                np.nan,
                                                regex=True
                                                )
# dropna() returns a new frame, so its result has to be assigned back;
# calling it as a bare statement dropped nothing. Only 'workclass' is
# checked so that exactly the rows marked above are removed.
df1 = df1.dropna(subset=['workclass']).reset_index(drop=True)

# 'workclass' and 'occupation' also use '?' for unknown values.
# For now those are replaced with NaN (the rows are kept).
# The pattern is a raw string so that '\?' is not treated as an (invalid)
# string escape; np.nan is used because the np.NaN alias was removed in
# NumPy 2.0.
df1['workclass'] = df1['workclass'].replace(r'[\?,]',
                                            np.nan,
                                            regex=True
                                            )
df1['occupation'] = df1['occupation'].replace(r'[\?,]',
                                              np.nan,
                                              regex=True
                                              )

# Print value count of workclass and occupation to make sure NaN
# decisions worked.
# print(df1['workclass'].value_counts(),
#      df1['occupation'].value_counts()
#     )

# Turn qualitative datapoints into matrix of binaries through "pd.get_dummies"
qualcol = ['workclass',
           'race',
           'occupation',
           'maritalstatus',
           'relationship'
           ]

# Encode every qualitative column, then join the indicator frames in a
# single concat instead of growing the frame inside the loop.
df2 = pd.concat([pd.get_dummies(df1[col], prefix=col) for col in qualcol],
                axis=1
               )
# Uncomment to print
#print(df2.columns)

In [5]:
######################### STEP 3: FEATURE ENGINEERING ########################
# With the qualitative columns one-hot encoded in df2, this cell adds the
# remaining numeric features and the label, and prepares the two frames used
# from here on (featDF and completeDF).

cleanup = QualToQuant()

# Min-max normalize the columns that hold a range of numbers.
# Maps the new df2 column name to the df1 column it is computed from;
# the dict order is the order in which the columns are added to df2.
norm_sources = {'norm_age': 'age',
                'norm_edunum': 'educationnum',
                'norm_capitalgain': 'capitalgain',
                'norm_capitalloss': 'capitalloss',
                'norm_hoursperweek': 'hoursperweek'
                }
for norm_name, raw_name in norm_sources.items():
    df2[norm_name] = cleanup.normalizer(df1[raw_name])

# Two-valued columns become 0 or 1.
# Between "male" and "female", "female" is 1.
df2['gender_label'] = cleanup.gender_labeler(df1['sex'])

# This is the label we are trying to predict: ">50k" is 1, "<=50k" is 0.
# It is stored both in its own frame (labelDF) and as a column of df2.
salary_flags = cleanup.salary_labeler(df1['salaryrange'])
labelDF = pd.DataFrame(columns=['salary_label'])
labelDF['salary_label'] = salary_flags
df2['salary_label'] = salary_flags

# Country of origin is simplified to American-born vs not American-born.
# "American-born" is 1.
df2['ogcountry_label'] = cleanup.ogcountry_labeler(df1['ogcountry'])

# featDF is just an easier-to-remember name for df2 (the same object, not a
# copy). It holds the one-hot encoded columns, the normalized columns and the
# binary labels added above.
featDF = df2

# completeDF keeps the original, human-readable columns of df1 so they can
# be inspected next to featDF. Only 'fnlwgt' is left out.
complete_cols = ['age', 'workclass', 'education', 'educationnum',
                 'maritalstatus', 'occupation', 'relationship', 'race',
                 'sex', 'capitalgain', 'capitalloss', 'hoursperweek',
                 'ogcountry', 'salaryrange'
                 ]
completeDF = df1[complete_cols]


# Uncomment below to compare the difference between completeDF and featDF.
#print(completeDF.head(5))
#print('\n\n')
#print(featDF.head(5))


# Uncomment and run the next two lines to save dataframes as csv files.
# These raw files are also found in the Github repository I have posted.
#completeDF.to_csv('completeDF.csv',index=False)
#featDF.to_csv('featDF.csv',index=False)