# Project wAIge

[Project description]

## Dataset

The dataset used in this project is taken from the Family Income and Expenditure Survey (2017).

[More descriptions of what features from the dataset will be used]

## Decision Tree

### Libraries

This section imports the libraries used in this project.

In [1]:
import pandas as pd

### Data Preprocessing

Before going any further, this section trims the dataset down so that it contains only the information that will actually be used.

In [16]:
# Load the raw survey data and keep only the columns this project needs.
# The variable df holds the working (altered) dataset from here on.
# NOTE: the order of the selected columns matters, because the columns are
# renamed by position further down.
df = pd.read_csv("../data/project_data.csv")[
    [
        "Main Source of Income",
        "Household Head Sex",
        "Household Head Age",
        "Household Head Marital Status",
        "Household Head Highest Grade Completed",
        "Household Head Occupation",
        "Household Head Class of Worker",
        "Total Household Income",
    ]
]

# Keep only the rows where ...
#   - "Household Head Occupation" is present (not null), ...
#   - "Main Source of Income" is "Wage/Salaries", ...
#   - ... and "Household Head Marital Status" is anything other than "Unknown"
has_occupation = df["Household Head Occupation"].notnull()
is_wage_earner = df["Main Source of Income"] == "Wage/Salaries"
marital_status_known = df["Household Head Marital Status"] != "Unknown"
df = df[has_occupation & is_wage_earner & marital_status_known]

# Every remaining row has the same "Main Source of Income", so the column
# carries no information any more and is removed.
del df["Main Source of Income"]

# Convert the total yearly income to a whole-number monthly income.
monthly_income = df["Total Household Income"].div(12).round(0).astype(int)

# Bucket the monthly income into the target classes:
#       1 for monthly salary < 19000
#       2 for 19000 <= monthly salary <= 110000
#       3 for monthly salary > 110000
# Every comparison is made against the untouched monthly_income values.
# Comparing against the column while it is being overwritten would re-match
# the class labels themselves (2 and 3 are both < 19000) and collapse every
# row into class 1.
income_class = pd.Series(2, index=monthly_income.index)
income_class = income_class.mask(monthly_income < 19000, 1)
income_class = income_class.mask(monthly_income > 110000, 3)

# Assign the result back explicitly instead of calling mask(..., inplace=True)
# on a column selection, which is not guaranteed to update df itself.
df["Total Household Income"] = income_class

# Give the remaining columns short names.
# The names are assigned by position, so they must be listed in the same
# order as the columns currently appear in df.
df.columns = [
    "sex",
    "age",
    "marital_status",
    "educational_attainment",
    "occupation",
    "work_class",
    "target",
]

# Mask age
# -- write code here --

# Mask "marital_status" by reclassifying some values to "Single"
# print(df["marital_status"].unique()) # Uncomment this line to see the original values
# The result is assigned back to the column explicitly: calling
# mask(..., inplace=True) on a column selection is not guaranteed to modify df.
df["marital_status"] = df["marital_status"].mask(
    df["marital_status"].isin(["Widowed", "Divorced/Separated", "Annulled"]),
    "Single",
)

# Mask "educational_attainment" by classifying the values to the following:
#       1 Less than High School
#       2 for High School Graduate
#       3 for Some College
#       4 for Bachelor's Degree
#       5 for Graduate Degree
# print(df["educational_attainment"].unique()) # Uncomment this line to see the original values
# The labels below are matched exactly as they are spelled in the dataset
# (including its inconsistent capitalisation), so do not "correct" them.
education_levels = {
    1: [
        "No Grade Completed",
        "Preschool",
        "Grade 1",
        "Grade 2",
        "Grade 3",
        "Grade 4",
        "Grade 5",
        "Grade 6",
        "Elementary Graduate",
        "First Year High School",
        "Second Year High School",
        "Third Year High School",
    ],
    2: [
        "High School Graduate",
    ],
    3: [
        "First Year Post Secondary",
        "First Year College",
        "Second Year Post Secondary",
        "Second Year College",
        "Third Year College",
        "Fourth Year College",
        "Other Programs in Education at the Third Level, First Stage, of the Type that Leads to an Award not Equivalent to a First University or Baccalaureate Degree",
        "Other Programs of Education at the Third Level, First Stage, of the Type that Leads to a Baccalaureate or First University/Professional Degree (HIgher Education Level, First Stage, or Collegiate Education Level)",
    ],
    4: [
        "Teacher Training and Education Sciences Programs",
        "Transport Services Programs",
        "Business and Administration Programs",
        "Social and Behavioral Science Programs",
        "Agriculture, Forestry, and Fishery Programs",
        "Engineering and Engineering Trades Programs",
        "Engineering and Engineering trades Programs",
        "Basic Programs",
        "Health Programs",
        "Security Services Programs",
        "Humanities Programs",
        "Computing/Information Technology Programs",
        "Mathematics and Statistics Programs",
        "Personal Services Programs",
        "Journalism and Information Programs",
        "Architecture and Building Programs",
        "Life Sciences Programs",
        "Law Programs",
        "Manufacturing and Processing Programs",
        "Physical Sciences Programs",
        "Environmental Protection Programs",
        "Social Services Programs",
        "Veterinary Programs",
        "Arts Programs",
    ],
    5: [
        "Post Baccalaureate",
    ],
}

# Invert the table so that each original label points at its level.
education_lookup = {
    label: level
    for level, labels in education_levels.items()
    for label in labels
}

# Replace each label with its level; values that are not in the table are
# left unchanged. The result is assigned back explicitly because calling
# mask(..., inplace=True) on a column selection is not guaranteed to modify df.
df["educational_attainment"] = df["educational_attainment"].map(
    lambda label: education_lookup.get(label, label)
)

# Mask occupation
# There are 364 values to mask
# for i in df["occupation"].unique():
#     print(i)

# Mask "work_class" by reclassifying the values into either "Public" or "Private"
# print(df["work_class"].unique()) # Uncomment this line to see the original values
# The keys are matched exactly as they are spelled in the dataset, so do not
# "correct" them.
work_class_lookup = {
    "Worked for government/government corporation": "Public",
    "Worked for private establishment": "Private",
    "Employer in own family-operated farm or business": "Private",
    "Self-employed wihout any employee": "Private",
    "Worked without pay in own family-operated farm or business": "Private",
    "Worked for private household": "Private",
    "Worked with pay in own family-operated farm or business": "Private",
}

# Values that are not in the table are left unchanged. The result is assigned
# back explicitly because calling mask(..., inplace=True) on a column
# selection is not guaranteed to modify df.
df["work_class"] = df["work_class"].map(
    lambda label: work_class_lookup.get(label, label)
)

# print(df)

['Female' 'Male']


### Modeling the Decision Tree

In [3]:
# -- write code here --