# **Dataiku Data Scientist Technical Assessment**
### Author : Jules Boistard
### Submission date : January 18th, 2022
___

#### **#1 : Exploratory data analysis**

In [1]:
# Read the train and test files
import re

import pandas as pd

# Layout of the raw csv files: 42 columns, column #24 being the instance weight.
N_CSV_COLS = 42
WEIGHT_COL = 24

# Column names are the parenthesised labels found in the [81:121] slice of the metadata lines.
# The context manager closes the file handle once the names are extracted, and the raw string
# keeps the regex free of invalid escape sequences.
with open("data/census_income_metadata.txt") as metadata:
    col_names = [re.findall(r"\(([^\)]+)\)", line)[0] for line in metadata.readlines()[81:121]]
col_names.append("total person income")


def read_census_csv(path, names):
    """Load one census csv file with the instance weight moved to the last column.

    Note : column #24 in the csv corresponds to the instance weight.
    While it will be kept for exploratory data analysis, we will later on remove it to train the classifiers.
    It is therefore set aside while the other columns are named and then added back as last column.

    Parameters
    ----------
    path : str
        Location of the csv file (no header row, fields separated by ", ", "?" marks a missing value).
    names : list of str
        Names given, in order, to every column except the instance weight.

    Returns
    -------
    pandas.DataFrame
        The columns listed in `names`, followed by an "instance weight" column.
    """
    # The file is parsed only once. A separator longer than one character is interpreted as a
    # regex, which only the python engine supports: requesting it explicitly avoids the
    # ParserWarning raised when pandas has to fall back from the default C engine.
    frame = pd.read_csv(
        path,
        header=None,
        sep=", ",
        na_values="?",
        usecols=list(range(N_CSV_COLS)),
        engine="python",
    )
    # With header=None the columns are labelled by position, so the weight is column 24.
    weights = frame.pop(WEIGHT_COL)
    frame.columns = names
    frame["instance weight"] = weights
    return frame


train = read_census_csv("data/census_income_learn.csv", col_names)
test = read_census_csv("data/census_income_test.csv", col_names)

  return func(*args, **kwargs)


In [34]:
# Identify categorical columns in the dataset: every column that is not listed as numerical
num_cols = [
    "age",
    "wage per hour",
    "capital gains",
    "capital losses",
    "dividends from stocks",
    "num persons worked for employer",
    "weeks worked in year",
    "instance weight",
]
cat_cols = [name for name in train.columns if name not in num_cols]

# Cast those columns to the pandas "category" dtype in both splits
for split in (train, test):
    split[cat_cols] = split[cat_cols].astype("category")

# Stack train and test on top of each other (row labels of each split are kept as they are)
data = pd.concat([train, test])

In [39]:
# Check for missing values: fraction of NaN entries per column, ten highest rates displayed
nan_rate = data.isna().mean()
nan_rate.sort_values(ascending=False).head(10)

migration code-change in msa      0.499998
migration code-change in reg      0.499998
migration code-move within reg    0.499998
migration prev res in sunbelt     0.499998
country of birth father           0.033887
country of birth mother           0.030710
country of birth self             0.017231
hispanic origin                   0.004274
state of previous residence       0.003468
citizenship                       0.000000
dtype: float64

In [36]:
# Look at descriptive stats for numerical features
# (called with default arguments, the summary displayed covers the columns of num_cols only;
# the columns cast to "category" are left out)
data.describe()

Unnamed: 0,age,wage per hour,capital gains,capital losses,dividends from stocks,num persons worked for employer,weeks worked in year,instance weight
count,299285.0,299285.0,299285.0,299285.0,299285.0,299285.0,299285.0,299285.0
mean,34.538998,55.105027,431.742176,36.84901,195.851259,1.956172,23.178375,1740.101125
std,22.318468,273.340729,4670.768536,269.789771,1937.847082,2.364857,24.404016,994.144299
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.87
25%,15.0,0.0,0.0,0.0,0.0,0.0,0.0,1060.49
50%,33.0,0.0,0.0,0.0,0.0,1.0,8.0,1617.11
75%,50.0,0.0,0.0,0.0,0.0,4.0,52.0,2188.11
max,90.0,9999.0,99999.0,4608.0,99999.0,6.0,52.0,18656.3


In [37]:
# Inspect the rows whose "dividends from stocks" equals 99999, the maximum reported by data.describe()
# NOTE(review): 99999 looks like a cap or sentinel value rather than a real amount - confirm
data.loc[data["dividends from stocks"] == 99999]

Unnamed: 0,age,class of worker,detailed industry recode,detailed occupation recode,education,wage per hour,enroll in edu inst last wk,marital stat,major industry code,major occupation code,...,country of birth mother,country of birth self,citizenship,own business or self employed,fill inc questionnaire for veteran's admin,veterans benefits,weeks worked in year,year,total person income,instance weight
991,78,Not in universe,0,0,Associates degree-academic program,0,Not in universe,Married-civilian spouse present,Not in universe or children,Not in universe,...,United-States,United-States,Native- Born in the United States,0,Not in universe,2,0,94,50000+.,637.86
8921,69,Not in universe,0,0,Bachelors degree(BA AB BS),0,Not in universe,Married-civilian spouse present,Not in universe or children,Not in universe,...,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,94,50000+.,3594.65
13733,38,Self-employed-incorporated,37,5,Bachelors degree(BA AB BS),0,Not in universe,Married-civilian spouse present,Business and repair services,Professional specialty,...,United-States,Germany,Native- Born abroad of American Parent(s),2,Not in universe,2,52,95,50000+.,2618.45
14914,70,Self-employed-incorporated,39,3,Bachelors degree(BA AB BS),0,Not in universe,Married-civilian spouse present,Personal services except private HH,Executive admin and managerial,...,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,95,50000+.,1326.14
22754,58,Private,32,3,Masters degree(MA MS MEng MEd MSW MBA),0,Not in universe,Married-civilian spouse present,Wholesale trade,Executive admin and managerial,...,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,95,50000+.,2440.2
33995,58,Not in universe,0,0,High school graduate,0,Not in universe,Married-civilian spouse present,Not in universe or children,Not in universe,...,,,Foreign born- Not a citizen of U S,0,Not in universe,2,44,95,50000+.,2276.34
35946,61,Not in universe,0,0,Masters degree(MA MS MEng MEd MSW MBA),0,Not in universe,Married-civilian spouse present,Not in universe or children,Not in universe,...,United-States,United-States,Native- Born in the United States,0,Not in universe,2,4,94,50000+.,1065.3
48458,81,Not in universe,0,0,Bachelors degree(BA AB BS),0,Not in universe,Married-civilian spouse present,Not in universe or children,Not in universe,...,United-States,United-States,Native- Born in the United States,0,Not in universe,2,0,94,50000+.,637.86
66668,66,Self-employed-not incorporated,45,11,Prof school degree (MD DDS DVM LLB JD),0,Not in universe,Married-civilian spouse present,Other professional services,Professional specialty,...,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,94,50000+.,1934.72
73491,74,Not in universe,0,0,Some college but no degree,0,Not in universe,Widowed,Not in universe or children,Not in universe,...,United-States,United-States,Native- Born in the United States,0,Not in universe,2,0,94,50000+.,1363.41


In [7]:
# Count the distinct values of each column in the training set
# (nunique() leaves missing values out of the count by default)
train.nunique()

age                                              91
class of worker                                   9
detailed industry recode                         52
detailed occupation recode                       47
education                                        17
wage per hour                                  1240
enroll in edu inst last wk                        3
marital stat                                      7
major industry code                              24
major occupation code                            15
race                                              5
hispanic origin                                   9
sex                                               2
member of a labor union                           3
reason for unemployment                           6
full or part time employment stat                 8
capital gains                                   132
capital losses                                  113
dividends from stocks                          1478
tax filer st

___
#### **#2 : Data preparation**

In [None]:
# Split each dataset into features and target variable.
# The very last column is the instance weight and is therefore ignored,
# which makes the second to last column the target variable.
X_train, y_train = train.iloc[:, :-2], train.iloc[:, -2]
X_test, y_test = test.iloc[:, :-2], test.iloc[:, -2]

___
#### **#3 : Data Modeling**

___
#### **#4 : Model Assessment**