# DAT210x - Programming with Python for DS

## Module2 - Lab5

Import and alias Pandas:

In [74]:
import pandas as pd

As per usual, load up the specified dataset, setting appropriate header labels.

In [75]:
col_names = ['education', 'age', 'capital-gain', 'race', 'capital-loss', 'hours-per-week', 'sex', 'classification']
df = pd.read_csv('./Datasets/census.data', index_col=0) # first column is index already in data
df.columns = col_names;
print('shape=', df.shape)
df.head()

shape= (29535, 8)


Unnamed: 0_level_0,education,age,capital-gain,race,capital-loss,hours-per-week,sex,classification
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Bachelors,50,?,White,0,13,Male,<=50K
2,HS-grad,38,?,White,0,40,Male,<=50K
3,11th,53,?,Black,0,40,Male,<=50K
4,Bachelors,28,0,Black,0,40,Female,<=50K
5,Masters,37,0,White,0,40,Female,<=50K


Excellent.

Now, use basic pandas commands to look through the dataset. Get a feel for it before proceeding!

Do the data-types of each column reflect the values you see when you look through the data using a text editor / spread sheet program? If you see `object` where you expect to see `int32` or `float64`, that is a good indicator that there might be a string or missing value or erroneous value in the column.

In [76]:
df.loc[:,'capital-gain'] = pd.to_numeric(df.loc[:,'capital-gain'], errors='coerce')
df.dtypes

education          object
age                 int64
capital-gain      float64
race               object
capital-loss        int64
hours-per-week      int64
sex                object
classification     object
dtype: object

Try use `your_data_frame['your_column'].unique()` or equally, `your_data_frame.your_column.unique()` to see the unique values of each column and identify the rogue values.

If you find any value that should be properly encoded to NaNs, you can convert them either using the `na_values` parameter when loading the dataframe. Or alternatively, use one of the other methods discussed in the reading.

In [77]:
df['capital-gain'] = df['capital-gain'].fillna(0)
df.head(10)

Unnamed: 0_level_0,education,age,capital-gain,race,capital-loss,hours-per-week,sex,classification
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Bachelors,50,0.0,White,0,13,Male,<=50K
2,HS-grad,38,0.0,White,0,40,Male,<=50K
3,11th,53,0.0,Black,0,40,Male,<=50K
4,Bachelors,28,0.0,Black,0,40,Female,<=50K
5,Masters,37,0.0,White,0,40,Female,<=50K
6,9th,49,0.0,Black,0,16,Female,<=50K
7,HS-grad,52,0.0,White,0,45,Male,>50K
8,Masters,31,14084.0,White,0,50,Female,>50K
9,Bachelors,42,5178.0,White,0,40,Male,>50K
10,Some-college,37,0.0,Black,0,80,Male,>50K


Look through your data and identify any potential categorical features. Ensure you properly encode any ordinal and nominal types using the methods discussed in the chapter.

Be careful! Some features can be represented as either categorical or continuous (numerical). If you ever get confused, think to yourself what makes more sense generally---to represent such features with a continuous numeric type... or a series of categories?

In [78]:
from pandas.api.types import CategoricalDtype

ordered_class=['<=50K','>50K']
t = CategoricalDtype(categories=ordered_class, ordered=True)
df['classification'] = df.classification.astype(dtype=t).cat.codes
df.head(10)

Unnamed: 0_level_0,education,age,capital-gain,race,capital-loss,hours-per-week,sex,classification
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Bachelors,50,0.0,White,0,13,Male,0
2,HS-grad,38,0.0,White,0,40,Male,0
3,11th,53,0.0,Black,0,40,Male,0
4,Bachelors,28,0.0,Black,0,40,Female,0
5,Masters,37,0.0,White,0,40,Female,0
6,9th,49,0.0,Black,0,16,Female,0
7,HS-grad,52,0.0,White,0,45,Male,1
8,Masters,31,14084.0,White,0,50,Female,1
9,Bachelors,42,5178.0,White,0,40,Male,1
10,Some-college,37,0.0,Black,0,80,Male,1


In [79]:
# sex = Male/Femaale into two binary columns 
df = pd.get_dummies(df, columns=['sex'])
df.head()

Unnamed: 0_level_0,education,age,capital-gain,race,capital-loss,hours-per-week,classification,sex_Female,sex_Male
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,Bachelors,50,0.0,White,0,13,0,0,1
2,HS-grad,38,0.0,White,0,40,0,0,1
3,11th,53,0.0,Black,0,40,0,0,1
4,Bachelors,28,0.0,Black,0,40,0,1,0
5,Masters,37,0.0,White,0,40,0,1,0


In [80]:
# education is ordinal categorical 
edu_order = ['Preschool', '1st-4th', '5th-6th', '7th-8th','9th','10th','11th','12th','HS-grad','Some-college','Bachelors','Masters','Doctorate']
t = CategoricalDtype(categories=edu_order, ordered=True)
df['education'] = df.education.astype(dtype=t).cat.codes

# race is nominal categorical (no order)
df['race'] = df.race.astype('category').cat.codes

df.dtypes

education            int8
age                 int64
capital-gain      float64
race                 int8
capital-loss        int64
hours-per-week      int64
classification       int8
sex_Female          uint8
sex_Male            uint8
dtype: object

Lastly, print out your dataframe!

In [83]:
df

Unnamed: 0_level_0,education,age,capital-gain,race,capital-loss,hours-per-week,classification,sex_Female,sex_Male
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,10,50,0.0,4,0,13,0,0,1
2,8,38,0.0,4,0,40,0,0,1
3,6,53,0.0,2,0,40,0,0,1
4,10,28,0.0,2,0,40,0,1,0
5,11,37,0.0,4,0,40,0,1,0
6,4,49,0.0,2,0,16,0,1,0
7,8,52,0.0,4,0,45,1,0,1
8,11,31,14084.0,4,0,50,1,1,0
9,10,42,5178.0,4,0,40,1,0,1
10,9,37,0.0,2,0,80,1,0,1
