![logo_ironhack_blue 7](https://user-images.githubusercontent.com/23629340/40541063-a07a0a8a-601a-11e8-91b5-2f13e4e6b441.png)

# Lab | Revisiting Machine Learning Case Study

- In this lab, you will use `learningSet.csv` file which you already have cloned in today's activities. 

### Instructions

Complete the following steps on the categorical columns in the dataset:

- Check for null values in all the columns
- Exclude the following variables by looking at the definitions. Create a new empty list called `drop_list`. We will append this list and then drop all the columns in this list later:
    - `OSOURCE` - symbol definitions not provided, too many categories
    - `ZIP CODE` - we are including state already
- Identify columns that over 85% missing values
- Remove those columns from the dataframe
- Reduce the number of categories in the column `GENDER`. The column should only have either "M" for males, "F" for females, and "other" for all the rest
    - Note that there are a few null values in the column. We will first replace those null values using the code below:

    ```python
    print(categorical['GENDER'].value_counts())
    categorical['GENDER'] = categorical['GENDER'].fillna('F')
    ```





In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('categorical.csv',index_col=0)

In [3]:
data

Unnamed: 0,OSOURCE,STATE,ZIP,MAILCODE,NOEXCH,MDMAUD,DOMAIN,CLUSTER,HOMEOWNR,GENDER,...,RFA_12,RFA_14,RFA_16,RFA_18,RFA_2R,RFA_2A,MDMAUD_R,MDMAUD_F,MDMAUD_A,GEOCODE2
0,GRI,IL,61081,A,0,XXXX,T2,36.0,,F,...,S4E,S4E,S4E,S4E,L,E,X,X,X,C
1,BOA,CA,91326,A,0,XXXX,S1,14.0,H,M,...,A1E,,L1E,,L,G,X,X,X,A
2,AMH,NC,27017,A,0,XXXX,R2,43.0,U,M,...,S4F,S4F,S4F,S4D,L,E,X,X,X,C
3,BRY,CA,95953,A,0,XXXX,R2,44.0,U,F,...,S4E,S4E,S4E,S2D,L,E,X,X,X,C
4,,FL,33176,A,0,XXXX,S2,16.0,H,F,...,A1E,L3D,L3D,A2D,L,F,X,X,X,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,ASE,OTHER,99504,A,0,XXXX,C2,27.0,,M,...,,,,,L,G,X,X,X,C
95408,DCD,TX,77379,A,0,XXXX,C1,24.0,H,M,...,,,,,L,F,X,X,X,A
95409,MBC,MI,48910,A,0,XXXX,C3,30.0,,M,...,N3E,N3E,F1D,F1D,L,E,X,X,X,B
95410,PRV,CA,91320,A,0,XXXX,C1,24.0,H,F,...,S4F,S4F,S3F,S2F,L,F,X,X,X,A


In [4]:
data.columns

Index(['OSOURCE', 'STATE', 'ZIP', 'MAILCODE', 'NOEXCH', 'MDMAUD', 'DOMAIN',
       'CLUSTER', 'HOMEOWNR', 'GENDER', 'DATASRCE', 'SOLIH', 'VETERANS',
       'RFA_2', 'RFA_3', 'RFA_4', 'RFA_6', 'RFA_7', 'RFA_8', 'RFA_9', 'RFA_11',
       'RFA_12', 'RFA_14', 'RFA_16', 'RFA_18', 'RFA_2R', 'RFA_2A', 'MDMAUD_R',
       'MDMAUD_F', 'MDMAUD_A', 'GEOCODE2'],
      dtype='object')

In [5]:
variables = ['OSOURCE', 'ZIP']
drop_list = []
for variable in variables:
    drop_list.append(variable)
data.drop(columns=drop_list, inplace=True)
print(data.columns)

Index(['STATE', 'MAILCODE', 'NOEXCH', 'MDMAUD', 'DOMAIN', 'CLUSTER',
       'HOMEOWNR', 'GENDER', 'DATASRCE', 'SOLIH', 'VETERANS', 'RFA_2', 'RFA_3',
       'RFA_4', 'RFA_6', 'RFA_7', 'RFA_8', 'RFA_9', 'RFA_11', 'RFA_12',
       'RFA_14', 'RFA_16', 'RFA_18', 'RFA_2R', 'RFA_2A', 'MDMAUD_R',
       'MDMAUD_F', 'MDMAUD_A', 'GEOCODE2'],
      dtype='object')


In [6]:
pd.set_option('display.max_rows',None)
nulls_df = pd.DataFrame(data.isna().sum()/len(data))
nulls_df.columns = ['nulls_percentage']
cols_to_drop = nulls_df[nulls_df['nulls_percentage']>.85].sort_values(by=['nulls_percentage'],ascending=False).index

In [7]:
cols_to_drop

Index(['SOLIH', 'VETERANS'], dtype='object')

In [8]:
cols = ['SOLIH', 'VETERANS']
cols_to_drop = list(cols_to_drop)
for item in cols:
    cols_to_drop.remove(item)

data = data.drop(columns= cols_to_drop)
print(data.shape)
data.head()

(95412, 29)


Unnamed: 0,STATE,MAILCODE,NOEXCH,MDMAUD,DOMAIN,CLUSTER,HOMEOWNR,GENDER,DATASRCE,SOLIH,...,RFA_12,RFA_14,RFA_16,RFA_18,RFA_2R,RFA_2A,MDMAUD_R,MDMAUD_F,MDMAUD_A,GEOCODE2
0,IL,A,0,XXXX,T2,36.0,,F,,,...,S4E,S4E,S4E,S4E,L,E,X,X,X,C
1,CA,A,0,XXXX,S1,14.0,H,M,3.0,,...,A1E,,L1E,,L,G,X,X,X,A
2,NC,A,0,XXXX,R2,43.0,U,M,3.0,,...,S4F,S4F,S4F,S4D,L,E,X,X,X,C
3,CA,A,0,XXXX,R2,44.0,U,F,3.0,,...,S4E,S4E,S4E,S2D,L,E,X,X,X,C
4,FL,A,0,XXXX,S2,16.0,H,F,3.0,12.0,...,A1E,L3D,L3D,A2D,L,F,X,X,X,A


In [9]:
print(data['GENDER'].value_counts())
data['GENDER'] = data['GENDER'].fillna('F')

F    51277
M    39094
U     1715
J      365
C        2
A        2
Name: GENDER, dtype: int64


In [10]:
print(data['GENDER'].value_counts())

F    54234
M    39094
U     1715
J      365
C        2
A        2
Name: GENDER, dtype: int64


In [11]:
data['GENDER'].replace(['F', 'M' ,'U', 'J', 'C', 'A'], ['F', 'M', 'other', 'other', 'other', 'other'], inplace=True)

In [12]:
print(data['GENDER'].value_counts())

F        54234
M        39094
other     2084
Name: GENDER, dtype: int64
