#### Import libraries

In [1]:
import pandas as pd
import numpy as np

#### Reading the data

In [3]:
#load the raw csv into a dataFrame; the path is relative to the notebook's working directory
#(presumably the held-out test split, going by the file name)
df = pd.read_csv('../data/test_data.csv')

In [4]:
#(number of rows, number of columns) of the loaded dataFrame
df.shape

(16281, 13)

In [5]:
#preview the first five rows to eyeball the column values
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Male,0,0,40,United-States
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,Male,0,0,50,United-States
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,Male,0,0,40,United-States
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Male,7688,0,40,United-States
4,18,?,103497,Some-college,10,Never-married,?,Own-child,Female,0,0,30,United-States


In [6]:
#list each column's dtype and non-null count to confirm every column was parsed with the expected type
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16281 entries, 0 to 16280
Data columns (total 13 columns):
age               16281 non-null int64
workclass         16281 non-null object
fnlwgt            16281 non-null int64
education         16281 non-null object
education-num     16281 non-null int64
marital-status    16281 non-null object
occupation        16281 non-null object
relationship      16281 non-null object
sex               16281 non-null object
capital-gain      16281 non-null int64
capital-loss      16281 non-null int64
hours-per-week    16281 non-null int64
native-country    16281 non-null object
dtypes: int64(6), object(7)
memory usage: 1.6+ MB


In [7]:
#summary statistics for age; the min and max rows reveal any out-of-range values
df['age'].describe()

count    16281.000000
mean        38.767459
std         13.849187
min         17.000000
25%         28.000000
50%         37.000000
75%         48.000000
max         90.000000
Name: age, dtype: float64

#### EDA

In [8]:
#count the missing (NaN) entries in every column
df.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
dtype: int64

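No nulls in any of the columns. Note, however, that some columns encode missing values as '?' (see the value counts below), which `isnull()` does not detect.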

In [9]:
#frequency of each distinct workclass value, most common first
df['workclass'].value_counts()

 Private             11210
 Self-emp-not-inc     1321
 Local-gov            1043
 ?                     963
 State-gov             683
 Self-emp-inc          579
 Federal-gov           472
 Without-pay             7
 Never-worked            3
Name: workclass, dtype: int64

In [10]:
#frequency of each distinct education value, most common first
df['education'].value_counts()

 HS-grad         5283
 Some-college    3587
 Bachelors       2670
 Masters          934
 Assoc-voc        679
 11th             637
 Assoc-acdm       534
 10th             456
 7th-8th          309
 Prof-school      258
 9th              242
 12th             224
 Doctorate        181
 5th-6th          176
 1st-4th           79
 Preschool         32
Name: education, dtype: int64

In [11]:
#frequency of each education-num code, rarest first (ascending = True);
#the index lists every code present, so an out-of-range code would show up here
df['education-num'].value_counts(ascending = True)

1       32
2       79
3      176
16     181
8      224
5      242
15     258
4      309
6      456
12     534
7      637
11     679
14     934
13    2670
10    3587
9     5283
Name: education-num, dtype: int64

In [12]:
#frequency of each distinct marital-status value, most common first
df['marital-status'].value_counts()

 Married-civ-spouse       7403
 Never-married            5434
 Divorced                 2190
 Widowed                   525
 Separated                 505
 Married-spouse-absent     210
 Married-AF-spouse          14
Name: marital-status, dtype: int64

In [13]:
#frequency of each distinct occupation value, most common first
df['occupation'].value_counts()

 Prof-specialty       2032
 Exec-managerial      2020
 Craft-repair         2013
 Sales                1854
 Adm-clerical         1841
 Other-service        1628
 Machine-op-inspct    1020
 ?                     966
 Transport-moving      758
 Handlers-cleaners     702
 Tech-support          518
 Farming-fishing       496
 Protective-serv       334
 Priv-house-serv        93
 Armed-Forces            6
Name: occupation, dtype: int64

In [14]:
#frequency of each distinct relationship value, most common first
df['relationship'].value_counts()

 Husband           6523
 Not-in-family     4278
 Own-child         2513
 Unmarried         1679
 Wife               763
 Other-relative     525
Name: relationship, dtype: int64

In [15]:
#frequency of each distinct sex value, most common first
df['sex'].value_counts()

 Male      10860
 Female     5421
Name: sex, dtype: int64

In [16]:
#the five most frequent capital-gain values
df['capital-gain'].value_counts().head()

0        14958
15024      166
7688       126
7298       118
99999       85
Name: capital-gain, dtype: int64

In [17]:
#the five most frequent capital-loss values
df['capital-loss'].value_counts().head()

0       15518
1902      102
1977       85
1887       74
2415       23
Name: capital-loss, dtype: int64

In [18]:
#frequency of each distinct hours-per-week value, most common first
df['hours-per-week'].value_counts()

40    7586
50    1427
45     893
60     702
35     640
      ... 
73       2
76       1
89       1
69       1
79       1
Name: hours-per-week, Length: 89, dtype: int64

In [19]:
#frequency of each distinct native-country value, most common first
df['native-country'].value_counts()

 United-States                 14662
 Mexico                          308
 ?                               274
 Philippines                      97
 Puerto-Rico                      70
 Germany                          69
 Canada                           61
 India                            51
 El-Salvador                      49
 China                            47
 Cuba                             43
 England                          37
 South                            35
 Dominican-Republic               33
 Italy                            32
 Haiti                            31
 Portugal                         30
 Japan                            30
 Poland                           27
 Columbia                         26
 Jamaica                          25
 Guatemala                        24
 Greece                           20
 Vietnam                          19
 Ecuador                          17
 Iran                             16
 Peru                             15
 

There is an extra leading space in front of each value in the text columns of our data (e.g. ' Private'). We will remove this space.

In [20]:
#removes leading and trailing whitespace from every text column of the dataFrame.
#is_string_dtype accepts both the classic object dtype and pandas' dedicated string
#dtypes, so the strip is not silently skipped when text columns are not stored as 'object'.
#non-text columns are passed through unchanged.
df = df.apply(lambda col: col.str.strip() if pd.api.types.is_string_dtype(col.dtype) else col)

In the columns occupation, workclass and native-country, missing values are recorded as '?'. We will replace it with 'Other' in each of these columns.

In [21]:
#replace the '?' placeholder with 'Other' in every column that uses it.
#the result is assigned back to df: calling replace(..., inplace=True) on a selected
#column acts on an intermediate object and leaves df unchanged under pandas Copy-on-Write.
placeholder_cols = ['occupation', 'workclass', 'native-country']
df[placeholder_cols] = df[placeholder_cols].replace('?', 'Other')

#sanity check: no exact '?' entries remain in the cleaned columns
assert not df[placeholder_cols].eq('?').any().any()

Saving the cleaned test file:

In [25]:
#write the cleaned dataFrame to csv; index=False leaves the row index out of the file
df.to_csv('../data/test_clean.csv', index=False)