# SVR Machine Learning with Python

In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Importing dataset

In [91]:
df = pd.read_csv('../white_house_2017_salaries.csv')

In [92]:
# dimensions of the dataset
df.shape

(377, 5)

In [93]:
# showing the columns
# names are upper and some got space
df.columns

Index(['NAME', 'STATUS', 'SALARY', 'PAY BASIS', 'POSITION TITLE'], dtype='object')

Trying to remove space between the words 'PAY BASIS' and 'POSITION TITLE'

In [94]:
# Let us take a look how Python read this
df.columns[3] # index of 'PAY BASIS'

'PAY\xa0BASIS'

In [95]:
df.columns[4] # index of 'POSITION TITLE'

'POSITION\xa0TITLE'

In [96]:
# now we shoul write the same name with the rename function
df = df.rename(columns={'PAY\xa0BASIS': 'PAY_BASIS'})
df = df.rename(columns={'POSITION\xa0TITLE': 'POSITION_TITLE'})

In [97]:
df.columns

Index(['NAME', 'STATUS', 'SALARY', 'PAY_BASIS', 'POSITION_TITLE'], dtype='object')

In [98]:
# all the type are object, we need to normalize
df.dtypes

NAME              object
STATUS            object
SALARY            object
PAY_BASIS         object
POSITION_TITLE    object
dtype: object

In [99]:
# our dataset first 10 lines
df.head()

Unnamed: 0,NAME,STATUS,SALARY,PAY_BASIS,POSITION_TITLE
0,"Alexander, Monica K.",Employee,"$56,000.00",Per Annum,EXECUTIVE ASSISTANT
1,"Ambrosini, Michael J.",Employee,"$95,000.00",Per Annum,SPECIAL ASSISTANT TO THE PRESIDENT AND DIRECTO...
2,"Amin, Stacy C.",Employee,"$140,000.00",Per Annum,SPECIAL ASSISTANT TO THE PRESIDENT AND ASSOCIA...
3,"Andersen, Whitney N.",Employee,"$94,000.00",Per Annum,DEPUTY DIRECTOR OF OPERATIONS FOR THE WHITE HO...
4,"Anderson, Alexander J.",Employee,"$77,000.00",Per Annum,DIRECTOR OF DIGITAL ENGAGEMENT


## Data Cleaning

In [100]:
# Removing the '$' and comma ',' and converting into the column 'SALARY' 'float'
df['SALARY'] = df['SALARY'].str.replace('$', '')
df['SALARY'] = df['SALARY'].str.replace(',', '')
df['SALARY'] = df['SALARY'].astype('float')

  df['SALARY'] = df['SALARY'].str.replace('$', '')


In [101]:
# not necessary but useful, we set the name into title
df.columns = df.columns.str.title()

In [102]:
# now we see the changes
df.head()

Unnamed: 0,Name,Status,Salary,Pay_Basis,Position_Title
0,"Alexander, Monica K.",Employee,56000.0,Per Annum,EXECUTIVE ASSISTANT
1,"Ambrosini, Michael J.",Employee,95000.0,Per Annum,SPECIAL ASSISTANT TO THE PRESIDENT AND DIRECTO...
2,"Amin, Stacy C.",Employee,140000.0,Per Annum,SPECIAL ASSISTANT TO THE PRESIDENT AND ASSOCIA...
3,"Andersen, Whitney N.",Employee,94000.0,Per Annum,DEPUTY DIRECTOR OF OPERATIONS FOR THE WHITE HO...
4,"Anderson, Alexander J.",Employee,77000.0,Per Annum,DIRECTOR OF DIGITAL ENGAGEMENT


## Data Training

In [103]:
df['Status'].value_counts()

Employee           358
Detailee            18
Employee             1
Name: Status, dtype: int64

In [121]:
# trying to convert the categorical value into integer
df['Status'] = df['Status'].replace('Employee', 1)
df['Status'] = df['Status'].replace('Detailee', 2)
df['Status'] = df['Status'].replace('Employee       ', 1)# there is only one like this

In [122]:
df['Status']

0      1
1      1
2      1
3      1
4      1
      ..
372    1
373    1
374    1
375    1
376    1
Name: Status, Length: 377, dtype: int64

In [123]:
df['Status'].value_counts()

1    359
2     18
Name: Status, dtype: int64

In [130]:
# this function is to find which value was categorical
for v in df['Status']:
    if v !=  1:
        index = df['Status'].index.get_loc(v)
        print(index)

2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2


In [127]:
df['Pay_Basis'].astype(str)

0      Per Annum
1      Per Annum
2      Per Annum
3      Per Annum
4      Per Annum
         ...    
372    Per Annum
373    Per Annum
374    Per Annum
375    Per Annum
376    Per Annum
Name: Pay_Basis, Length: 377, dtype: object

1
