In [1]:
# Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore') # this is to clear the warnings from this page, typically you would leave them on

In [2]:
# Plot settings
sns.set_context('notebook') # optimise figures for notebook display
sns.set_style('ticks') # set default plot style
colours = ['#1F77B4', '#FF7F0E', '#2CA02C', '#DB2728', '#9467BD', '#8C564B', '#E377C2','#7F7F7F', '#BCBD22', '#17BECF']
crayon = ['#4E79A7','#F28E2C','#E15759','#76B7B2','#59A14F', '#EDC949','#AF7AA1','#FF9DA7','#9C755F','#BAB0AB']
sns.set_palette(colours) # set custom color scheme
%matplotlib inline
plt.rcParams['figure.figsize'] = (9, 6)

## Data

The <TT>Employees.csv</TT> file records data about the employees of a company. The dataset is from [Business Analytics for Managers](http://www.springer.com/us/book/9781461404057) by Wolfgang Jank.  In this problem, the managers are concerned with the issue of equal pay in the company. If we compare male and female employees with the same level of experience, are they paid the same on average?

In recent years, workforce, [HR](https://www-01.ibm.com/software/analytics/solutions/operational-analytics/hr-analytics/), and [people analytics](http://knowledge.wharton.upenn.edu/article/open-sourcing-googles-hr-secrets/) have emerged as terms for the practice of data-driven human capital management. 

In [3]:
employee=pd.read_csv('/Users/leannedong/Desktop/ML-DataMiningCourses/Datasets/Employees.csv')
employee.head()

Unnamed: 0,Gender,Experience,Salary
0,Female,15,78200
1,Female,12,66400
2,Female,15,61200
3,Female,3,61000
4,Female,4,60000


In [4]:
employee.tail()

Unnamed: 0,Gender,Experience,Salary
203,Male,39,148000
204,Male,34,190000
205,Male,36,194000
206,Male,32,176000
207,Male,35,188000


## Training and Test Sets

We use the Scikit-Learn train_test_split method to split the data into training and test sets.

Below, we specify that the training set will contain 70% of the data. The random state parameter is an arbitrary number. By setting a specific value for the random state we ensure that we get the same training and test sets if we run the analysis again, even though the split is random.

In [6]:
# splitting the dataset into the source variables (independent variables) and the target variable (dependant variable)
X = employee.iloc[:,:-1]
y = employee['Salary']

In [7]:
# Split the data into 80% training and 20% testing
# The random state allows us to make  the same random split every time
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.2,random_state=327)

In [8]:
print('Training data size (%i,%i)' %X_train.shape)

Training data size (166,2)


In [9]:
print('Testing data size (%i,%i)' %X_test.shape)

Testing data size (42,2)


In [13]:
X_train.head()

Unnamed: 0,Gender,Experience
141,Male,3
200,Male,21
75,Female,9
61,Female,8
160,Male,4


In [14]:
X_test.head()

Unnamed: 0,Gender,Experience
130,Female,8
90,Female,6
112,Female,8
70,Female,12
43,Female,9


In [15]:
y_train.head()

141     58000
200    104000
75      70600
61      68000
160     74000
Name: Salary, dtype: int64

In [12]:
X_train['y_train'] = y_train

In [14]:
X_train.head()

Unnamed: 0,Gender,Experience,y_train
141,Male,3,58000
200,Male,21,104000
75,Female,9,70600
61,Female,8,68000
160,Male,4,74000


In [15]:
X_train.to_csv('/Users/leannedong/Desktop/ML-DataMiningCourses/Datasets/employee_train.csv')

In [16]:
X_test['y_test']=y_test

In [17]:
X_test.head()

Unnamed: 0,Gender,Experience,y_test
130,Female,8,89000
90,Female,6,70000
112,Female,8,78040
70,Female,12,61000
43,Female,9,59600


In [18]:
X_test.to_csv('/Users/leannedong/Desktop/ML-DataMiningCourses/Datasets/employee_test.csv')

In [22]:
X_test.rename(columns = {"y_test" : "word"})

Unnamed: 0,Gender,Experience,word
130,Female,8,89000
90,Female,6,70000
112,Female,8,78040
70,Female,12,61000
43,Female,9,59600
81,Female,7,75000
50,Female,6,72000
47,Female,18,68000
32,Female,15,87200
59,Female,19,63400


In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
X, y = np.arange(10).reshape((5, 2)), range(5)

In [3]:
X

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7],
       [8, 9]])

In [5]:
list(y)

[0, 1, 2, 3, 4]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [7]:
X_train

array([[4, 5],
       [0, 1],
       [6, 7]])

In [8]:
y_train

[2, 0, 3]

In [9]:
X_test

array([[2, 3],
       [8, 9]])

In [10]:
y_test

[1, 4]

In [54]:
import pandas as pd

In [60]:
boston = pd.read_csv('Boston.csv')

In [61]:
boston.head()

Unnamed: 0.1,Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,1,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,2,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,3,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,4,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,5,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [65]:
X=boston.drop('medv', axis=1).values
print(X)

[[1.0000e+00 6.3200e-03 1.8000e+01 ... 1.5300e+01 3.9690e+02 4.9800e+00]
 [2.0000e+00 2.7310e-02 0.0000e+00 ... 1.7800e+01 3.9690e+02 9.1400e+00]
 [3.0000e+00 2.7290e-02 0.0000e+00 ... 1.7800e+01 3.9283e+02 4.0300e+00]
 ...
 [5.0400e+02 6.0760e-02 0.0000e+00 ... 2.1000e+01 3.9690e+02 5.6400e+00]
 [5.0500e+02 1.0959e-01 0.0000e+00 ... 2.1000e+01 3.9345e+02 6.4800e+00]
 [5.0600e+02 4.7410e-02 0.0000e+00 ... 2.1000e+01 3.9690e+02 7.8800e+00]]


In [67]:
y = boston['medv'].values
print(y)

[24.  21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 15.  18.9 21.7 20.4
 18.2 19.9 23.1 17.5 20.2 18.2 13.6 19.6 15.2 14.5 15.6 13.9 16.6 14.8
 18.4 21.  12.7 14.5 13.2 13.1 13.5 18.9 20.  21.  24.7 30.8 34.9 26.6
 25.3 24.7 21.2 19.3 20.  16.6 14.4 19.4 19.7 20.5 25.  23.4 18.9 35.4
 24.7 31.6 23.3 19.6 18.7 16.  22.2 25.  33.  23.5 19.4 22.  17.4 20.9
 24.2 21.7 22.8 23.4 24.1 21.4 20.  20.8 21.2 20.3 28.  23.9 24.8 22.9
 23.9 26.6 22.5 22.2 23.6 28.7 22.6 22.  22.9 25.  20.6 28.4 21.4 38.7
 43.8 33.2 27.5 26.5 18.6 19.3 20.1 19.5 19.5 20.4 19.8 19.4 21.7 22.8
 18.8 18.7 18.5 18.3 21.2 19.2 20.4 19.3 22.  20.3 20.5 17.3 18.8 21.4
 15.7 16.2 18.  14.3 19.2 19.6 23.  18.4 15.6 18.1 17.4 17.1 13.3 17.8
 14.  14.4 13.4 15.6 11.8 13.8 15.6 14.6 17.8 15.4 21.5 19.6 15.3 19.4
 17.  15.6 13.1 41.3 24.3 23.3 27.  50.  50.  50.  22.7 25.  50.  23.8
 23.8 22.3 17.4 19.1 23.1 23.6 22.6 29.4 23.2 24.6 29.9 37.2 39.8 36.2
 37.9 32.5 26.4 29.6 50.  32.  29.8 34.9 37.  30.5 36.4 31.1 29.1 50.
 33.3 3

In [46]:
import numpy as np

In [48]:
np.unique(iris.target)

array([0, 1, 2])

In [43]:
print(iris.target)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [23]:
print(boston.head())

AttributeError: head