In [1]:
import pandas as pd
from io import StringIO

### Creating data

In [2]:
csv_data = '''
A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
9.0,10.0,11.0,
13.0,14.0,15.0,16.0
'''

In [3]:
df = pd.read_csv(StringIO(csv_data))

In [4]:
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,9.0,10.0,11.0,
3,13.0,14.0,15.0,16.0


In [5]:
type(df)

pandas.core.frame.DataFrame

In [6]:
'''isnull method returns a dataframe with boolean values 
that indicate whether a cell contians a nemuric value or if data is missing.'''
df.isnull()

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,False,True,False
2,False,False,False,True
3,False,False,False,False


In [7]:
# The number of missing values per column
df.isnull().sum(axis=0)

A    0
B    0
C    1
D    1
dtype: int64

In [8]:
# The number of missing values per sample
df.isnull().sum(axis=1)

0    0
1    1
2    1
3    0
dtype: int64

### Eliminating samples or features with missing values

In [9]:
# remove missing samples
df.dropna(axis=0)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
3,13.0,14.0,15.0,16.0


In [10]:
# remove missing columns
df.dropna(axis=1)

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,9.0,10.0
3,13.0,14.0


In [11]:
# remove rows where all columns are NaN
df.dropna(how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,9.0,10.0,11.0,
3,13.0,14.0,15.0,16.0


In [12]:
# remove rows that have less than 4 real avlues
df.dropna(thresh=4)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
3,13.0,14.0,15.0,16.0


In [13]:
# only remove rows where NaN appear in specific columns
df.dropna(subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,9.0,10.0,11.0,
3,13.0,14.0,15.0,16.0


### Imputing missing values

In [14]:
from sklearn.preprocessing import Imputer

In [15]:
# axis = 0 calculates the column mean
# axis = 1 calculates the row mean
imr = Imputer(missing_values='NaN', strategy='mean', axis=0)

In [16]:
imr = imr.fit(df.values)

In [17]:
imputed_data = imr.transform(df.values)

In [18]:
imputed_data

array([[ 1.        ,  2.        ,  3.        ,  4.        ],
       [ 5.        ,  6.        ,  9.66666667,  8.        ],
       [ 9.        , 10.        , 11.        ,  9.33333333],
       [13.        , 14.        , 15.        , 16.        ]])

### Mappling ordinal features

In [19]:
df = pd.DataFrame([
    ['green', 'M', 10.1, 'class1'],
    ['red', 'L', 13.5, 'class2'],
    ['blue', 'XL', 15.3, 'class1']
])

In [20]:
df.columns =['color', 'size', 'price', 'label']

In [21]:
df

Unnamed: 0,color,size,price,label
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


In [22]:
size_mapping = {
    'XL': 3,
    'L': 2,
    'M': 1
}

In [23]:
df['size'] = df['size'].map(size_mapping)

In [24]:
df

Unnamed: 0,color,size,price,label
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [25]:
inv_size_mapping = {v: k for k, v in size_mapping.items()}

In [26]:
df['size'].map(inv_size_mapping)

0     M
1     L
2    XL
Name: size, dtype: object

### Encoding class labels

In [27]:
import numpy as np

In [28]:
unique_labels = np.unique(df['label'])

In [29]:
class_mapping = {label:idx for idx, label in enumerate(unique_labels)}

In [30]:
class_mapping

{'class1': 0, 'class2': 1}

In [31]:
df['label'] = df['label'].map(class_mapping)

In [32]:
df

Unnamed: 0,color,size,price,label
0,green,1,10.1,0
1,red,2,13.5,1
2,blue,3,15.3,0


In [33]:
inv_class_mapping = {v: k for k, v in class_mapping.items()}

In [34]:
df['label'].map(inv_class_mapping)

0    class1
1    class2
2    class1
Name: label, dtype: object

In [35]:
# sklearn LabelEncoder
from sklearn.preprocessing import LabelEncoder

In [36]:
class_le = LabelEncoder()

In [37]:
# fit_transform method is a shortcut for calling fit and transform separately
y = class_le.fit_transform(df['label'].values)

In [38]:
y

array([0, 1, 0])

In [39]:
class_le.inverse_transform(y)

  if diff:


array([0, 1, 0])

### Performing one-hot encoding on nominal features

In [40]:
from sklearn.preprocessing import OneHotEncoder

In [41]:
df

Unnamed: 0,color,size,price,label
0,green,1,10.1,0
1,red,2,13.5,1
2,blue,3,15.3,0


In [42]:
X = df[['color', 'size', 'price']].values

In [43]:
color_le= LabelEncoder()

In [44]:
X[:, 0] = color_le.fit_transform(X[:, 0])

In [45]:
X

array([[1, 1, 10.1],
       [2, 2, 13.5],
       [0, 3, 15.3]], dtype=object)

In [46]:
from sklearn.preprocessing import OneHotEncoder

In [47]:
ohe = OneHotEncoder(categorical_features=[0])

In [48]:
# convert sparse matrix representation into a regular(dense) NumPy array
ohe.fit_transform(X).toarray()

array([[ 0. ,  1. ,  0. ,  1. , 10.1],
       [ 0. ,  0. ,  1. ,  2. , 13.5],
       [ 1. ,  0. ,  0. ,  3. , 15.3]])

In [49]:
# get_dummies method creates dummy features via one-hot encoding, it only converts string columns.
pd.get_dummies(df[['color', 'size', 'price']])

Unnamed: 0,size,price,color_blue,color_green,color_red
0,1,10.1,0,1,0
1,2,13.5,0,0,1
2,3,15.3,1,0,0


In [50]:
# to address multicollinearity, use drop_first
pd.get_dummies(df[['color', 'size', 'price']], drop_first=True)

Unnamed: 0,size,price,color_green,color_red
0,1,10.1,1,0
1,2,13.5,0,1
2,3,15.3,0,0


### Partitioning a dataset into separate training and test sets

In [51]:
df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data' , header=None)

In [52]:
df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins', 'Color intensity', 'Hue', 'OD280/OD315 of diluted wines', 'Proline']

In [53]:
np.unique(df_wine['Class label'])

array([1, 2, 3])

In [54]:
df_wine.head()

Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [55]:
from sklearn.model_selection import train_test_split

In [56]:
X = df_wine.iloc[:, 1:].values

In [57]:
y = df_wine.iloc[:, 0].values

In [58]:
# stratify ensures that both training and test datasets have the same class proportion as the original dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)

In [59]:
X_train.shape

(124, 13)

In [60]:
X_test.shape

(54, 13)

### Bringing features onto the samle scale

In [61]:
ex = np.array([0, 1, 2, 3, 4, 5])

In [62]:
(ex-ex.min())/(ex.max()-ex.min())

array([0. , 0.2, 0.4, 0.6, 0.8, 1. ])

In [63]:
(ex-ex.mean())/ex.std()

array([-1.46385011, -0.87831007, -0.29277002,  0.29277002,  0.87831007,
        1.46385011])

In [64]:
from sklearn.preprocessing import MinMaxScaler

In [65]:
data = [[-1, 2], 
        [-0.5, 6], 
        [0, 10], 
        [1, 18]]

In [66]:
mms = MinMaxScaler()

In [67]:
mms.fit_transform(data)

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

In [68]:
from sklearn.preprocessing import StandardScaler

In [69]:
stds = StandardScaler()

In [70]:
stds.fit_transform(data)

array([[-1.18321596, -1.18321596],
       [-0.50709255, -0.50709255],
       [ 0.16903085,  0.16903085],
       [ 1.52127766,  1.52127766]])

### A compelte example

In [71]:
training = pd.read_csv('training.csv')

In [72]:
test = pd.read_csv('test.csv')

In [73]:
training.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Target
0,LP001032,Male,No,0,Graduate,No,4950,0.0,125,360,1,Urban,Y
1,LP001824,Male,Yes,1,Graduate,No,2882,1843.0,123,480,1,Semiurban,Y
2,LP002928,Male,Yes,0,Graduate,No,3000,3416.0,56,180,1,Semiurban,Y
3,LP001814,Male,Yes,2,Graduate,No,9703,0.0,112,360,1,Urban,Y
4,LP002244,Male,Yes,0,Graduate,No,2333,2417.0,136,360,1,Urban,Y


In [74]:
#kNN without preprocessing
from sklearn.neighbors import KNeighborsClassifier

In [75]:
knn = KNeighborsClassifier()

In [76]:
numeric_features = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']

In [77]:
knn.fit(training[numeric_features], training['Target'])

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [78]:
from sklearn.metrics import accuracy_score

In [79]:
accuracy_score(test['Target'], knn.predict(test[numeric_features]))

0.6145833333333334

### Mapping ordinal features

In [80]:
dependents_mapping = {
    '3+': 4,
    '2': 3,
    '1': 2,
    '0':1
}

In [81]:
training['Dependents'] = training['Dependents'].map(dependents_mapping)

In [82]:
test['Dependents'] = test['Dependents'].map(dependents_mapping)

In [83]:
training.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Target
0,LP001032,Male,No,1,Graduate,No,4950,0.0,125,360,1,Urban,Y
1,LP001824,Male,Yes,2,Graduate,No,2882,1843.0,123,480,1,Semiurban,Y
2,LP002928,Male,Yes,1,Graduate,No,3000,3416.0,56,180,1,Semiurban,Y
3,LP001814,Male,Yes,3,Graduate,No,9703,0.0,112,360,1,Urban,Y
4,LP002244,Male,Yes,1,Graduate,No,2333,2417.0,136,360,1,Urban,Y


### Encoding class labels

In [84]:
class_le = LabelEncoder()

In [85]:
class_le.fit(training['Target'].values)

LabelEncoder()

In [86]:
training['Target'] = class_le.transform(training['Target'].values)

In [87]:
test['Target'] = class_le.transform(test['Target'].values)

### Performing one-hot encoding on nominal features

In [88]:
nominal_features = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']

In [89]:
training.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Target
0,LP001032,Male,No,1,Graduate,No,4950,0.0,125,360,1,Urban,1
1,LP001824,Male,Yes,2,Graduate,No,2882,1843.0,123,480,1,Semiurban,1
2,LP002928,Male,Yes,1,Graduate,No,3000,3416.0,56,180,1,Semiurban,1
3,LP001814,Male,Yes,3,Graduate,No,9703,0.0,112,360,1,Urban,1
4,LP002244,Male,Yes,1,Graduate,No,2333,2417.0,136,360,1,Urban,1


In [90]:
training_dummies = pd.get_dummies(training[nominal_features], drop_first=True)

In [91]:
training_dummies.head()

Unnamed: 0,Gender_Male,Married_Yes,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban
0,1,0,0,0,0,1
1,1,1,0,0,1,0
2,1,1,0,0,1,0
3,1,1,0,0,0,1
4,1,1,0,0,0,1


In [92]:
training = pd.concat([training, training_dummies],axis=1)

In [93]:
training.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Target,Gender_Male,Married_Yes,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban
0,LP001032,Male,No,1,Graduate,No,4950,0.0,125,360,1,Urban,1,1,0,0,0,0,1
1,LP001824,Male,Yes,2,Graduate,No,2882,1843.0,123,480,1,Semiurban,1,1,1,0,0,1,0
2,LP002928,Male,Yes,1,Graduate,No,3000,3416.0,56,180,1,Semiurban,1,1,1,0,0,1,0
3,LP001814,Male,Yes,3,Graduate,No,9703,0.0,112,360,1,Urban,1,1,1,0,0,0,1
4,LP002244,Male,Yes,1,Graduate,No,2333,2417.0,136,360,1,Urban,1,1,1,0,0,0,1


In [94]:
training = training.drop(columns=nominal_features)

In [95]:
training.head()

Unnamed: 0,Loan_ID,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Target,Gender_Male,Married_Yes,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban
0,LP001032,1,4950,0.0,125,360,1,1,1,0,0,0,0,1
1,LP001824,2,2882,1843.0,123,480,1,1,1,1,0,0,1,0
2,LP002928,1,3000,3416.0,56,180,1,1,1,1,0,0,1,0
3,LP001814,3,9703,0.0,112,360,1,1,1,1,0,0,0,1
4,LP002244,1,2333,2417.0,136,360,1,1,1,1,0,0,0,1


In [96]:
test_dummies = pd.get_dummies(test[nominal_features], drop_first=True)
test = pd.concat([test, test_dummies],axis=1)
test = test.drop(columns=nominal_features)

In [97]:
test.head()

Unnamed: 0,Loan_ID,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Target,Gender_Male,Married_Yes,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban
0,LP002684,1,3400,0,95,360,1,0,0,0,1,0,0,0
1,LP001907,1,14583,0,436,360,1,1,1,1,0,0,1,0
2,LP001205,1,2500,3796,120,360,1,1,1,1,0,0,0,1
3,LP001275,2,3988,0,50,240,1,1,1,1,0,0,0,1
4,LP002455,3,3859,0,96,360,1,1,1,1,0,0,1,0


### Bringing features onto the samle scale

In [98]:
X_training = training.drop(columns=['Loan_ID', 'Target'])
y_training = training['Target']
X_test = test.drop(columns=['Loan_ID', 'Target'])
y_test = test['Target']

In [99]:
min_max = MinMaxScaler()

In [103]:
min_max.fit(X_training)

MinMaxScaler(copy=True, feature_range=(0, 1))

In [105]:
train_minmax = min_max.transform(X_training)

In [106]:
test_minmax = min_max.transform(X_test)

In [107]:
knn.fit(training_minmax, y_training)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [108]:
accuracy_score(y_test, knn.predict(test_minmax))

0.6875