#### Code to get CPU or GPU information

In [0]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = "99"
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 13541927250977662618, name: "/device:XLA_CPU:0"
 device_type: "XLA_CPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 9763813298780688455
 physical_device_desc: "device: XLA_CPU device"]

In [0]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import itertools
import matplotlib.pyplot as plt
from pylab import rcParams 
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import minmax_scale



TensorFlow will tell you all messages that have the label INFO 

In [0]:
tf.logging.set_verbosity(tf.logging.INFO)


In [0]:
sess = tf.InteractiveSession()

In [0]:
train = pd.read_csv("/content/sample_data/california_housing_train.csv")
print("Shape of all training data with features: ",train.shape)

Shape of all training data with features:  (17000, 9)


In [0]:
train.head(8)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0
5,-114.58,33.63,29.0,1387.0,236.0,671.0,239.0,3.3438,74000.0
6,-114.58,33.61,25.0,2907.0,680.0,1841.0,633.0,2.6768,82400.0
7,-114.59,34.83,41.0,812.0,168.0,375.0,158.0,1.7083,48500.0


In [0]:
train = train.select_dtypes(exclude=['object'])
print("shape of training data with numerical features: ",train.shape)

shape of training data with numerical features:  (17000, 9)


In [0]:
train.fillna(0, inplace= True )

In [0]:
test = pd.read_csv("/content/sample_data/california_housing_test.csv")
test.head(5)
test = test.select_dtypes(exclude=['object'])
test.fillna(0,inplace=True)



In [0]:
print("shape of Test data with numerical features: ", test.shape)
print("shape of training data with numerical features: ", train.shape)

shape of Test data with numerical features:  (3000, 9)
shape of training data with numerical features:  (17000, 9)


In [0]:
print("list of features contained in dataset: ", list(train.columns))

list of features contained in dataset:  ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value']


# Isolate outliers with IsolationForest

In [0]:
from sklearn.ensemble import IsolationForest

In [0]:
clf = IsolationForest(max_samples=100, random_state=40)
clf.fit(train)
y_nano = clf.predict(train)
y_nano = pd.DataFrame(y_nano, columns= ['TOP'])
y_nano[y_nano['TOP'] == 1].index.values

train = train.iloc[y_nano[y_nano['TOP'] == 1].index.values]
train.reset_index(drop= True, inplace= True)

print("No. of Outliers:",y_nano[y_nano['TOP'] == -1].shape[0] )
print("No. of rows without Outliers:",train.shape[0] )




No. of Outliers: 1700
No. of rows without Outliers: 15300


In [0]:
clf1 = IsolationForest(max_samples= 1000, random_state = 10)
clf1.fit(train)
ynano = clf1.predict(train)
ynano = pd.DataFrame(ynano, columns= ['TOP'])
ynano[ynano['TOP'] == 1].index.values

train = train.iloc[ynano[ynano['TOP'] == 1].index.values]
train.reset_index(drop= True, inplace= True)


print("No. of Outliers:",ynano[ynano['TOP'] == -1].shape[0] )
print("No. of rows without Outliers:",train.shape[0] )



No. of Outliers: 1700
No. of rows without Outliers: 15300


In [0]:
train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
1,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0
2,-114.58,33.63,29.0,1387.0,236.0,671.0,239.0,3.3438,74000.0
3,-114.58,33.61,25.0,2907.0,680.0,1841.0,633.0,2.6768,82400.0
4,-114.6,33.62,16.0,3741.0,801.0,2434.0,824.0,2.6797,86500.0


In [0]:
print(ynano[ynano['TOP'] == 1].index.values)
print(ynano.head)

[    3     4     5 ... 16979 16982 16996]
<bound method NDFrame.head of        TOP
0       -1
1       -1
2       -1
3        1
4        1
5        1
6        1
7       -1
8       -1
9       -1
10       1
11       1
12      -1
13       1
14      -1
15       1
16       1
17      -1
18      -1
19      -1
20      -1
21      -1
22      -1
23       1
24      -1
25      -1
26       1
27       1
28      -1
29       1
...    ...
16970   -1
16971   -1
16972   -1
16973   -1
16974    1
16975    1
16976   -1
16977    1
16978   -1
16979    1
16980   -1
16981   -1
16982    1
16983   -1
16984   -1
16985   -1
16986   -1
16987   -1
16988   -1
16989   -1
16990   -1
16991   -1
16992   -1
16993   -1
16994   -1
16995   -1
16996    1
16997   -1
16998   -1
16999   -1

[17000 rows x 1 columns]>


In [0]:
train.head(4)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
1,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0
2,-114.58,33.63,29.0,1387.0,236.0,671.0,239.0,3.3438,74000.0
3,-114.58,33.61,25.0,2907.0,680.0,1841.0,633.0,2.6768,82400.0


#MinMaxScalling

In [0]:
import warnings
warnings.filterwarnings('ignore')

In [0]:
col_train = list(train.columns)
col_train_bis = list(train.columns)


In [0]:
col_train_bis.remove('median_house_value')

In [0]:
mat_train = np.matrix(train)
mat_test = np.matrix(test)
mat_new = np.matrix(train.drop('median_house_value', axis=1))
mat_y = np.array(train.median_house_value).reshape((15300,1))

In [0]:
from sklearn.preprocessing import MinMaxScaler

In [0]:
prepro_y = MinMaxScaler()
prepro_y.fit(mat_y)

prepro = MinMaxScaler()
prepro.fit(mat_train)



MinMaxScaler(copy=True, feature_range=(0, 1))

In [0]:

prepro_test = MinMaxScaler()
prepro_test.fit(mat_new)

MinMaxScaler(copy=True, feature_range=(0, 1))

In [0]:
train_num_scale = pd.DataFrame(prepro.transform(mat_train),columns = col_train)
test_num_scale = pd.DataFrame(prepro_test.transform(mat_new),columns = col_train_bis)

In [0]:
print(mat_train)

[[-1.1457e+02  3.3640e+01  1.4000e+01 ...  2.2600e+02  3.1917e+00
   7.3400e+04]
 [-1.1457e+02  3.3570e+01  2.0000e+01 ...  2.6200e+02  1.9250e+00
   6.5500e+04]
 [-1.1458e+02  3.3630e+01  2.9000e+01 ...  2.3900e+02  3.3438e+00
   7.4000e+04]
 ...
 [-1.2418e+02  4.0780e+01  3.7000e+01 ...  3.1000e+02  2.5536e+00
   7.0200e+04]
 [-1.2418e+02  4.0620e+01  3.5000e+01 ...  1.7900e+02  3.0536e+00
   1.0700e+05]
 [-1.2427e+02  4.0690e+01  3.6000e+01 ...  4.6500e+02  2.5179e+00
   7.9000e+04]]


In [0]:
print(mat_train.shape)
print(mat_new.shape)


(15300, 9)
(15300, 8)


In [0]:
print(test_num_scale)

       longitude  latitude  ...  households  median_income
0       1.000000  0.115957  ...    0.139744       0.225750
1       1.000000  0.108511  ...    0.162821       0.119517
2       0.998969  0.114894  ...    0.148077       0.238506
3       0.998969  0.112766  ...    0.400641       0.182568
4       0.996907  0.113830  ...    0.523077       0.182811
5       0.996907  0.111702  ...    0.275000       0.094358
6       0.995876  0.242553  ...    0.301923       0.227453
7       0.991753  0.248936  ...    0.251923       0.100572
8       0.991753  0.111702  ...    0.158974       0.206763
9       0.957732  0.135106  ...    0.028205       0.227260
10      0.917526  0.028723  ...    0.209615       0.257770
11      0.917526  0.028723  ...    0.162179       0.120566
12      0.917526  0.027660  ...    0.171154       0.155949
13      0.914433  0.032979  ...    0.105769       0.236359
14      0.913402  0.046809  ...    0.119231       0.243538
15      0.906186  0.026596  ...    0.148077       0.1775

In [0]:
train[col_train_num] = pd.DataFrame(prepro.transform(mat_train), columns= col_train)
test[col_train_bis] = test_num_scale 

NameError: ignored