In [14]:
import csv
import numpy as np
import matplotlib.pyplot as plt
import ipywidgets as widgets
import seaborn as sns

## Data Import

In [15]:
with open('cleaned_data.csv', 'r', encoding="utf8") as inp:
    readdata = csv.reader(inp)
    datain = np.array(list(readdata))
    header = datain[0,] 
    data = datain[1:,] 
    
print(header)  
print(data[0,:])
print(data.shape)

['id' 'name' 'host_id' 'host_name' 'neighbourhood_group' 'neighbourhood'
 'latitude' 'longitude' 'room_type' 'price' 'minimum_nights'
 'number_of_reviews' 'last_review' 'reviews_per_month'
 'calculated_host_listings_count' 'availability_365']
['2539' 'Clean & quiet apt home by the park' '2787' 'John' 'Brooklyn'
 'Kensington' '40.64749' '-73.97237' 'Private room' '149' '1' '9'
 '2018-10-19T00:00:00Z' '0.21' '6' '365']
(28878, 16)


## Find and clear missing values

In [16]:
np.argwhere(data == "")

array([[  150,     3],
       [ 1464,     3],
       [ 2010,     1],
       [ 3235,     3],
       [ 3246,     1],
       [ 3359,     1],
       [ 3713,     3],
       [ 4665,     3],
       [ 5646,     3],
       [ 6926,     1],
       [ 7729,     1],
       [ 8060,     3],
       [ 8379,     3],
       [11270,     3],
       [17353,     1],
       [18519,     3]])

In [17]:
cleaned_data = np.delete(arr=data,obj=np.argwhere(data == ""),axis=0) 

In [18]:
np.argwhere(cleaned_data == "")

array([], shape=(0, 2), dtype=int64)

In [19]:
cleaned_data 

array([['2539', 'Clean & quiet apt home by the park', '2787', ...,
        '0.21', '6', '365'],
       ['5022', 'Entire Apt: Spacious Studio/Loft by central park',
        '7192', ..., '0.1', '1', '0'],
       ['5295', 'Beautiful 1br on Upper West Side', '7702', ..., '0.43',
        '1', '6'],
       ...,
       ['36438336', 'Seas The Moment', '211644523', ..., '1', '1', '87'],
       ['36442252', '1B-1B apartment near by Metro', '273841667', ...,
        '2', '1', '40'],
       ['36455809', 'Cozy Private Room in Bushwick, Brooklyn',
        '74162901', ..., '1', '1', '1']], dtype='<U161')

## Find float columns

In [20]:
numeric_index = np.char.isdigit(cleaned_data)
numeric_index

array([[ True, False,  True, ..., False,  True,  True],
       [ True, False,  True, ..., False,  True,  True],
       [ True, False,  True, ..., False,  True,  True],
       ...,
       [ True, False,  True, ...,  True,  True,  True],
       [ True, False,  True, ...,  True,  True,  True],
       [ True, False,  True, ...,  True,  True,  True]])

In [21]:
#Find total numbers of rows and columns
cleaned_data.shape

(28860, 16)

In [22]:
for i in range(np.shape(cleaned_data)[1]):
    try:
        cleaned_data[:,i].astype(float)
        print(i, "Float Column")
    except ValueError:
        continue

0 Float Column
2 Float Column
6 Float Column
7 Float Column
9 Float Column
10 Float Column
11 Float Column
13 Float Column
14 Float Column
15 Float Column


In [23]:
float_column = cleaned_data[:,[0,2,6,7,9,10,11,13,14,15]].astype(float)
float_column

array([[2.53900000e+03, 2.78700000e+03, 4.06474900e+01, ...,
        2.10000000e-01, 6.00000000e+00, 3.65000000e+02],
       [5.02200000e+03, 7.19200000e+03, 4.07985100e+01, ...,
        1.00000000e-01, 1.00000000e+00, 0.00000000e+00],
       [5.29500000e+03, 7.70200000e+03, 4.08031600e+01, ...,
        4.30000000e-01, 1.00000000e+00, 6.00000000e+00],
       ...,
       [3.64383360e+07, 2.11644523e+08, 4.05417900e+01, ...,
        1.00000000e+00, 1.00000000e+00, 8.70000000e+01],
       [3.64422520e+07, 2.73841667e+08, 4.08078700e+01, ...,
        2.00000000e+00, 1.00000000e+00, 4.00000000e+01],
       [3.64558090e+07, 7.41629010e+07, 4.06980500e+01, ...,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00]])

In [24]:
float_header = header[[0,2,6,7,9,10,11,13,14,15]]
float_header

array(['id', 'host_id', 'latitude', 'longitude', 'price',
       'minimum_nights', 'number_of_reviews', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365'],
      dtype='<U161')

## Histogram

In [26]:
dropdown = widgets.Dropdown(
    options=float_header,
    description='Columns:',
    disabled=False,
)
def plot_hist(column_name):
    idx = np.argwhere(float_header == column_name)[0][0]
    _ = plt.hist(float_column[:,[idx]],bins="auto")
    title = "Histogram with " + column_name
    plt.title(title)
    plt.show()
widgets.interact(plot_hist,column_name = dropdown)

  silent = bool(old_value == new_value)


interactive(children=(Dropdown(description='Columns:', options=('id', 'host_id', 'latitude', 'longitude', 'pri…

<function __main__.plot_hist>