## Machine Learning Capstone Project (Indra Chatterjee)

##### Library Imports

In [1]:
import numpy as np   
from sklearn.linear_model import LinearRegression
import pandas as pd    
import matplotlib.pyplot as plt 
%matplotlib inline 
import seaborn as sns
from sklearn.model_selection import train_test_split # Sklearn package's randomized data splitting function

##### Utility Methods

In [2]:
# Utility that prints the size, shape and data types of a data set
def printShapeAndSize(houseDataSet):
    """Print a short structural summary of a data set.

    Parameters
    ----------
    houseDataSet : object exposing ``size``, ``shape`` and ``dtypes``
        The data set to summarise (a pandas DataFrame in this notebook).
        When it is ``None`` a single error line is printed instead.

    Returns
    -------
    None
        The summary is written to standard output only.
    """
    if houseDataSet is not None:
        # Each attribute is looked up right before its line is printed,
        # so the lines appear strictly in this order.
        summaryLines = (
            ('Size of Dataset  {}', 'size'),
            ('Shape of Dataset {}', 'shape'),
            ('Data Types :{}', 'dtypes'),
        )
        for template, attribute in summaryLines:
            print(template.format(getattr(houseDataSet, attribute)))
        print('----------------------------------------------------')
    else:
        print('Null Data set received : Error')

In [3]:
# Load the raw house-price records from the CSV file, then print their size, shape and dtypes.
housePriceData = pd.read_csv(filepath_or_buffer='innercity.csv')
printShapeAndSize(housePriceData)

Size of Dataset  497099
Shape of Dataset (21613, 23)
Data Types :cid                   int64
dayhours             object
price                 int64
room_bed              int64
room_bath           float64
living_measure        int64
lot_measure           int64
ceil                float64
coast                 int64
sight                 int64
condition             int64
quality               int64
ceil_measure          int64
basement              int64
yr_built              int64
yr_renovated          int64
zipcode               int64
lat                 float64
long                float64
living_measure15      int64
lot_measure15         int64
furnished             int64
total_area            int64
dtype: object
----------------------------------------------------


#### From the printed data types it can be observed that the `dayhours` field is of object type, which means it could contain null or non-numeric values.

In [4]:
# Preview the first 10 rows of the raw data set
housePriceData.head(n=10)

Unnamed: 0,cid,dayhours,price,room_bed,room_bath,living_measure,lot_measure,ceil,coast,sight,...,basement,yr_built,yr_renovated,zipcode,lat,long,living_measure15,lot_measure15,furnished,total_area
0,3034200666,20141107T000000,808100,4,3.25,3020,13457,1.0,0,0,...,0,1956,0,98133,47.7174,-122.336,2120,7553,1,16477
1,8731981640,20141204T000000,277500,4,2.5,2550,7500,1.0,0,0,...,800,1976,0,98023,47.3165,-122.386,2260,8800,0,10050
2,5104530220,20150420T000000,404000,3,2.5,2370,4324,2.0,0,0,...,0,2006,0,98038,47.3515,-121.999,2370,4348,0,6694
3,6145600285,20140529T000000,300000,2,1.0,820,3844,1.0,0,0,...,0,1916,0,98133,47.7049,-122.349,1520,3844,0,4664
4,8924100111,20150424T000000,699000,2,1.5,1400,4050,1.0,0,0,...,0,1954,0,98115,47.6768,-122.269,1900,5940,0,5450
5,5525400430,20140715T000000,585000,3,2.5,2050,11690,2.0,0,0,...,0,1989,0,98059,47.5279,-122.161,2410,10172,1,13740
6,2419600075,20141201T000000,465000,3,1.75,1480,6360,1.0,0,0,...,0,1954,0,98133,47.7311,-122.353,1480,6360,0,7840
7,114101161,20140829T000000,480000,3,1.5,2100,67269,1.0,0,0,...,880,1949,0,98028,47.7592,-122.23,1610,15999,0,69369
8,7011201550,20140707T000000,780000,4,2.0,2600,4800,1.0,0,2,...,1200,1953,0,98119,47.637,-122.371,2050,3505,0,7400
9,7203000640,20140918T000000,215000,4,1.0,1130,7400,1.0,0,0,...,0,1969,0,98003,47.3437,-122.316,1540,7379,0,8530


In [5]:
# Reports whether a column of a data frame holds a distinct value in every row
def isColumnUnique(dataframe=None, columnName=None):
    """Tell whether a column has as many distinct values as the frame has rows.

    As a side effect, prints the row count next to the column's
    distinct-value count.

    Parameters
    ----------
    dataframe : object with ``shape`` and label indexing
        The data to inspect (a pandas DataFrame in this notebook).
    columnName : column label
        Label of the column whose values are counted.

    Returns
    -------
    bool
        ``True`` when the number of distinct values equals the number of rows.

    Raises
    ------
    ValueError
        If ``dataframe`` or ``columnName`` is ``None``.
    """
    if any(argument is None for argument in (dataframe, columnName)):
        raise ValueError('Dataframe or columnname cannot be null or empty')
    rowsInDf = dataframe.shape[0]
    uniqueRowsInColumn = len(pd.unique(dataframe[columnName]))
    print(f'Rows in DF {rowsInDf} and Count of unique values in {columnName} is {uniqueRowsInColumn}')
    # The comparison itself is the answer; no branching needed.
    return rowsInDf == uniqueRowsInColumn

In [6]:
# Many variables in the data above may affect the price, so their correlation with it must be found.
# Start by comparing each column's distinct-value count with the number of rows.
for column in housePriceData.columns:
    isColumnUnique(housePriceData, column)

# cid has almost as many distinct values as the data set has rows, so it is dropped.
houseDf = housePriceData.drop("cid", axis="columns")
houseDf.head(n=1)

Rows in DF 21613 and Count of unique values in cid is 21436
Rows in DF 21613 and Count of unique values in dayhours is 372
Rows in DF 21613 and Count of unique values in price is 3625
Rows in DF 21613 and Count of unique values in room_bed is 13
Rows in DF 21613 and Count of unique values in room_bath is 30
Rows in DF 21613 and Count of unique values in living_measure is 1038
Rows in DF 21613 and Count of unique values in lot_measure is 9782
Rows in DF 21613 and Count of unique values in ceil is 6
Rows in DF 21613 and Count of unique values in coast is 2
Rows in DF 21613 and Count of unique values in sight is 5
Rows in DF 21613 and Count of unique values in condition is 5
Rows in DF 21613 and Count of unique values in quality is 12
Rows in DF 21613 and Count of unique values in ceil_measure is 946
Rows in DF 21613 and Count of unique values in basement is 306
Rows in DF 21613 and Count of unique values in yr_built is 116
Rows in DF 21613 and Count of unique values in yr_renovated is 70

Unnamed: 0,dayhours,price,room_bed,room_bath,living_measure,lot_measure,ceil,coast,sight,condition,...,basement,yr_built,yr_renovated,zipcode,lat,long,living_measure15,lot_measure15,furnished,total_area
0,20141107T000000,808100,4,3.25,3020,13457,1.0,0,0,5,...,0,1956,0,98133,47.7174,-122.336,2120,7553,1,16477


In [7]:
# Summary statistics of houseDf, transposed so that each column becomes one row
houseDf.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
price,21613.0,540182.158793,367362.231718,75000.0,321950.0,450000.0,645000.0,7700000.0
room_bed,21613.0,3.370842,0.930062,0.0,3.0,3.0,4.0,33.0
room_bath,21613.0,2.114757,0.770163,0.0,1.75,2.25,2.5,8.0
living_measure,21613.0,2079.899736,918.440897,290.0,1427.0,1910.0,2550.0,13540.0
lot_measure,21613.0,15106.967566,41420.511515,520.0,5040.0,7618.0,10688.0,1651359.0
ceil,21613.0,1.494309,0.539989,1.0,1.0,1.5,2.0,3.5
coast,21613.0,0.007542,0.086517,0.0,0.0,0.0,0.0,1.0
sight,21613.0,0.234303,0.766318,0.0,0.0,0.0,0.0,4.0
condition,21613.0,3.40943,0.650743,1.0,3.0,3.0,4.0,5.0
quality,21613.0,7.656873,1.175459,1.0,7.0,7.0,8.0,13.0


#### Visualization

In [None]:
# Seaborn pair plot of houseDf with KDE curves on the diagonal
# NOTE(review): this cell has no recorded execution count; plotting every column pair
# of the full frame may be slow — consider a column subset or a row sample.
sns.pairplot(data=houseDf, diag_kind='kde')