In [1]:
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import kaggle

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder


from sklearn.pipeline import make_pipeline

# Univariate Feature Selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# Recursive Feature Selection
from sklearn.feature_selection import RFE

# Feature Selection :PCA
from sklearn.decomposition import PCA


from sklearn import svm

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import train_test_split

In [2]:
# MySelf
data = "/Users/sanjayk/Dropbox/Work/Careerera/Final/ML/data/"

In [3]:
#Modify this path according to your system path : Where you kept ur python notebook file

kaggle.api.authenticate()
kaggle.api.dataset_download_files("ashydv/housing-dataset", path=data, unzip=True)

In [4]:
df = pd.read_csv(data+'Housing.csv')
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [5]:
print(f'Size of the dataset : {df.shape}')

Size of the dataset : (545, 13)


In [6]:
# Data Processing Step:

In [7]:
# Is there any null Values: 
for column in df.columns:
    print(f'{column:20} : {df[column].dtypes }  : {df[column].isnull().sum()}')

price                : int64  : 0
area                 : int64  : 0
bedrooms             : int64  : 0
bathrooms            : int64  : 0
stories              : int64  : 0
mainroad             : object  : 0
guestroom            : object  : 0
basement             : object  : 0
hotwaterheating      : object  : 0
airconditioning      : object  : 0
parking              : int64  : 0
prefarea             : object  : 0
furnishingstatus     : object  : 0


In [8]:
# Centre Mean Algorithms for replacing null values : Mean/Median/Mode
# Else if number of rows are more then 50k
#df.dropna(inplace=True)

In [9]:
df.furnishingstatus.value_counts()

furnishingstatus
semi-furnished    227
unfurnished       178
furnished         140
Name: count, dtype: int64

In [10]:
# Converting the Categorical Variables
# Dependent/Target Variable : LabelEncoder
# Independent Variable : One-Hot-Enconding  (get_dummies) pd.mapping() yes/no
# Independent Variables : Ordinal Encoding : more than 2 when you can rank/value to each categorical variable

In [11]:
ytemp = df['furnishingstatus']
le = LabelEncoder()
le.fit(ytemp)
y = le.transform(ytemp)
y

array([0, 0, 1, 0, 0, 1, 1, 2, 0, 2, 0, 1, 1, 0, 1, 1, 2, 0, 0, 1, 1, 2,
       0, 0, 0, 0, 1, 1, 2, 1, 2, 1, 0, 2, 0, 0, 0, 0, 2, 1, 0, 0, 2, 1,
       0, 1, 0, 0, 2, 1, 2, 2, 0, 1, 1, 2, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 2, 0, 0, 1, 2, 2, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 2, 1,
       2, 0, 1, 0, 0, 1, 1, 1, 0, 1, 2, 2, 2, 2, 1, 0, 0, 2, 1, 2, 1, 1,
       1, 2, 0, 0, 0, 1, 2, 0, 0, 1, 0, 1, 1, 0, 0, 1, 2, 2, 0, 1, 2, 1,
       1, 2, 1, 2, 2, 1, 1, 0, 2, 1, 1, 2, 0, 0, 1, 1, 1, 1, 2, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 2, 2, 1, 0, 1, 1, 1, 2, 2, 2, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 0, 2, 0, 1, 2, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 0, 1, 1, 2, 2, 1, 0, 1, 2,
       1, 1, 1, 0, 2, 1, 1, 0, 1, 1, 1, 1, 0, 2, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 2, 1, 2, 1, 0, 1, 1, 1, 0, 1, 2, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 2, 1, 1, 2, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 2, 0, 2, 0, 0, 1,
       0, 2, 2, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 0, 1,

#### While one-hot encoding excels in handling nominal data without introducing bias, ordinal encoding is suitable for preserving order information among categories. By choosing the appropriate encoding technique based on the nature of the data, we ensure that our models make informed decisions and perform optimally.

    https://www.datacamp.com/tutorial/categorical-data

In [47]:
NX = df.drop(['furnishingstatus'],axis=1)
oe =  OrdinalEncoder()
oe.fit(NX)
X = oe.transform(NX)
xTrain, xTest, yTrain, yTest = train_test_split(X,y,train_size=0.80,random_state=42)
# Creating a object of Logistic Regression Class
clf = svm.SVC(C=0.01,kernel='linear')
#clf  = svm.SVC(C=1.0,kernel='rbf',gamma=0.001)
#clf  = svm.SVC(C=1.0,kernel='sigmoid',gamma=0.001)
#clf  = svm.SVC(C=1.0,kernel='poly',gamma=0.001,degree=4)
#clf  = svm.SVC(C=1.0,kernel='poly',gamma=0.001)
clf.fit(xTrain,yTrain)
#started : 6:16 PM
yPred = clf.predict(xTest)
print(f"SVM Regression Classification Report:")
print(classification_report(yTest, yPred))

SVM Regression Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        29
           1       0.46      0.85      0.60        39
           2       0.61      0.56      0.58        41

    accuracy                           0.51       109
   macro avg       0.36      0.47      0.39       109
weighted avg       0.39      0.51      0.43       109



In [48]:
NX = df.drop(['furnishingstatus'],axis=1)
ohe =  OneHotEncoder()
ohe.fit(NX)
X = ohe.transform(NX)
xTrain, xTest, yTrain, yTest = train_test_split(X,y,train_size=0.80,random_state=42)
# Creating a object of Logistic Regression Class
clf = svm.SVC(C=0.01,kernel='linear')
#clf  = svm.SVC(C=1.0,kernel='rbf',gamma=0.001)
#clf  = svm.SVC(C=1.0,kernel='sigmoid',gamma=0.001)
#clf  = svm.SVC(C=1.0,kernel='poly',gamma=0.001,degree=4)
#clf  = svm.SVC(C=1.0,kernel='poly',gamma=0.001)
clf.fit(xTrain,yTrain)
#started : 6:16 PM
yPred = clf.predict(xTest)
print(f"SVM Regression Classification Report:")
print(classification_report(yTest, yPred))

SVM Regression Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        29
           1       0.36      1.00      0.53        39
           2       0.00      0.00      0.00        41

    accuracy                           0.36       109
   macro avg       0.12      0.33      0.18       109
weighted avg       0.13      0.36      0.19       109



In [49]:
NX = df.drop(['furnishingstatus'],axis=1)
NX = pd.get_dummies(NX,dtype=int)
X = NX.values
xTrain, xTest, yTrain, yTest = train_test_split(X,y,train_size=0.80,random_state=42)
# Creating an object of Logistic Regression Class
clf = svm.SVC(C=0.01,kernel='linear')
#clf  = svm.SVC(C=1.0,kernel='rbf',gamma=0.001)
#clf  = svm.SVC(C=1.0,kernel='sigmoid',gamma=0.001)
#clf  = svm.SVC(C=1.0,kernel='poly',gamma=0.001,degree=4)
#clf  = svm.SVC(C=1.0,kernel='poly',gamma=0.001)
clf.fit(xTrain,yTrain)
#started : 6:16 PM
yPred = clf.predict(xTest)
print(f"SVM Regression Classification Report:")
print(classification_report(yTest, yPred))

SVM Regression Classification Report:
              precision    recall  f1-score   support

           0       0.42      0.48      0.45        29
           1       0.45      0.59      0.51        39
           2       0.68      0.41      0.52        41

    accuracy                           0.50       109
   macro avg       0.52      0.50      0.49       109
weighted avg       0.53      0.50      0.50       109



In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
parameters = {
              'kernel': ['linear', 'rbf','sigmoid','poly'],
              'C' : [ 0.01, 0.1, 1.0, 10.0]
             }

clf = svm.SVC()
gs = GridSearchCV(clf, parameters)
gs.fit(X,y)

# Get the best parameters and estimator
print(f'Best Parameter : {gs.best_params_}')
print(f'Best Estimator : {gs.best_estimator_}')

In [14]:
NX = df.drop(['furnishingstatus'],axis=1)
NX = pd.get_dummies(NX,dtype=int)
X = NX.values
xTrain, xTest, yTrain, yTest = train_test_split(X,y,train_size=0.80,random_state=42)

# Creating Pipeline with Univariate Feature Selection
clf = make_pipeline(SelectKBest(f_classif,k=4), svm.SVC(C=0.01,kernel='linear'))
#clf = svm.SVC(C=0.01,kernel='linear')
#clf  = svm.SVC(C=1.0,kernel='rbf',gamma=0.001)
#clf  = svm.SVC(C=1.0,kernel='sigmoid',gamma=0.001)
#clf  = svm.SVC(C=1.0,kernel='poly',gamma=0.001,degree=4)
#clf  = svm.SVC(C=1.0,kernel='poly',gamma=0.001)
clf.fit(xTrain,yTrain)
#started : 6:16 PM
yPred = clf.predict(xTest)
print(f"SVM Regression Classification Report:")
print(classification_report(yTest, yPred))

SVM Regression Classification Report:
              precision    recall  f1-score   support

           0       0.37      0.24      0.29        29
           1       0.50      0.59      0.54        39
           2       0.61      0.66      0.64        41

    accuracy                           0.52       109
   macro avg       0.49      0.50      0.49       109
weighted avg       0.51      0.52      0.51       109



In [None]:
#help(RFE)

In [None]:
NX = df.drop(['furnishingstatus'],axis=1)
NX = pd.get_dummies(NX,dtype=int)
X = NX.values
xTrain, xTest, yTrain, yTest = train_test_split(X,y,train_size=0.80,random_state=42)

# Creating Pipeline with Recursive FeatureSelection
clf = make_pipeline(RFE(estimator=svm.SVC(C=0.01,kernel='linear'), n_features_to_select=4), svm.SVC(C=0.01,kernel='linear'))
#clf = svm.SVC(C=0.01,kernel='linear')
#clf  = svm.SVC(C=1.0,kernel='rbf',gamma=0.001)
#clf  = svm.SVC(C=1.0,kernel='sigmoid',gamma=0.001)
#clf  = svm.SVC(C=1.0,kernel='poly',gamma=0.001,degree=4)
#clf  = svm.SVC(C=1.0,kernel='poly',gamma=0.001)
clf.fit(xTrain,yTrain)
#started : 6:16 PM
yPred = clf.predict(xTest)
print(f"SVM Regression Classification Report:")
print(classification_report(yTest, yPred))

In [None]:
pca = PCA(n_components=2)
NX = df.drop(['furnishingstatus'],axis=1)
NX = pd.get_dummies(NX,dtype=int)
X = NX.values
xTrain, xTest, yTrain, yTest = train_test_split(X,y,train_size=0.80,random_state=42)

# Creating Pipeline with PCA
clf = make_pipeline(PCA(n_components=4), svm.SVC(C=0.01,kernel='linear'))

#clf = svm.SVC(C=0.01,kernel='linear')
#clf  = svm.SVC(C=1.0,kernel='rbf',gamma=0.001)
#clf  = svm.SVC(C=1.0,kernel='sigmoid',gamma=0.001)
#clf  = svm.SVC(C=1.0,kernel='poly',gamma=0.001,degree=4)
#clf  = svm.SVC(C=1.0,kernel='poly',gamma=0.001)
clf.fit(xTrain,yTrain)
#started : 6:16 PM
yPred = clf.predict(xTest)
print(f"SVM Regression Classification Report:")
print(classification_report(yTest, yPred))