In [86]:
import numpy as np
import pandas as pd
import sklearn


In [127]:
from sklearn.preprocessing import MinMaxScaler , OneHotEncoder , LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error,accuracy_score

In [88]:
df = pd.read_csv("./datasets/housePrice.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4600 non-null   object 
 1   price          4600 non-null   float64
 2   bedrooms       4600 non-null   float64
 3   bathrooms      4600 non-null   float64
 4   sqft_living    4600 non-null   int64  
 5   sqft_lot       4600 non-null   int64  
 6   floors         4600 non-null   float64
 7   waterfront     4600 non-null   int64  
 8   view           4600 non-null   int64  
 9   condition      4600 non-null   int64  
 10  sqft_above     4600 non-null   int64  
 11  sqft_basement  4600 non-null   int64  
 12  yr_built       4600 non-null   int64  
 13  yr_renovated   4600 non-null   int64  
 14  street         4600 non-null   object 
 15  city           4600 non-null   object 
 16  statezip       4600 non-null   object 
 17  country        4600 non-null   object 
dtypes: float

In [89]:
df = df.drop(columns=['date'])

In [90]:
X = df.drop(columns=['price'])
y = df['price']

In [91]:
numerical_features = X.select_dtypes(exclude=["object"]).columns
categorical_features = X.select_dtypes(include=["object"]).columns
print(numerical_features)
print(categorical_features)

Index(['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'sqft_above', 'sqft_basement',
       'yr_built', 'yr_renovated'],
      dtype='object')
Index(['street', 'city', 'statezip', 'country'], dtype='object')


In [92]:
numerical_pipe = Pipeline([('impute' , SimpleImputer(strategy='mean')) , ('minMax' , MinMaxScaler())])
categorical_pipe = Pipeline([('impute', SimpleImputer(strategy='most_frequent')) , ('oe' , OneHotEncoder(handle_unknown='ignore'))])
preprocessing_pipe = ColumnTransformer([('numerical' , numerical_pipe , numerical_features) , ('categorical',categorical_pipe , categorical_features)])
X = preprocessing_pipe.fit_transform(X)


In [8]:
X_train,X_test,y_train,y_test = train_test_split(X , y , test_size=0.2,random_state=0)

In [9]:
model = LinearRegression()
model.fit(X_train,y_train)


LinearRegression()

In [10]:
y_pred = model.predict(X_test)
mean_absolute_percentage_error(y_pred , y_test)

0.41450178506635965

In [11]:
model.score(X_test , y_test)

0.6048223049324686

## RBF


In [130]:
from sklearn.cluster import KMeans
import math

In [131]:
df = pd.read_csv("./datasets/cancer.csv")
df = df.drop(columns=['id'])

In [132]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   diagnosis                569 non-null    object 
 1   radius_mean              569 non-null    float64
 2   texture_mean             569 non-null    float64
 3   perimeter_mean           569 non-null    float64
 4   area_mean                569 non-null    float64
 5   smoothness_mean          569 non-null    float64
 6   compactness_mean         569 non-null    float64
 7   concavity_mean           569 non-null    float64
 8   concave points_mean      569 non-null    float64
 9   symmetry_mean            569 non-null    float64
 10  fractal_dimension_mean   569 non-null    float64
 11  radius_se                569 non-null    float64
 12  texture_se               569 non-null    float64
 13  perimeter_se             569 non-null    float64
 14  area_se                  5

In [133]:
X = df.drop(columns=['diagnosis'])
y = df['diagnosis']

In [134]:
numerical_features = X.select_dtypes(exclude=["object"]).columns
categorical_features = X.select_dtypes(include=["object"]).columns
print(numerical_features)
print(categorical_features)

Index(['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')
Index([], dtype='object')


In [135]:
numerical_pipe = Pipeline([('impute' , SimpleImputer(strategy='mean')) , ('minMax' , MinMaxScaler())])
categorical_pipe = Pipeline([('impute', SimpleImputer(strategy='most_frequent')) , ('oe' , OneHotEncoder(handle_unknown='ignore'))])
preprocessing_pipe = ColumnTransformer([('numerical' , numerical_pipe , numerical_features) , ('categorical',categorical_pipe , categorical_features)])
X = preprocessing_pipe.fit_transform(X)
y = LabelEncoder().fit_transform(y)

In [136]:
X_train,X_test,y_train,y_test = train_test_split(X , y , test_size=0.2,random_state=0)

In [138]:
K_cent= 2
km= KMeans(n_clusters= K_cent, max_iter= 100)
km.fit(X_train)
cent= km.cluster_centers_

In [139]:

max=0 
for i in range(K_cent):
	for j in range(K_cent):
		d= np.linalg.norm(cent[i]-cent[j])
		if(d> max):
			max= d
d= max

sigma= d/math.sqrt(2*K_cent)
     


In [140]:
shape= X_train.shape
row= shape[0]
column= K_cent
G= np.empty((row,column), dtype= float)

In [141]:


for i in range(row):
  for j in range(column):
    dist= np.linalg.norm(X_train[i]-cent[j])
    G[i][j]= math.exp(-math.pow(dist,2)/math.pow(2*sigma,2))
     


In [142]:

GTG= np.dot(G.T,G)
GTG_inv= np.linalg.inv(GTG)
fac= np.dot(GTG_inv,G.T)
W= np.dot(fac,y_train)

In [143]:


row= X_test.shape[0]
column= K_cent
G_test= np.empty((row,column), dtype= float)
for i in range(row):
	for j in range(column):
		dist= np.linalg.norm(X_test[i]-cent[j])
		G_test[i][j]= math.exp(-math.pow(dist,2)/math.pow(2*sigma,2))
     


In [146]:
prediction= np.dot(G_test,W)
prediction= 0.5*(np.sign(prediction-0.5)+1)

# score= mean_absolute_percentage_error(prediction,y_test)
print(accuracy_score(prediction , y_test))

0.868421052631579
