In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVR
from sklearn.impute import SimpleImputer

# Real Estate Dataset

In [2]:
df_re = pd.read_csv('./real estate dataset/real_estate_price.csv')
df_re.head()

Unnamed: 0,price,size,year,view
0,234314.144,643.09,2015,No sea view
1,228581.528,656.22,2009,No sea view
2,281626.336,487.29,2018,Sea view
3,401255.608,1504.75,2015,No sea view
4,458674.256,1275.46,2009,Sea view


In [3]:
df_re.isnull().sum()

price    0
size     0
year     0
view     0
dtype: int64

In [4]:
enc1 = preprocessing.OrdinalEncoder()
df_re['view'] = enc1.fit_transform(df_re[['view']])

enc = preprocessing.OneHotEncoder(sparse=False)
year = df_re[['year']]
year_enc = enc.fit_transform(year)
df_year = pd.DataFrame(year_enc, columns = enc.get_feature_names_out())
df_re[list(enc.get_feature_names_out())] = df_year[list(enc.get_feature_names_out())]
df_re.drop(columns=['year'], inplace=True)
df_re.head()

Unnamed: 0,price,size,view,year_2006,year_2009,year_2015,year_2018
0,234314.144,643.09,0.0,0.0,0.0,1.0,0.0
1,228581.528,656.22,0.0,0.0,1.0,0.0,0.0
2,281626.336,487.29,1.0,0.0,0.0,0.0,1.0
3,401255.608,1504.75,0.0,0.0,0.0,1.0,0.0
4,458674.256,1275.46,1.0,0.0,1.0,0.0,0.0


In [5]:
std = preprocessing.StandardScaler()
df_re['size'] = std.fit_transform(df_re[['size']])
df_re.head()

Unnamed: 0,price,size,view,year_2006,year_2009,year_2015,year_2018
0,234314.144,-0.708164,0.0,0.0,0.0,1.0,0.0
1,228581.528,-0.663873,0.0,0.0,1.0,0.0,0.0
2,281626.336,-1.233719,1.0,0.0,0.0,0.0,1.0
3,401255.608,2.198445,0.0,0.0,0.0,1.0,0.0
4,458674.256,1.424989,1.0,0.0,1.0,0.0,0.0


In [6]:
X = df_re.drop(columns=['price'])
y = df_re['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [7]:
%%time
param = {
    'C': [0.001,0.01,0.1, 1, 10, 100, 1000],
    'kernel':['rbf','linear'],
    'gamma': [0.001,0.01,0.1, 1, 10, 100, 1000]
    
}
cv = RepeatedKFold(n_splits=10, n_repeats=20)

est = SVR()
grid = GridSearchCV(est, param, n_jobs=15, cv=cv, return_train_score=True, verbose=3)
grid.fit(X_train, y_train)

print(grid.best_score_)
print(grid.best_estimator_)

Fitting 200 folds for each of 98 candidates, totalling 19600 fits
0.6231233481100132
SVR(C=1000, gamma=0.001, kernel='linear')
Wall time: 12.2 s


In [8]:
svm = SVR(C=1000, gamma=0.001, kernel='linear')
svm.fit(X_train, y_train)
print('Training Accuracy: ', cross_val_score(svm, X_train, y_train, cv=cv, n_jobs=15).mean())
print('Testing Accuracy: ', svm.score(X_test,y_test))

Training Accuracy:  0.6265493052832952
Testing Accuracy:  0.6490473168574904


In [9]:
svm = LinearSVR(C=1000)
svm.fit(X_train, y_train)
print('Training Accuracy: ', cross_val_score(svm, X_train, y_train, cv=cv, n_jobs=15).mean())
print('Testing Accuracy: ', svm.score(X_test,y_test))

Training Accuracy:  -9.984250564077394
Testing Accuracy:  -3.267959427337855


# Titanic Dataset

In [10]:
df_ti = pd.read_csv("./titanic dataset/train.csv")
df_ti.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [11]:
df_ti.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [12]:
df_ti.drop(columns=['PassengerId','Name', 'Ticket', 'Cabin'],inplace=True)
df_ti.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [13]:
imp_mean = SimpleImputer(strategy='mean')
df_ti['Age'] = imp_mean.fit_transform(df_ti[['Age']])

imp_mf = SimpleImputer(strategy='most_frequent')
df_ti['Embarked'] = imp_mf.fit_transform(df_ti[['Embarked']])
df_ti.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [14]:
enc = preprocessing.OneHotEncoder(sparse=False)
emsex = df_ti[['Sex','Embarked']]
emsex_enc = enc.fit_transform(emsex)

df_ti[list(enc.get_feature_names_out())] = pd.DataFrame(emsex_enc, columns =  enc.get_feature_names_out())
df_ti.drop(columns=['Sex', 'Embarked'], inplace=True)
df_ti.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.25,0.0,1.0,0.0,0.0,1.0
1,1,1,38.0,1,0,71.2833,1.0,0.0,1.0,0.0,0.0
2,1,3,26.0,0,0,7.925,1.0,0.0,0.0,0.0,1.0
3,1,1,35.0,1,0,53.1,1.0,0.0,0.0,0.0,1.0
4,0,3,35.0,0,0,8.05,0.0,1.0,0.0,0.0,1.0


In [15]:
std = preprocessing.StandardScaler()
df_ti['Age'] = std.fit_transform(df_ti[['Age']])
df_ti['Fare'] = std.fit_transform(df_ti[['Fare']])
df_ti.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,-0.592481,1,0,-0.502445,0.0,1.0,0.0,0.0,1.0
1,1,1,0.638789,1,0,0.786845,1.0,0.0,1.0,0.0,0.0
2,1,3,-0.284663,0,0,-0.488854,1.0,0.0,0.0,0.0,1.0
3,1,1,0.407926,1,0,0.42073,1.0,0.0,0.0,0.0,1.0
4,0,3,0.407926,0,0,-0.486337,0.0,1.0,0.0,0.0,1.0


In [16]:
X = df_ti.drop(columns=['Survived'])
y = df_ti['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [17]:
%%time
param = {
    'C': [0.001,0.01,0.1, 1, 10, 100, 1000],
    'kernel':['rbf','linear'],
    'gamma': [0.001,0.01,0.1, 1, 10, 100, 1000]
    
}
cv = RepeatedKFold(n_splits=10, n_repeats=20)

est = SVR()
grid = GridSearchCV(est, param, n_jobs=15, cv=5, return_train_score=True, verbose=3)
grid.fit(X_train, y_train)

print(grid.best_score_)
print(grid.best_estimator_)
print(grid.best_params_)

Fitting 5 folds for each of 98 candidates, totalling 490 fits
0.3776877586523156
SVR(C=1, gamma=0.1)
{'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
Wall time: 1min 40s


In [18]:
svm = SVR(C=1, gamma=0.1, kernel='rbf')
svm.fit(X_train, y_train)
print('Training Accuracy: ', cross_val_score(svm, X_train, y_train, cv=cv, n_jobs=15).mean())
print('Testing Accuracy: ', svm.score(X_test,y_test))

Training Accuracy:  0.38708831792148873
Testing Accuracy:  0.3959109256819301


# Student Grade Prediction Dataset

In [12]:
df_sg = pd.read_csv('./student grade dataset/student-mat.csv')
df_sg.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [13]:
cols = ['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']

enc = preprocessing.OneHotEncoder(sparse=False)
df_preenc = df_sg[cols]
df_enc = enc.fit_transform(df_preenc)
df_enc = pd.DataFrame(df_enc, columns=enc.get_feature_names_out())

df_sg.drop(columns=cols, inplace=True)
df_sg[list(enc.get_feature_names_out())] = df_enc[list(enc.get_feature_names_out())]
g3 = df_sg['G3']
df_sg.drop(columns=['G3'], inplace=True)
df_sg['G3'] = g3
df_sg.head()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,...,activities_yes,nursery_no,nursery_yes,higher_no,higher_yes,internet_no,internet_yes,romantic_no,romantic_yes,G3
0,18,4,4,2,2,0,4,3,4,1,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,6
1,17,1,1,1,2,0,5,3,3,1,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,6
2,15,1,1,1,2,3,4,3,2,2,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,10
3,15,4,2,1,3,0,3,2,2,1,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,15
4,16,3,3,1,2,0,4,3,2,1,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,10


In [14]:
X = df_sg.drop(columns = ['G3'])
y = df_sg['G3']

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2)

In [22]:
%%time
param = {
    'C': [0.001,0.01,0.1, 1, 10, 100, 1000],
    'kernel':['rbf','linear'],
    'gamma': [0.001,0.01,0.1, 1, 10, 100, 1000]
}

cv = RepeatedKFold(n_splits=5, n_repeats=10)

est = SVR()
grid = GridSearchCV(est, param, n_jobs=15, cv=5, return_train_score=True, verbose=3)
grid.fit(X_train, y_train)

print(grid.best_score_)
print(grid.best_estimator_)
print(grid.best_params_)

Fitting 5 folds for each of 98 candidates, totalling 490 fits
0.8486864764123851
SVR(C=10, gamma=0.01)
{'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
Wall time: 8min 58s


In [23]:
svm = SVR(C=10, gamma=0.01, kernel='rbf')
svm.fit(X_train, y_train)
print('Training Accuracy: ', cross_val_score(svm, X_train, y_train, cv=cv, n_jobs=15).mean())
print('Testing Accuracy: ', svm.score(X_test,y_test))

Training Accuracy:  0.8400237257296012
Testing Accuracy:  0.8513278184098974


# Mercedez Benz greener manufacturing dataset

#### dataset - https://www.kaggle.com/competitions/mercedes-benz-greener-manufacturing/data

In [15]:
df_mb = pd.read_csv('./mercedez dataset/train.csv')
df_mb

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,8405,107.39,ak,s,as,c,d,aa,d,q,...,1,0,0,0,0,0,0,0,0,0
4205,8406,108.77,j,o,t,d,d,aa,h,h,...,0,1,0,0,0,0,0,0,0,0
4206,8412,109.22,ak,v,r,a,d,aa,g,e,...,0,0,1,0,0,0,0,0,0,0
4207,8415,87.48,al,r,e,f,d,aa,l,u,...,0,0,0,0,0,0,0,0,0,0


In [16]:
df_mb.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [17]:
df_mb.isnull().sum()

ID      0
y       0
X0      0
X1      0
X2      0
       ..
X380    0
X382    0
X383    0
X384    0
X385    0
Length: 378, dtype: int64

In [18]:
cols = ['X0','X1','X2','X3','X4','X5','X6','X8']
enc = preprocessing.OneHotEncoder(sparse=False)
cat_data = df_mb[cols]
cat_data_enc = enc.fit_transform(cat_data)

df_mb[list(enc.get_feature_names_out())] = pd.DataFrame(cat_data_enc, columns = enc.get_feature_names_out())
df_mb.drop(columns=cols, inplace=True)
df_mb.head()

  self[k1] = value[k2]


Unnamed: 0,ID,y,X10,X11,X12,X13,X14,X15,X16,X17,...,X8_p,X8_q,X8_r,X8_s,X8_t,X8_u,X8_v,X8_w,X8_x,X8_y
0,0,130.81,0,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,6,88.53,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7,76.26,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,9,80.62,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,13,78.02,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
X = df_mb.drop(columns=['ID','y'])
y = df_mb['y']
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2)

In [29]:
%%time
param = {
    'C': [0.001,0.01,0.1, 1, 10, 100, 1000],
    'kernel':['rbf','linear'],
    'gamma': [0.001,0.01,0.1, 1, 10, 100, 1000]
}

cv = RepeatedKFold(n_splits=5, n_repeats=10)

est = SVR()
grid = GridSearchCV(est, param, n_jobs=6, cv=5, return_train_score=True, verbose=3)
grid.fit(X_train, y_train)

print(grid.best_score_)
print(grid.best_estimator_)
print(grid.best_params_)

Fitting 5 folds for each of 98 candidates, totalling 490 fits
0.5401682245748921
SVR(C=100, gamma=0.001)
{'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
Wall time: 3h 27min 22s


In [20]:
%%time
svm = SVR(C=100, gamma=0.001, kernel='rbf')
svm.fit(X_train, y_train)
cv = RepeatedKFold(n_splits=5, n_repeats=10)
print('Training Accuracy: ', cross_val_score(svm, X_train, y_train, cv=5, n_jobs=15).mean())
print('Testing Accuracy: ', svm.score(X_test,y_test))

Training Accuracy:  0.5267782988809595
Testing Accuracy:  0.578169492714204
Wall time: 12.7 s


In [22]:
param = {
    'C': [0.001,0.01,0.1, 1, 10, 100, 1000]
}

cv = RepeatedKFold(n_splits=5, n_repeats=10)

est = LinearSVR(max_iter=1000000)
grid = GridSearchCV(est, param, n_jobs=15, cv=5, return_train_score=True, verbose=3)
grid.fit(X_train, y_train)

print(grid.best_score_)
print(grid.best_estimator_)
print(grid.best_params_)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
0.5243317750675912
LinearSVR(C=10, max_iter=1000000)
{'C': 10}


In [None]:
svm = LinearSVR(C=)
svm.fit(X_train, y_train)
print('Training Accuracy: ', cross_val_score(svm, X_train, y_train, cv=cv, n_jobs=15).mean())
print('Testing Accuracy: ', svm.score(X_test,y_test))