In [74]:
from sklearn.svm import SVC
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
import numpy as np

In [19]:
df = pd.read_csv('abalone_csv.csv')
# Take first hundred data
abalone = df[:100]
abalone.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Class_number_of_rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [20]:
# Perform dummy
dummy = pd.get_dummies(abalone['Sex'])
# Prevent dummy trap
dummy.drop(columns=dummy.columns[-1],inplace=True)

# Concat for new data
abalone.drop(columns='Sex', inplace=True)
new_abalone = pd.concat([abalone,dummy], axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [21]:
new_abalone.head()

Unnamed: 0,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Class_number_of_rings,F,I
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,0,0
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,0,0
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,1,0
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,0,0
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,0,1


In [62]:
# Split into X and y
X = new_abalone.drop(columns='Class_number_of_rings')
y = new_abalone['Class_number_of_rings']

In [70]:
# Define model and grid search
GS = GridSearchCV(SVC(gamma='auto'), {
    'C': [1,10,20],
    'kernel': ['rbf','linear']
}, cv=5, return_train_score=True)
GS.fit(X, y)
Score = pd.DataFrame(GS.cv_results_)[['param_C','param_kernel','mean_test_score']]



In [71]:
# Full score
Score

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.18
1,1,linear,0.17
2,10,rbf,0.18
3,10,linear,0.2
4,20,rbf,0.19
5,20,linear,0.2


Randomized Grid Search

In [78]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
# define setting for models
model_params = {
    'svm': {
        'model': SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf','linear']
        }  
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params' : {
            'n_estimators': [1,5,10]
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'C': [1,5,10]
        }
    }
}

In [80]:
scores = []

for model_name, mp in model_params.items():
    RGS =  RandomizedSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    RGS.fit(X, y)
    scores.append({
        'model': model_name,
        'best_score': RGS.best_score_,
        'best_params': RGS.best_params_
    })
    
all_score = pd.DataFrame(scores,columns=['model','best_score','best_params'])



In [81]:
all_score

Unnamed: 0,model,best_score,best_params
0,svm,0.2,"{'kernel': 'linear', 'C': 10}"
1,random_forest,0.2,{'n_estimators': 5}
2,logistic_regression,0.21,{'C': 1}
