## Importing Packages

In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
%matplotlib inline

## Reading CSV

In [2]:
gastos_cartao = pd.read_csv("base_gastos_cartao.csv")

In [3]:
gastos_cartao

Unnamed: 0,Gastos_Cartao,Idade,Renda,Impostos,Segmento
0,510,35,1120,60,C
1,490,30,1120,60,C
2,470,32,1040,60,C
3,460,31,1200,60,C
4,500,36,1120,60,C
5,540,39,1360,120,C
6,460,34,1120,90,C
7,500,34,1200,60,C
8,440,29,1120,60,C
9,490,31,1200,30,C


In [4]:
gastos_cartao.shape

(150, 5)

In [5]:
gastos_cartao.head()

Unnamed: 0,Gastos_Cartao,Idade,Renda,Impostos,Segmento
0,510,35,1120,60,C
1,490,30,1120,60,C
2,470,32,1040,60,C
3,460,31,1200,60,C
4,500,36,1120,60,C


#### Import Train Test Split

In [6]:
from sklearn.model_selection import train_test_split

#### Creating matrix of features and target
#### Repair the categorical feature

In [7]:
X = pd.concat([gastos_cartao[['Idade', 'Renda', 'Impostos']], pd.get_dummies(gastos_cartao.Segmento)], axis=1)
y = gastos_cartao.Gastos_Cartao
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [8]:
print(X_train.shape)
print(X_test.shape)

(105, 6)
(45, 6)


## SVM

#### importing SVM module

In [9]:
from sklearn.svm import SVR

#### Creating a simple model

In [10]:
clf = SVR(C=1.0, epsilon=0.2)

In [11]:
clf.fit(X_train, y_train) 

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.2, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [12]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, clf.predict(X_test))
print("MSE: %.4f" % mse)

MSE: 6426.4504


#### Optimizing hyper-parameters

In [13]:
tuned_parameters = [{'kernel': ['rbf', 'linear'], 
                     'epsilon': [0.1, 0.001],
                     'C': [10, 1000]}]

In [None]:
from sklearn.model_selection import GridSearchCV
clf = GridSearchCV(SVR(), tuned_parameters, cv=5, scoring='r2')
clf.fit(X_train, y_train)

#### Printing performance

In [None]:
print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
        % (mean, std * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
mse = mean_squared_error(y_test, clf.predict(X_test))
print("MSE: %.4f" % mse)
print()