In [11]:
import pickle
import urllib
import urllib.request

import pandas as pd
from sklearn.datasets import load_boston
from sklearn.datasets import fetch_mldata
from sklearn.datasets import load_svmlight_file
from sklearn.datasets import fetch_covtype
from sklearn.datasets import fetch_20newsgroups

#pickle.dump(mnist, open( "mnist.pickle", "wb" ))

# One-off dataset downloads. Each boolean flag gates one dataset so that a
# normal "Run All" only performs the steps that are switched on. Every file is
# written under ../datasets/ and every handle is opened in a `with` block so it
# is flushed and closed even if the download or serialisation raises.

# ijcnn1: raw bz2 archive from the LIBSVM dataset collection.
download_ijcnn1 = False
if download_ijcnn1:
    target_page = 'http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/ijcnn1.bz2'
    with urllib.request.urlopen(target_page) as response:
        with open('../datasets/ijcnn1.bz2', 'wb') as W:
            W.write(response.read())

# cadata: parsed directly from the HTTP response (svmlight format) and pickled.
download_pickle = False
if download_pickle:
    target_page = 'http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/cadata'
    # Close the HTTP response once the parser has consumed it.
    with urllib.request.urlopen(target_page) as response:
        cadata = load_svmlight_file(response)
    with open("../datasets/cadata.pickle", "wb") as pickle_file:
        pickle.dump(cadata, pickle_file)

# Forest covertype: fetched through scikit-learn, shuffled with a fixed seed.
download_covertype = False
if download_covertype:
    covertype_dataset = fetch_covtype(random_state=101, shuffle=True)
    with open("../datasets/covertype_dataset.pickle", "wb") as pickle_file:
        pickle.dump(covertype_dataset, pickle_file)

# 20 newsgroups: headers, footers and quotes are stripped from each post.
download_newsgroups = False
if download_newsgroups:
    newsgroups_dataset = fetch_20newsgroups(shuffle=True, remove=('headers', 'footers', 'quotes'), random_state=6)
    with open("../datasets/newsgroups_dataset.pickle", "wb") as pickle_file:
        pickle.dump(newsgroups_dataset, pickle_file)

# Boston housing: only the feature columns are exported to CSV (no target).
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 - confirm the pinned scikit-learn version before re-running this cell.
download_boston = True
if download_boston:
    boston = load_boston()
    df = pd.DataFrame(data=boston['data'], columns=boston['feature_names'])
    df.to_csv('../datasets/boston.txt', sep=',', index=False)


## Load boston dataset

- In this example, 80 percent of the dataset is used for training and the remaining 20 percent for testing.

In [146]:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

# Load the Boston housing data bundled with scikit-learn.
boston = load_boston()

# Hold out 20% of the rows for testing; random_state=0 keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    boston.data,
    boston.target,
    test_size=0.2,
    random_state=0,
)

# Sanity check: shapes of the full feature matrix and target vector.
print(boston.data.shape, boston.target.shape)

(506, 13) (506,)


- We fit the regressor on the training set and predict the target variable on the test set. We then measure the quality of the regression with the mean absolute error (MAE); lower is better.

## LinearRegression

In [147]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# fit() returns the estimator itself, so construction and fitting can be chained.
regr = LinearRegression().fit(X_train, Y_train)

# Predict the held-out rows and report the mean absolute error against the true targets.
Y_pred = regr.predict(X_test)
print("MAE", mean_absolute_error(Y_test, Y_pred))

MAE 3.842810589450487


In [148]:
import timeit

# Time repeated re-fits of the linear regressor on the training split (IPython line magic).
%timeit regr.fit(X_train, Y_train)

866 µs ± 116 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [149]:
import numpy as np

#accuracy =precision
# Threshold = mean of all targets minus 5 (the offset of 5 is a hand-picked constant).
avg_price_house = np.mean(boston.target) - 5
# Boolean mask over the training targets: True where the target reaches the threshold.
high_priced_idx = Y_train >= avg_price_house


In [150]:
# Display the first 60 entries of the mask.
high_priced_idx[:60]

array([ True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True, False,  True, False,  True, False,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True, False, False, False, False, False,  True,  True, False,
        True, False, False,  True,  True,  True,  True, False,  True,
        True, False, False,  True,  True, False,  True,  True,  True,
        True, False, False, False, False,  True])

#### Records that satisfy the condition are marked as 1

In [151]:
# In-place overwrite of Y_train: positions where the mask is False keep their
# original value at this point.
Y_train[high_priced_idx] = 1 # Records that satisfy the condition are marked with 1
# Display the first 60 entries after the overwrite.
Y_train[:60]

array([ 1. ,  1. ,  1. ,  1. , 10.4,  1. ,  1. ,  1. ,  1. , 17.2,  1. ,
        1. , 16.5,  1. ,  8.4,  1. ,  9.7,  1. ,  1. , 12.3,  1. ,  1. ,
        1. ,  1. ,  1. ,  1. ,  1. ,  1. , 13.1,  7.5, 13.6, 17.4,  8.4,
        1. ,  1. , 13.4,  1. ,  7.2, 13.1,  1. ,  1. ,  1. ,  1. , 16.6,
        1. ,  1. , 11. ,  7.2,  1. ,  1. , 14.4,  1. ,  1. ,  1. ,  1. ,
       15.2, 17.4, 13.6,  8.7,  1. ])

#### Values that are not True are assigned 0

In [152]:
# Every position outside the high-priced mask becomes 0 (in-place on Y_train).
Y_train[~high_priced_idx] = 0

In [153]:
# Cast the labels to 8-bit integers; the converted array is rebound to Y_train.
Y_train = Y_train.astype(np.int8)
# Display the first 60 labels after the cast.
Y_train[:60]

array([1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1], dtype=int8)

#### The same operation is applied to Y_test

In [154]:
# Binarise the test targets with the same threshold: True -> 1, False -> 0.
# Note that high_priced_idx is rebound here and now refers to the test mask.
high_priced_idx = Y_test >= avg_price_house
Y_test = high_priced_idx.astype(np.int8)


## LogisticRegression

In [155]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Fit a logistic-regression classifier with default settings on the binarised labels.
clf = LogisticRegression().fit(X_train, Y_train)

# Classify the held-out rows and print per-class precision, recall and F1.
Y_pred = clf.predict(X_test)
print(classification_report(Y_test, Y_pred))


             precision    recall  f1-score   support

          0       0.79      0.66      0.72        29
          1       0.87      0.93      0.90        73

avg / total       0.85      0.85      0.85       102



- The average precision and recall are both about 85 percent, although class 0 is weaker (0.79 precision, 0.66 recall) than class 1. This is already a good result for such a simple method.

In [156]:
# Time repeated re-fits of the logistic-regression classifier on the training split (IPython line magic).
%timeit clf.fit(X_train, Y_train)

4.42 ms ± 100 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
