In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os 
import sys
import time
import tensorflow as tf
from tensorflow import keras

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()
print(housing.DESCR)
print(housing.data.shape)
print(housing.target.shape)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [3]:
#分割数据集为：0.7训练集，0.1验证集，0.2测试集

from sklearn.model_selection import train_test_split

x_train_all,x_test,y_train_all,y_test = train_test_split(
        housing.data,
        housing.target,
        random_state = 7,
        test_size=0.2)

x_train,x_valid,y_train,y_valid = train_test_split(
        x_train_all,
        y_train_all,
        random_state=11,
        test_size=0.125)

print(x_train.shape,y_train.shape)
print(x_valid.shape,y_valid.shape)
print(x_test.shape,y_test.shape)

(14448, 8) (14448,)
(2064, 8) (2064,)
(4128, 8) (4128,)


In [4]:
#归一化操作  x = (s-u)/std   减去均值，除以方差

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()


x_train_scaler = scaler.fit_transform(x_train)
x_valid_scaler = scaler.transform(x_valid)
x_test_scaler = scaler.transform(x_test)

In [5]:
output_dir = "generate_csv"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

def save_to_csv(output_dir, data, name_prefix, header=None, n_parts=10):
    path_format = os.path.join(output_dir, "{}_{:02d}.csv")
    filenames = []
    for file_idx, row_indices in enumerate(np.array_split(np.arange(len(data)),n_parts)):
        part_csv = path_format.format(name_prefix,file_idx)
        filenames.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header + "\n")
            for row_index in row_indices:
                f.write(".".join([repr(col) for col in data[row_index]]))
                f.write('\n')
                
    return filenames



# np.c_将x,y按行merge到一起
train_data = np.c_[x_train_scaler,y_train]
valid_data = np.c_[x_valid_scaler,y_valid]
test_data = np.c_[x_test_scaler,y_test]
header_cols = housing.feature_names + ["MidianHouseValue"]
header_str = ",".join(header_cols)


train_filenames = save_to_csv(output_dir, train_data, "train", header_str, n_parts=20)
valid_filenames = save_to_csv(output_dir, valid_data, "valid", header_str, n_parts=10)
test_filenames = save_to_csv(output_dir, test_data, "test", header_str, n_parts=10)

In [6]:
# 将csv文件做成dataset
# 1.filename ——> dataset
# 2.read file ——>datasets——>merge
# 3.parse csv

#第一步
filename_dataset = tf.data.Dataset.list_files(train_filenames)
for filename in filename_dataset:
    print(filename)

tf.Tensor(b'generate_csv\\train_07.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_11.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_06.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_16.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_18.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_15.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_02.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_12.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_17.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_05.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_10.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_13.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_04.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_01.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_14.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\

In [8]:
# 第二步

n_readers = 5
dataset = filename_dataset.interleave(
    lambda filename: tf.data.TextLineDataset(filename).skip(1), # 按行读取文件，忽略第一行的header
    cycle_length = 5
)

for line in dataset.take(15):
    print(line.numpy())
    

b'-0.5687498121582446.-0.6839500714571032.-0.5744204742578485.0.2012054449732681.1.2049227044431197.-0.05090126825905318.-0.6696956958191057.0.5659680258034845.1.405'
b'0.9314946022148806.0.3463524390603567.0.0770311506543982.-0.19001211133248155.-0.16727471480020087.0.028420670665991438.-0.8102343545280389.0.7853385485962785.2.028'
b'0.3691379802417763.0.26709839978978284.-0.3008978077378238.-0.1962179848021484.-0.078327997253277.-0.12176531847855512.-0.8477113301837532.0.5958821880025021.3.812'
b'-0.4664841610183416.-0.763204110727677.-0.7674069999147513.-0.08599256244182751.1.0607081235660683.-0.08660197136701311.-0.6696956958191057.0.5559966384038143.2.021'
b'-1.202138755658554.-0.28767987510423404.-0.14411769350093734.0.061946351535992025.-0.504063256968165.-0.06862474035776597.1.8037846974580825.-0.7602264983529142.0.781'
b'1.3604567027317471.-1.0009662285393985.0.579532723789923.-0.2248450785493213.0.5503439481269227.0.02097144062348479.-0.7212265373457135.0.855138260393984.2.63

In [9]:
# 第三步
# tf.io.decde_csv(str,record_defaults)
# 将csv解析为tensor

sample_str = '1,2,3,4,5'
record_defaults = [tf.constant(0,dtype=tf.int32)] * 5
parsed_fields = tf.io.decode_csv(sample_str,record_defaults)
print(parsed_fields)

[<tf.Tensor: id=130, shape=(), dtype=int32, numpy=1>, <tf.Tensor: id=131, shape=(), dtype=int32, numpy=2>, <tf.Tensor: id=132, shape=(), dtype=int32, numpy=3>, <tf.Tensor: id=133, shape=(), dtype=int32, numpy=4>, <tf.Tensor: id=134, shape=(), dtype=int32, numpy=5>]
