In [41]:
# 1. 读取数据
# 2. 扔掉 nan
# 3. wage 转为 0 / 1
# 4. 分割出 x y
# 5. object 转为 数值型
# 6. one-hot
# 7. 标准化
# 8. perceptron
import numpy as np
import pandas as pd

class Dataset(object):
	def __init__(self, train_x, train_y, test_x, test_y):
		self.train_x = train_x
		self.train_y = train_y
		self.test_x = test_x
		self.test_y = test_y
		self.pred_y = None

	def print_info(self):
		print(self.train_x)
		print(self.train_y)
		print(self.test_x)
		print(self.test_y)
		print(self.pred_y)


class FeatureEngineer(object):
	def __init__(self):
		# https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names
		names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'wage']
		self.names = names

		# read_csv 的参数介绍
		# https://www.cnblogs.com/datablog/p/6127000.html
		# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
		# 
		# header=None, names=names, 数据集没有表头, 我们手动加上标题信息 names
		# na_values 训练集以及测试集中的缺失值用 ? 表示
		train_set = pd.read_csv('adult.data', header=None, names=names, sep=', ', na_values=["?"], engine='python')

		# test 数据集第一行是多余的, 所以skiprows=1
		test_set = pd.read_csv('adult.test', header=None, names=names, sep=', ', na_values=["?"], engine='python', skiprows=1).dropna()

		# 如果某一行数据有缺失值, 那就删除这一行
		train_set = train_set.dropna()
		test_set = test_set.dropna()

		# 将训练集的 "<=50K" 转为标签 0, ">50K" 转为标签 1
		train_set['wage'] = train_set['wage'].replace({'<=50K': 0, '>50K': 1})
		# 测试集中的 "<=50K." 转为标签 0, ">50K." 转为标签 1
		test_set['wage'] = test_set['wage'].replace({'<=50K.': 0, '>50K.': 1})

		self.train_set = train_set
		self.test_set = test_set

	def filter1(self):
		'''
		filter1 仅保留数值型特征
		TODO:
		int to float to rm warnings given by StandardScaler()
		'''

		train_set = self.train_set
		test_set = self.test_set

		train_y = train_set['wage']
		train_set.pop('wage')
		train_set.pop('workclass')
		train_set.pop('education')
		train_set.pop('marital-status')
		train_set.pop('occupation')
		train_set.pop('relationship')
		train_set.pop('race')
		train_set.pop('sex')
		train_set.pop('native-country')
		train_x = train_set

		test_y = test_set['wage']
		test_set.pop('wage')
		test_set.pop('workclass')
		test_set.pop('education')
		test_set.pop('marital-status')
		test_set.pop('occupation')
		test_set.pop('relationship')
		test_set.pop('race')
		test_set.pop('sex')
		test_set.pop('native-country')
		test_x = test_set

		self.train_x = train_x
		self.train_y = train_y
		self.test_x = test_x
		self.test_y = test_y

	def filter2(self):
		'''
		TODO:
		1. keep more features
		2. merge train and test, then use one-hot, finally split back into train and test
		'''
		pass

	def standardize(self):
		from sklearn.preprocessing import StandardScaler
		ss = StandardScaler()
		self.train_x = ss.fit_transform(self.train_x)
		self.test_x = ss.fit_transform(self.test_x)

	def get_dataset(self):
		self.dataset = Dataset(self.train_x, self.train_y, self.test_x, self.test_y)
		return self.dataset

def test1():
	fe = FeatureEngineer()
	fe.filter1()
	fe.standardize()
	dataset = fe.get_dataset()

	from sklearn.linear_model import Perceptron
	perceptron = Perceptron(max_iter = 1, eta0 = 0.01, tol = 1e-3, random_state = 0)
	perceptron.fit(dataset.train_x, dataset.train_y)
	pred_y = perceptron.predict(dataset.test_x).tolist()
	dataset.pred_y = pred_y

	from sklearn.metrics import accuracy_score
	print("准确率：", accuracy_score(dataset.test_y, dataset.pred_y))

test1()


'''
结果分析:
虽然丢掉了很多数据，但是准确率和原来没有丢掉数据相比，并没有太大变化(甚至有提升)
通过阅读 adult.names 文档可以知道, 训练集的标签分布是【不均匀的】, 0 的比例是 76.07%, 1 的比例是 23.93%
测试集的标签也有类似的分布, 0 的比例是 75.22%, 1 的比例是 24.78%
与本次特征工程+模型给出的 77.12% 的准确率很接近
也就是说, 如果一个模型 model 在任何时候都输出 0, 那这个 model 在测试集上的准确率也会有 75% 左右的准确率

本次实验, 首要目的是熟悉过程, 接下来是进一步的改进
TODO1: filter1
TODO2: filter2
'''

准确率： 0.7712483399734396


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
