In [2]:
# 1. 读取数据
# 2. 扔掉 nan
# 3. wage 转为 0 / 1
# 4. 分割出 x y
# 5. object 转为 数值型
# 6. one-hot
# 7. 标准化
# 8. perceptron
import numpy as np
import pandas as pd

# https://stackoverflow.com/questions/33572118/stop-jupyter-notebook-from-printing-warnings-status-updates-to-terminal?lq=1
import warnings;
warnings.simplefilter('ignore')

class Dataset(object):
	def __init__(self, train_x, train_y, test_x, test_y):
		self.train_x = train_x
		self.train_y = train_y
		self.test_x = test_x
		self.test_y = test_y
		self.pred_y = None

	def print_info(self):
		print(self.train_x)
		print(self.train_y)
		print(self.test_x)
		print(self.test_y)
		print(self.pred_y)

class FeatureEngineer(object):
	def __init__(self):
		# https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names
		names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'wage']
		self.names = names

		# read_csv 的参数介绍
		# https://www.cnblogs.com/datablog/p/6127000.html
		# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
		# 
		# header=None, names=names, 数据集没有表头, 我们手动加上标题信息 names
		# na_values 训练集以及测试集中的缺失值用 ? 表示
		train_set = pd.read_csv('adult.data', header=None, names=names, sep=', ', na_values=["?"], engine='python')

		# test 数据集第一行是多余的, 所以skiprows=1
		test_set = pd.read_csv('adult.test', header=None, names=names, sep=', ', na_values=["?"], engine='python', skiprows=1)

		# 如果某一行数据有缺失值, 那就删除这一行
		# 注意 dropna() 不会将下一行的行号提前
		# 比如 train_set 第 15 条记录存在 ？
		# dropna 之后第 16 条记录不会变成第 15 条记录
		# 而是把记录 15 标记为空白。。。
		# 所以不能直接用 for i in range(row) 的方式遍历 pd 数据集
		train_set = train_set.dropna()
		test_set = test_set.dropna()

		# 需要 reset_index(drop=True, inplace=True), 将无效的记录去掉
		# https://stackoverflow.com/questions/40755680/how-to-reset-index-pandas-dataframe-after-dropna-pandas-dataframe
		train_set.reset_index(drop=True, inplace=True)
		test_set.reset_index(drop=True, inplace=True)

		# 将训练集的 "<=50K" 转为标签 0, ">50K" 转为标签 1
		train_set['wage'] = train_set['wage'].replace({'<=50K': -1, '>50K': 1})
		# 测试集中的 "<=50K." 转为标签 0, ">50K." 转为标签 1
		test_set['wage'] = test_set['wage'].replace({'<=50K.': -1, '>50K.': 1})

		self.train_set = train_set
		self.test_set = test_set

	def filter1(self):
		'''
		filter1 仅保留数值型特征
		TODO:
		int to float to rm warnings given by StandardScaler()
		'''

		train_set = self.train_set
		test_set = self.test_set

		train_y = train_set['wage']
		train_set.pop('wage')
		train_set.pop('workclass')
		train_set.pop('education')
		train_set.pop('marital-status')
		train_set.pop('occupation')
		train_set.pop('relationship')
		train_set.pop('race')
		train_set.pop('sex')
		train_set.pop('native-country')
		train_x = train_set

		test_y = test_set['wage']
		test_set.pop('wage')
		test_set.pop('workclass')
		test_set.pop('education')
		test_set.pop('marital-status')
		test_set.pop('occupation')
		test_set.pop('relationship')
		test_set.pop('race')
		test_set.pop('sex')
		test_set.pop('native-country')
		test_x = test_set

		self.train_x = train_x
		self.train_y = train_y
		self.test_x = test_x
		self.test_y = test_y
		# print(train_x.head(20))
		# print(train_y.head(20))

	def filter2(self):
		'''
		TODO:
		1. keep more features
		2. merge train and test, then use one-hot, finally split back into train and test
		'''
		pass

	def standardize(self):
		from sklearn.preprocessing import StandardScaler
		ss = StandardScaler()
		self.train_x = ss.fit_transform(self.train_x)
		self.test_x = ss.fit_transform(self.test_x)

	def get_dataset(self):
		self.dataset = Dataset(self.train_x, self.train_y, self.test_x, self.test_y)
		return self.dataset

def test1():
	'''
	perceptron with sklearn
	'''
	fe = FeatureEngineer()
	fe.filter1()
	fe.standardize()
	dataset = fe.get_dataset()

	from sklearn.linear_model import Perceptron
	perceptron = Perceptron(max_iter = 1, eta0 = 0.01, tol = 1e-3, random_state = 0)
	perceptron.fit(dataset.train_x, dataset.train_y)
	pred_y = perceptron.predict(dataset.test_x).tolist()
	dataset.pred_y = pred_y

	from sklearn.metrics import accuracy_score
	print("准确率: ", accuracy_score(dataset.test_y, dataset.pred_y))

class Perceptron(object):
	'''
	TODO[done]
	实现 Perceptron
	'''
	def __init__(self, dataset, learning_rate=0.001):
		self.dataset = dataset
		self.row = dataset.train_x.shape[0]
		self.col = dataset.train_x.shape[1]
		self.w = np.zeros(self.col)
		self.b = np.zeros(1)
		self.learning_rate = learning_rate

	def update_w(self, y, x):
		eta = self.learning_rate
		# print('w: ', self.w)
		for i in range(self.col):
			self.w[i] += eta * x[i] * y
		# print('w: ', self.w)

	def update_b(self, y):
		eta = self.learning_rate
		# print('b: ', self.b)
		self.b += eta * y
		# print('b: ', self.b)

	def sign(self, x):
		res = 0
		b = self.b
		w = self.w
		for i in range(self.col):
			res += w[i] * x[i]
		res += b
		if res >= 0:
			return 1
		else:
			return -1

	def judge(self, y, b, x=[], w=[]):
		res = 0
		for i in range(self.col):
			res += w[i] * x[i]
		res = (res + b) * y
		return res

	def fit(self):
		# https://docs.python.org/3.5/library/random.html
		import random
		random.seed(0)
		epoch = 100
		row = self.row
		for e in range(epoch):
			i = random.randint(0, row)
			b = self.b
			w = self.w
			x = self.dataset.train_x[i]
			y = self.dataset.train_y[i]
			res = self.judge(y, b, x, w)
			# print('res: ', res)
			if res <= 0:
				self.update_w(y, x)
				self.update_b(y)

	def predict(self):
		test_x = self.dataset.test_x
		row = test_x.shape[0]
		pred_y = np.zeros(row)
		for i in range(row):
			pred_y[i] = self.sign(test_x[i])
			# pred_y[i] = -1 # cheat with accuracy of 75.22%
		self.dataset.pred_y = pred_y

	def cheat_predict(self):
		'''
		You will get a cheat-accuracy of 75.22%
		'''
		test_x = self.dataset.test_x
		row = test_x.shape[0]
		pred_y = np.zeros(row)
		for i in range(row):
			pred_y[i] = -1
		self.dataset.pred_y = pred_y

	def get_accuracy(self):
		from sklearn.metrics import accuracy_score
		print("准确率: ", accuracy_score(self.dataset.test_y, self.dataset.pred_y))

def test2():
	'''
	perceptron from scratch
	'''
	fe = FeatureEngineer()
	fe.filter1()
	fe.standardize()
	dataset = fe.get_dataset()

	perceptron = Perceptron(dataset, 1e-3)
	perceptron.fit()
	perceptron.predict()
	perceptron.get_accuracy()

if __name__ == '__main__':
	# test1()
	test2()


def annotation_and_TODO():
	'''
	结果分析:
	虽然丢掉了很多数据，但是准确率和原来没有丢掉数据相比，并没有太大变化(甚至有提升)
	通过阅读 adult.names 文档可以知道, 训练集的标签分布是【不均匀的】, 0 的比例是 76.07%, 1 的比例是 23.93%
	测试集的标签也有类似的分布, 0 的比例是 75.22%, 1 的比例是 24.78%
	与本次特征工程+模型给出的 77.12% 的准确率很接近
	也就是说, 如果一个模型 model 在任何时候都输出 0, 那这个 model 在测试集上的准确率也会有 75% 左右的准确率, 如 Perceptron.cheat_predict
	另外有一个奇怪的现象: 当训练迭代次数增加, 准确率反而会下降

	本次实验, 首要目的是熟悉过程, 接下来是进一步的改进
	
	TODO1: 实现 Perceptron
	    [done], 在 random.seed(0) 的情况下, 使用 100 次 SGD, 可以达到 77% 左右的准确率
	TODO2: filter1
	TODO3: filter2
	'''
	pass


准确率:  0.7739043824701195
