In [None]:
# 2020-10-21 created by Akson
# 2021-01-10 edit

In [None]:
# Code1-1
# 本书第一段代码，预测塞浦路斯人有多幸福

# 引入基本的依赖包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 引入所需要的模型包
from sklearn.linear_model import LinearRegression  # 线性回归
from sklearn.neighbors import KNeighborsRegressor  # KNN

In [None]:
# Code1-2
# Load the data

# The data files used here are stored on Google Drive.
oecd_bli = pd.read_csv(
    '/content/drive/MyDrive/datasets/lifesat/oecd_bli_2015.csv',
    thousands = ',',
)
# This file is parsed as tab-separated, latin1-encoded text; 'n/a' cells become NaN.
gdp_per_capita = pd.read_csv(
    '/content/drive/MyDrive/datasets/lifesat/gdp_per_capita.csv',
    sep = '\t',
    thousands = ',',
    encoding = 'latin1',
    na_values = 'n/a',
)

In [None]:
# Code1-3
# 数据准备工作

# 从两组数据中分别提取出GDP，幸福指数与其对应的国家信息
def prepare_country_states(oecd_bli, gdp_per_capita):
    """Combine the life-satisfaction and GDP tables into one frame per country.

    Neither input frame is modified, so the function can be called repeatedly
    (for example when the calling notebook cell is re-run) with the same result.

    Parameters
    ----------
    oecd_bli : pandas.DataFrame
        Must provide the columns 'INEQUALITY', 'Country', 'Indicator' and
        'Value'. Only rows whose 'INEQUALITY' equals 'TOT' are used.
    gdp_per_capita : pandas.DataFrame
        Must provide a '2015' column holding the GDP figures. The country
        names may be either a 'Country' column or already be the index.

    Returns
    -------
    pandas.DataFrame
        Columns 'GDP per capita' and 'Life satisfaction', indexed by country,
        sorted by ascending GDP per capita, with the rows at the positions in
        ``remove_indices`` left out.
    """
    # Keep the 'TOT' rows only, then reshape to one row per country and one
    # column per indicator.
    bli_total = oecd_bli[oecd_bli['INEQUALITY'] == 'TOT']
    bli_by_country = bli_total.pivot(index = 'Country', columns = 'Indicator', values = 'Value')

    # rename()/set_index() without inplace return new frames, so the caller's
    # gdp_per_capita keeps its '2015' and 'Country' columns.
    gdp_by_country = gdp_per_capita.rename(columns = {'2015': 'GDP per capita'})
    # The loaded data is sometimes already indexed by country; only move the
    # column into the index when it is actually present.
    if 'Country' in gdp_by_country.columns:
        gdp_by_country = gdp_by_country.set_index('Country')

    full_country_stats = pd.merge(left = bli_by_country, right = gdp_by_country,
                                  left_index = True, right_index = True)
    full_country_stats = full_country_stats.sort_values(by = 'GDP per capita')

    # Row positions (in GDP order) that are excluded from the result.
    # NOTE(review): presumably these are countries deliberately held out of
    # the training data -- confirm against the data source.
    remove_indices = {0, 1, 6, 8, 33, 34, 35}
    # Only the first 36 positions are ever considered (as before); clamping to
    # the actual row count avoids an out-of-bounds .iloc on smaller inputs.
    row_limit = min(36, len(full_country_stats))
    keep_indices = [pos for pos in range(row_limit) if pos not in remove_indices]

    return full_country_stats[['GDP per capita', 'Life satisfaction']].iloc[keep_indices]


In [None]:
# Code1-4
# Data preparation

country_stats = prepare_country_states(oecd_bli, gdp_per_capita)

# Put the data into the layout the models are trained on: n rows by m columns
# (m is 1 here). The book writes this as np.c_[country_stats[...]]; selecting
# a one-column frame and converting it produces the same (n, 1) arrays.
X_train = country_stats[['GDP per capita']].to_numpy(copy = True)
y_train = country_stats[['Life satisfaction']].to_numpy(copy = True)

# The matching country names, also as a NumPy array
country_name = country_stats.index.to_numpy(copy = True)

In [None]:
# Code1-5
# Visualize the prepared data

# Scatter plot of life satisfaction against GDP per capita
country_stats.plot.scatter(x = 'GDP per capita', y = 'Life satisfaction')
plt.show()

In [None]:
# Code1-6
# Pick a model and train it -- or rather, try both models.

# fit() returns the estimator itself, so creation and training are chained.
model_LR = LinearRegression().fit(X_train, y_train)
model_KNN = KNeighborsRegressor().fit(X_train, y_train)

# GDP per capita of Cyprus, the country to predict for
X_test = [[22587]]

print('Life satisfaction with LR: %s' % (model_LR.predict(X_test),))
print('Life satisfaction with KNN: %s' % (model_KNN.predict(X_test),))

# Author's note: LR gives the higher predicted life satisfaction here, so LR will be used more from now on.
# NOTE(review): a higher predicted value is not by itself evidence that a model is more accurate.