# 离散值处理

### 字段说明
Rank:排名
Name:游戏名
Platform:平台
Year:发售年份
Genre:游戏类型
Publisher:发行商
NA_Sales:北美地区销量
EU_Sales:欧洲地区销量
JP_Sales:日本地区销量
Other_Sales:其他地区销量
Global_Sales:全球销量

In [1]:
import pandas as pd
import numpy as np

In [5]:
# Load the video-game sales table, decoding the CSV as ISO-8859-1.
vg_df = pd.read_csv(
    "datasets/vgsales.csv",
    encoding="ISO-8859-1",
)
# Cell result: (number of rows, number of columns).
vg_df.shape

Unnamed: 0,Name,Platform,Year,Genre,Publisher
0,Wii Sports,Wii,2006.0,Sports,Nintendo
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo
5,Tetris,GB,1989.0,Puzzle,Nintendo
6,New Super Mario Bros.,DS,2006.0,Platform,Nintendo


In [6]:
# NOTE(review): head() is not the last expression of this cell, so its result
# is presumably not displayed -- confirm whether the call is still wanted.
vg_df.head() 
vg_df[['Name','Platform','Year','Genre','Publisher']].iloc[0:7] # view the selected columns for the first 7 rows

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [7]:
# Distinct game genres (unique values of the Genre column).
# np.unique returns the unique values in sorted order. The pd.unique
# alternative below yields the same set of values but keeps them in order of
# first appearance, so the two results are not guaranteed to be ordered alike.
genre = np.unique(vg_df['Genre']);genre
# genreA = pd.unique(vg_df['Genre']);genreA

array(['Action', 'Adventure', 'Fighting', 'Misc', 'Platform', 'Puzzle',
       'Racing', 'Role-Playing', 'Shooter', 'Simulation', 'Sports',
       'Strategy'], dtype=object)

## LabelEncoder标签编码

#### 将离散属性用数字表示，即把离散取值转换为从 0 开始的连续整数编号；也就是对不连续的数字或者文本进行编号。

### Label encoding在某些情况下很有用，但是场景限制很多。再举一例：比如有[dog,cat,dog,mouse,cat]，我们把其转换为[1,2,1,3,2]。这里就产生了一个奇怪的现象：dog和mouse的平均值是cat。也就是说，编号给原本无序的类别强加了大小和距离关系，可能误导模型。因此标签编码的适用场景有限，并没有被广泛使用。

In [26]:
from sklearn.preprocessing import LabelEncoder

In [11]:
# Encoder used below to turn each genre string into an integer code.
gle = LabelEncoder()
gle

LabelEncoder()

In [12]:
# Integer code assigned to the genre of every row
genre_labels = gle.fit_transform(vg_df['Genre']) # same as calling fit and then transform
genre_labels

array([10,  4,  6, ...,  6,  5,  4], dtype=int64)

In [13]:
# All distinct genre labels learned by the encoder (equivalent to
# np.unique(vg_df['Genre'])); the label at position i is the one encoded as i.
# The previous form `gle.classes_ (list(gle.classes_))` tried to call the
# array as a function and raised TypeError. Use list(gle.classes_) instead if
# a plain Python list is wanted.
gle.classes_

array(['Action', 'Adventure', 'Fighting', 'Misc', 'Platform', 'Puzzle',
       'Racing', 'Role-Playing', 'Shooter', 'Simulation', 'Sports',
       'Strategy'], dtype=object)

In [18]:
# Lookup table from each integer code to the genre label it stands for.
genre_mappings = dict(enumerate(gle.classes_))
genre_mappings

{0: 'Action',
 1: 'Adventure',
 2: 'Fighting',
 3: 'Misc',
 4: 'Platform',
 5: 'Puzzle',
 6: 'Racing',
 7: 'Role-Playing',
 8: 'Shooter',
 9: 'Simulation',
 10: 'Sports',
 11: 'Strategy'}

In [27]:
# Store the encoded genre next to the original column ...
vg_df['GenreLabel'] = genre_labels
# ... and show rows at positions 1-6 so Genre and GenreLabel can be compared.
vg_df[['Name','Platform','Year','Genre','GenreLabel']].iloc[1:7]

Unnamed: 0,Name,Platform,Year,Genre,GenreLabel
1,Super Mario Bros.,NES,1985.0,Platform,4
2,Mario Kart Wii,Wii,2008.0,Racing,6
3,Wii Sports Resort,Wii,2009.0,Sports,10
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,7
5,Tetris,GB,1989.0,Puzzle,5
6,New Super Mario Bros.,DS,2006.0,Platform,4


## Map

In [33]:
# Load the Pokemon table from CSV, decoded as UTF-8.
poke_df = pd.read_csv('datasets/Pokemon.csv', encoding='utf-8')
# Shuffle all rows (frac=1) with a fixed seed so the order is reproducible,
# then renumber the index from 0, discarding the old one.
poke_df = poke_df.sample(random_state=1, frac=1).reset_index(drop=True)

# Distinct values present in the Generation column.
np.unique(poke_df['Generation'])

(800, 13)