# Otto商品分类——朴素贝叶斯（原始特征）

我们以Kaggle 2015年举办的Otto Group Product Classification Challenge竞赛数据为例，
分别训练默认参数的BernoulliNB和MultinomialNB模型。

Otto数据集是著名电商Otto提供的一个多类商品分类问题，类别数为9。每个样本有93维数值型特征（整数，表示某种事件发生的次数，已经进行过脱敏处理）。竞赛官网：https://www.kaggle.com/c/otto-group-product-classification-challenge/data


In [1]:
# 首先 import 必要的模块
import pandas as pd 
import numpy as np


from matplotlib import pyplot as plt

## 读取数据 

In [8]:
# Load the training data.
# Directory that holds the data files.
dpath = './data/'

# Background: the raw Otto features are integer counts and can be treated as
# discrete, which is why BernoulliNB and MultinomialNB are applicable.
# The raw features are clearly not Gaussian, so GaussianNB is not used on them;
# GaussianNB is considered later for the log-transformed and TF-IDF features.
#
# NOTE(review): the comments above talk about the raw features, but the file
# actually loaded below is the TF-IDF one. To train on the raw features
# instead, load "Otto_FE_train_org.csv" from the same directory.
train_path = dpath + "Otto_FE_train_tfidf.csv"
train = pd.read_csv(train_path)

# Show the first rows as a quick sanity check of the loaded frame.
train.head()

Unnamed: 0,id,feat_1_tfidf,feat_2_tfidf,feat_3_tfidf,feat_4_tfidf,feat_5_tfidf,feat_6_tfidf,feat_7_tfidf,feat_8_tfidf,feat_9_tfidf,...,feat_85_tfidf,feat_86_tfidf,feat_87_tfidf,feat_88_tfidf,feat_89_tfidf,feat_90_tfidf,feat_91_tfidf,feat_92_tfidf,feat_93_tfidf,target
0,1,0.081393,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.075886,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Class_1
1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.231403,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Class_1
2,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.19973,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Class_1
3,4,0.011987,0.0,0.0,0.011668,0.105971,0.021681,0.080435,0.0,0.0,...,0.0,0.008244,0.022456,0.0,0.0,0.0,0.0,0.0,0.0,Class_1
4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.124622,0.0,0.0,0.0,0.0,0.145988,0.0,0.0,0.0,Class_1


In [9]:
#train.info()

## 准备数据

In [10]:
# Split the loaded frame into labels and features.
# The labels are kept as the original strings of the form Class_x; no numeric
# encoding is applied here.
y_train = train['target']

# Columns that are not features: the sample id and the label. If the CSV was
# saved together with its row index, pandas reads that index back as a column
# named "Unnamed: 0"; it must be dropped too, otherwise the row number would be
# fed to the classifiers as if it were a feature.
# NOTE(review): data scored with a model trained on this matrix presumably has
# to go through the same column selection — confirm in the test notebook.
non_feature_cols = ["id", "target"]
if "Unnamed: 0" in train.columns:
    non_feature_cols.append("Unnamed: 0")
X_train = train.drop(non_feature_cols, axis=1)

# Keep the feature names for later use (visualisation).
feat_names = X_train.columns

# Convert the feature matrix to a CSR sparse matrix before training
# (most values in the preview are zero).
from scipy.sparse import csr_matrix
X_train = csr_matrix(X_train)

## 模型训练

### 默认参数的 BernoulliNB

In [5]:
from sklearn.naive_bayes import BernoulliNB

# 1. Create the estimator with its default parameters.
BerNB = BernoulliNB()

# 2. Fit the model on the prepared training matrix and labels.
BerNB.fit(X_train, y_train)

# 3. Save the fitted model so it can be reloaded for testing later.
# cPickle only exists on Python 2; fall back to the standard pickle module on
# Python 3 while keeping the name `cPickle` available for other cells.
try:
    import cPickle
except ImportError:
    import pickle as cPickle

# The `with` block guarantees the file is flushed and closed after dumping.
with open("Otto_BernoulliNB.pkl", 'wb') as model_file:
    cPickle.dump(BerNB, model_file)

In [11]:
from sklearn.naive_bayes import MultinomialNB

# 1. Create the estimator with its default parameters.
MulNB = MultinomialNB()

# 2. Fit the model on the prepared training matrix and labels.
MulNB.fit(X_train, y_train)

# 3. Save the fitted model so it can be reloaded for testing later.
# cPickle only exists on Python 2; fall back to the standard pickle module on
# Python 3 while keeping the name `cPickle` available for other cells.
try:
    import cPickle
except ImportError:
    import pickle as cPickle

# The `with` block guarantees the file is flushed and closed after dumping.
with open("Otto_MultinomialNB_tfidf.pkl", 'wb') as model_file:
    cPickle.dump(MulNB, model_file)