# 数据获取

In [None]:
from sklearn.datasets import load_digits

# 数据预处理

## 划分训练集/测试集

In [None]:
from sklearn.model_selection import train_test_split

# Split the feature matrix and labels into a training part and a test part.
# stratify must be the full label array that is being split (train_target),
# not y_train, which does not exist until this call returns.
X_train, X_test, y_train, y_test = train_test_split(
    train_data, train_target, test_size=0.4, random_state=0, stratify=train_target
)
# train_data: the feature set to split (without the label column).
# train_target: the labels that belong to train_data.
# test_size: a float is the fraction of samples placed in the test set;
#     an int is the absolute number of test samples.
# random_state: seed for the shuffling that happens before the split.
#     Any fixed integer (0 included) reproduces exactly the same split on every
#     run, as long as the other arguments are unchanged. Only leaving it unset
#     (None) gives a different split each time.

# stratify keeps the class distribution of the data intact across the split.
#     Example: 100 samples, 80 of class A and 20 of class B (ratio 4:1).
#     With train_test_split(..., test_size=0.25, stratify=y_all) the result is:
#     training: 75 samples, 60 of class A and 15 of class B (ratio 4:1)
#     testing:  25 samples, 20 of class A and 5 of class B (ratio 4:1)
#     Both parts keep the original A:B = 4:1 ratio (80:20). This is typically
#     used when the classes are imbalanced.
# stratify=X distributes samples according to the class proportions found in X.
# stratify=y distributes samples according to the class proportions found in y.

## 归一化

In [None]:
from sklearn.preprocessing import StandardScaler

# Z-score standardisation: each feature is shifted to mean 0 and scaled to
# standard deviation 1. The statistics are learned from the training features
# only and then reused unchanged for the test features.
ss = StandardScaler()
ss.fit(train_x)
train_ss_x = ss.transform(train_x)
test_ss_x = ss.transform(test_x)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scaling of every feature into the range [0, 1].
mms = MinMaxScaler(feature_range=(0, 1))
# Learn each feature's min/max from the training split only ...
X_train = mms.fit_transform(X_train)
# ... and reuse those same statistics for the validation and test data, so all
# three sets share one scale and no information from the evaluation data leaks
# into preprocessing. Values outside the training range may therefore land
# slightly outside [0, 1].
X_valid = mms.transform(X_valid)
test = mms.transform(test)

## 对于分类特征进行特征值编码

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the categorical feature columns as integer codes.
# attr lists the columns of `train` / `test` that are treated as categorical.
attr = [
    "Age",
    "BusinessTravel",
    "Department",
    "Education",
    "EducationField",
    "Gender",
    "JobRole",
    "MaritalStatus",
    "Over18",
    "OverTime",
]
# One fitted encoder per column, stored in the same order as attr so the
# mapping can be looked up again later.
lbe_list = []
for feature in attr:
    # A fresh encoder per column: the mapping is learned from the training
    # column and then applied unchanged to the test column.
    lbe = LabelEncoder()
    train[feature] = lbe.fit_transform(train[feature])
    # NOTE(review): transform is expected to reject values that were not seen
    # during fit — confirm `test` has no categories missing from `train`.
    test[feature] = lbe.transform(test[feature])
    lbe_list.append(lbe)

# 选择模型 & 训练 & 预测

## LR

In [None]:
# Import from the public package path; the private
# sklearn.linear_model.logistic module is not part of the supported API.
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier with a fixed seed for reproducible fits.
model = LogisticRegression(max_iter=100, verbose=True, random_state=33, tol=1e-4)

# Train the model on the training split.
model.fit(X_train, y_train)

# predict_proba returns one probability column per class for every row
# (sample); keep only column 1.
# Assumes a binary 0/1 target, so column 1 is the probability of label 1.
predict = model.predict_proba(test)[:, 1]

## svm

In [None]:
from sklearn.svm import SVC, LinearSVC

# Two alternative SVM models are configured below. Both are bound to `model`,
# so when the cell runs top to bottom the LinearSVC replaces the SVC; delete
# or comment out the variant that is not wanted.

# Non-linear variant: SVC with an RBF kernel.
# NOTE(review): cache_size is presumably in MB, which would make 50000 roughly
# 50 GB of kernel cache — confirm this is intended.
model = SVC(
    kernel="rbf",
    gamma="auto",
    max_iter=1000,
    random_state=33,
    verbose=True,
    tol=1e-5,
    cache_size=50000,
)


# Linear variant.
model = LinearSVC(max_iter=1000, random_state=33, verbose=True,)


# Train on the training split and predict hard class labels for `test`.
model.fit(X_train, y_train)
predict = model.predict(test)

# Probability output instead of hard labels.
# NOTE(review): LinearSVC presumably has no predict_proba, and SVC needs
# probability=True for it — confirm before enabling the line below.
# predict = model.predict_proba(test)[:, 1]

# Store the predictions in the submission frame.
test_pd["Attrition"] = predict

# Convert to a binary 0/1 output (only needed when `predict` holds scores).
# test_pd['Attrition']=test_pd['Attrition'].map(lambda x:1 if x>=0.5 else 0)

# Write the single-column submission file.
test_pd[["Attrition"]].to_csv("submit_svc.csv")

## GBDT

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# A regressor is used instead of a classifier: it outputs a continuous score
# per row, which (per the original author's note) gives a better AUC.
model = GradientBoostingRegressor(random_state=10)
model.fit(X_train, y_train)
predict = model.predict(test)
# print(predict)

# Optional: convert the scores to a binary 0/1 output before writing.
# predict = (predict >= 0.5).astype(int)

# Build the submission from a copy of `test` with the prediction column added.
# `test` itself is left unmodified so it still holds only the feature columns
# and can be passed to predict() again (re-running this cell, or another model).
# Assumes `test` is a pandas DataFrame, as the column indexing requires.
submission = test.assign(Attrition=predict)[["Attrition"]]
submission.to_csv("submit_gbdt.csv")

## DecisionTreeClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Author's note: decision trees are better suited to "heterogeneous" data.
# Build the tree with default settings and fit it on the standardised
# training features in one step (fit returns the estimator itself).
tree_model = DecisionTreeClassifier().fit(train_ss_x, train_y)
print(tree_model)

# Predicted class labels for the standardised test features.
predict_y = tree_model.predict(test_ss_x)


# 评估

In [None]:
from sklearn.metrics import accuracy_score

# Report the decision tree's accuracy on the test set with 4 decimals.
# Arguments follow the documented (y_true, y_pred) order, so the line stays
# correct if it is adapted to a metric where the order matters.
print("DecisionTree 准确率: %0.4lf" % accuracy_score(test_y, predict_y))