### 样本类别分布不均衡问题
采用 Python 的 imbalanced-learn 包，通过过抽样（SMOTE）和欠抽样（RandomUnderSampler）处理不平衡数据；并采用 sklearn 的 SVM，通过调整类别权重处理样本不均衡问题。

In [9]:
import pandas as pd
from imblearn.over_sampling import SMOTE    # 过抽样处理库SMOTE
from imblearn.under_sampling import RandomUnderSampler    # 欠抽样处理库
from sklearn.svm import SVC    # 调整算法中的分类权重

In [7]:
# Load the space-separated dataset: five feature columns followed by the label column.
# Forward slashes keep the relative path portable across operating systems; a
# backslash path would need a raw string to avoid invalid escapes such as "\d".
df = pd.read_table('../dataset/data2.txt', sep=' ', names=['col1', 'col2', 'col3', 'col4', 'col5', 'label'])
# Split into the feature matrix (all columns but the last) and the label vector (last column).
x, y = df.iloc[:, :-1], df.iloc[:, -1]
groupby_data_original = df.groupby('label').count()    # per-class row counts, to check class balance
print(groupby_data_original)    # positive samples are scarce (5.8% of all rows): a severely imbalanced distribution

       col1  col2  col3  col4  col5
label                              
0.0     942   942   942   942   942
1.0      58    58    58    58    58


In [13]:
# Over-sample the minority class with SMOTE.
model_smote = SMOTE()
# Feed in the data and resample. fit_resample is the supported API; the older
# fit_sample alias was deprecated and is gone from recent imbalanced-learn releases.
x_smote_resampled, y_smote_resampled = model_smote.fit_resample(x, y)
# Rebuild labelled DataFrames so features and label can be joined and grouped.
x_smote_resampled = pd.DataFrame(x_smote_resampled, columns=['col1', 'col2', 'col3', 'col4', 'col5'])
y_smote_resampled = pd.DataFrame(y_smote_resampled, columns=['label'])
smote_resampled = pd.concat([x_smote_resampled, y_smote_resampled], axis=1)
# Per-class row counts after over-sampling.
groupby_data_smote = smote_resampled.groupby('label').count()
print(groupby_data_smote)
# smote_resampled

       col1  col2  col3  col4  col5
label                              
0.0     942   942   942   942   942
1.0     942   942   942   942   942


In [15]:
# Under-sample the majority class with RandomUnderSampler.
model_rus = RandomUnderSampler()
# fit_resample is the supported API; the older fit_sample alias was deprecated
# and is gone from recent imbalanced-learn releases.
x_rus_resampled, y_rus_resampled = model_rus.fit_resample(x, y)
# Rebuild labelled DataFrames so features and label can be joined and grouped.
x_rus_resampled = pd.DataFrame(x_rus_resampled, columns=['col1', 'col2', 'col3', 'col4', 'col5'])
y_rus_resampled = pd.DataFrame(y_rus_resampled, columns=['label'])
rus_resampled = pd.concat([x_rus_resampled, y_rus_resampled], axis=1)
# Per-class row counts after under-sampling.
groupby_data_rus = rus_resampled.groupby('label').count()
print(groupby_data_rus)
# rus_resampled

       col1  col2  col3  col4  col5
label                              
0.0      58    58    58    58    58
1.0      58    58    58    58    58


In [17]:
# Handle the imbalanced samples through the SVM's class-weight setting
# instead of resampling the data; the classifier is trained on the original x, y.
model_svm = SVC(
    gamma='scale',
    class_weight='balanced',
).fit(x, y)    # fit returns the fitted estimator itself

# Bare expression so the notebook displays the fitted model's parameters.
model_svm

SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)