# Action1 男女声音识别

In [4]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb

In [5]:
dataframe = pd.read_csv('/Volumes/Library/SynologyDrive/data/AI_Cheats/voice/voice.csv')
dataframe.head()

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,...,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,label
0,0.059781,0.064241,0.032027,0.015071,0.090193,0.075122,12.863462,274.402906,0.893369,0.491918,...,0.059781,0.084279,0.015702,0.275862,0.007812,0.007812,0.007812,0.0,0.0,male
1,0.066009,0.06731,0.040229,0.019414,0.092666,0.073252,22.423285,634.613855,0.892193,0.513724,...,0.066009,0.107937,0.015826,0.25,0.009014,0.007812,0.054688,0.046875,0.052632,male
2,0.077316,0.083829,0.036718,0.008701,0.131908,0.123207,30.757155,1024.927705,0.846389,0.478905,...,0.077316,0.098706,0.015656,0.271186,0.00799,0.007812,0.015625,0.007812,0.046512,male
3,0.151228,0.072111,0.158011,0.096582,0.207955,0.111374,1.232831,4.177296,0.963322,0.727232,...,0.151228,0.088965,0.017798,0.25,0.201497,0.007812,0.5625,0.554688,0.247119,male
4,0.13512,0.079146,0.124656,0.07872,0.206045,0.127325,1.101174,4.333713,0.971955,0.783568,...,0.13512,0.106398,0.016931,0.266667,0.712812,0.007812,5.484375,5.476562,0.208274,male


In [6]:
# 缺失值个数
print(dataframe.isnull().sum())
# 矩阵的大小
print(dataframe.shape)

meanfreq    0
sd          0
median      0
Q25         0
Q75         0
IQR         0
skew        0
kurt        0
sp.ent      0
sfm         0
mode        0
centroid    0
meanfun     0
minfun      0
maxfun      0
meandom     0
mindom      0
maxdom      0
dfrange     0
modindx     0
label       0
dtype: int64
(3168, 21)


In [7]:
print('男性个数:{}'.format(dataframe[dataframe.label=='male'].shape[0]))
print('女性个数:{}'.format(dataframe[dataframe.label=='female'].shape[0]))

男性个数:1584
女性个数:1584


In [8]:
# 提取特征列，目标列 target
X = dataframe.iloc[:, :-1]
y = dataframe.iloc[:, -1]

In [9]:
# 使用标签编码
gender_encoder = LabelEncoder()
print(y)
y = gender_encoder.fit_transform(y)
print(y)

0         male
1         male
2         male
3         male
4         male
         ...  
3163    female
3164    female
3165    female
3166    female
3167    female
Name: label, Length: 3168, dtype: object
[1 1 1 ... 0 0 0]


In [10]:
# 数据归一化
scaler = StandardScaler()
# 对原时特征进行归一化
X = scaler.fit_transform(X)
print(X)


[[-4.04924806  0.4273553  -4.22490077 ... -1.43142165 -1.41913712
  -1.45477229]
 [-3.84105325  0.6116695  -3.99929342 ... -1.41810716 -1.4058184
  -1.01410294]
 [-3.46306647  1.60384791 -4.09585052 ... -1.42920257 -1.41691733
  -1.06534356]
 ...
 [-1.29877326  2.32272355 -0.05197279 ... -0.5992661  -0.58671739
   0.17588664]
 [-1.2452018   2.012196   -0.01772849 ... -0.41286326 -0.40025537
   1.14916112]
 [-0.51474626  2.14765111 -0.07087873 ... -1.27608595 -1.2637521
   1.47567886]]


In [11]:
param = {
    'boosting_type': 'gbdt',
    'objective': 'binary:logistic', #
    'eval_metric': 'auc',
    'eta': 0.1,
    'max_depth': 1,
    'colsample_bytree': 0.8,
    'subsample': 0.9,
    'subsample_freq': 7,
    'alpha': 0,
    'lambda': 0,
}

In [12]:
# 数据集切分
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.2, random_state=2023)

In [13]:
train_data = xgb.DMatrix(X_train, label=y_train)
test_data = xgb.DMatrix(X_test, label=y_test)

In [14]:
model = xgb.train(param, train_data, evals=[(train_data, 'train'), (test_data, 'valid')], num_boost_round = 100000, early_stopping_rounds=100, verbose_eval=5)
y_pred = model.predict(test_data)
y_pred = [1 if x>=0.5 else 0 for x in y_pred]
print('XGBoost 预测结果：{}, \n 准确率: {}'.format(y_test, accuracy_score(y_test, y_test)))

[0]	train-auc:0.95815	valid-auc:0.95209
[5]	train-auc:0.97504	valid-auc:0.96278
[10]	train-auc:0.98285	valid-auc:0.97298
[15]	train-auc:0.99263	valid-auc:0.98828
[20]	train-auc:0.99390	valid-auc:0.98890
[25]	train-auc:0.99404	valid-auc:0.98902
[30]	train-auc:0.99436	valid-auc:0.98927
[35]	train-auc:0.99535	valid-auc:0.99090
[40]	train-auc:0.99553	valid-auc:0.99108
[45]	train-auc:0.99639	valid-auc:0.99221
[50]	train-auc:0.99659	valid-auc:0.99242
[55]	train-auc:0.99672	valid-auc:0.99265
[60]	train-auc:0.99705	valid-auc:0.99297
[65]	train-auc:0.99729	valid-auc:0.99300


[70]	train-auc:0.99751	valid-auc:0.99350


Parameters: { "boosting_type", "subsample_freq" } are not used.



[75]	train-auc:0.99763	valid-auc:0.99349
[80]	train-auc:0.99776	valid-auc:0.99371
[85]	train-auc:0.99790	valid-auc:0.99379
[90]	train-auc:0.99815	valid-auc:0.99380
[95]	train-auc:0.99823	valid-auc:0.99384
[100]	train-auc:0.99832	valid-auc:0.99402
[105]	train-auc:0.99836	valid-auc:0.99402
[110]	train-auc:0.99847	valid-auc:0.99411
[115]	train-auc:0.99858	valid-auc:0.99424
[120]	train-auc:0.99876	valid-auc:0.99426
[125]	train-auc:0.99886	valid-auc:0.99420
[130]	train-auc:0.99884	valid-auc:0.99412
[135]	train-auc:0.99899	valid-auc:0.99416
[140]	train-auc:0.99913	valid-auc:0.99420
[145]	train-auc:0.99920	valid-auc:0.99414
[150]	train-auc:0.99936	valid-auc:0.99420
[155]	train-auc:0.99937	valid-auc:0.99424
[160]	train-auc:0.99942	valid-auc:0.99424
[165]	train-auc:0.99953	valid-auc:0.99429
[170]	train-auc:0.99954	valid-auc:0.99431
[175]	train-auc:0.99957	valid-auc:0.99435
[180]	train-auc:0.99959	valid-auc:0.99428
[185]	train-auc:0.99964	valid-auc:0.99430
[190]	train-auc:0.99968	valid-auc:0.994