In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the voice dataset and compare the two classes on the IQR / meanfun plane.
df = pd.read_csv('voice.csv')

# Rows 0-1583 are male and rows 1584 onward are female (the file is assumed to
# be sorted by class -- verify against the label column if the CSV changes).
# The end of an iloc slice is exclusive, so the male slice must stop at 1584;
# stopping at 1583 silently dropped row 1583 from both groups.
male = df.iloc[:1584, :]
male_x1 = male['IQR']
male_x2 = male['meanfun']

female = df.iloc[1584:, :]
female_x1 = female['IQR']
female_x2 = female['meanfun']

# One scatter layer per class, semi-transparent so overlapping points stay visible.
plt.figure()
plt.scatter(male_x1, male_x2, c='b', alpha=0.5, label='male')
plt.scatter(female_x1, female_x2, c='r', alpha=0.5, marker="p", label='female')
plt.xlabel('IQR')
plt.ylabel('meanfun')
plt.legend(loc='upper right')

plt.show()


In [None]:
# Feature matrix: the columns at positions 5 and 12, i.e. the 6th and 13th
# columns of the file (presumably IQR and meanfun -- verify against the CSV header).
feature_positions = [5, 12]
x = df.iloc[:, feature_positions]

# Target: the class label column.
y = df['label']

# Preview the first rows of the selected features.
x.head()

In [None]:
# Replace the two string classes (male / female) with a 0/1 dummy variable.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [None]:
# Split the dataset into training and test sets; 20% is held out for testing.
from sklearn.model_selection import train_test_split
# The shuffle seed is fixed so the split -- and every score computed from it in
# later cells -- is the same on each Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)


# Standardize the features. The scaler statistics are fitted on the training
# set only and then applied to the test set, so no test-set information leaks
# into preprocessing.
from sklearn.preprocessing import StandardScaler
scaler1 = StandardScaler()
x_train = scaler1.fit_transform(x_train)
x_test = scaler1.transform(x_test)



In [None]:
# Fit five classifiers on the training split and store each model's
# test-set predictions in y_pred1 .. y_pred5 for the evaluation cells below.
from sklearn.metrics import classification_report, accuracy_score

### 1) Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
DTmodel = DecisionTreeClassifier(random_state=1)
DTmodel.fit(x_train, y_train)
y_pred1 = DTmodel.predict(x_test)

### 2) Random Forest
from sklearn.ensemble import RandomForestClassifier
RFmodel = RandomForestClassifier(random_state=200)
RFmodel.fit(x_train, y_train)
y_pred2 = RFmodel.predict(x_test)

### 3) Logistic Regression
from sklearn.linear_model import LogisticRegression
LRmodel = LogisticRegression(max_iter=10000)
LRmodel.fit(x_train, y_train)
y_pred3 = LRmodel.predict(x_test)

### 4) KNN Classifier
from sklearn.neighbors import KNeighborsClassifier
# minkowski with p=1 is the Manhattan (L1) distance.
KNmodel = KNeighborsClassifier(n_neighbors=15, metric='minkowski', p=1)
KNmodel.fit(x_train, y_train)
y_pred4 = KNmodel.predict(x_test)

### 5) SVM Model
from sklearn.svm import SVC
# probability=True makes predict_proba available (used for the ROC curve).
# The probability calibration shuffles the data internally, so it is seeded
# here to keep the SVM's probabilities reproducible like the other models.
SVMmodel = SVC(kernel='linear', probability=True, random_state=1)
SVMmodel.fit(x_train, y_train)
y_pred5 = SVMmodel.predict(x_test)


In [None]:
# Report each model's test-set accuracy. accuracy_score expects
# (y_true, y_pred) in that order; accuracy is symmetric, but the documented
# order is used so the calls stay correct if copied to an asymmetric metric.
print("Decision Tree Accuracy score: ",accuracy_score(y_test,y_pred1)*100,"%")
# Cross-check for the random forest: model.score returns the same accuracy as
# a fraction rather than a percentage.
print('score of Random forest model is: ',RFmodel.score(x_test,y_test))
print("Random Forest Accuracy score: ",accuracy_score(y_test,y_pred2)*100,"%")
print("Log Reg Accuracy score: ",accuracy_score(y_test,y_pred3)*100,"%")
print("KNN Accuracy score: ",accuracy_score(y_test,y_pred4)*100,"%")
print("SVM Accuracy score: ",accuracy_score(y_test,y_pred5)*100,"%")

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt


def _draw_roc(model, line_style, label_fmt):
    """Plot one fitted classifier's ROC curve on the current figure.

    Uses the notebook-level ``x_test`` / ``y_test`` split.

    Parameters:
        model: fitted classifier exposing ``predict_proba``.
        line_style: matplotlib format/colour argument for the curve.
        label_fmt: %-format string for the legend entry; receives the AUC.

    Returns:
        (class probabilities, column-1 scores, fpr, tpr, auc) so the caller
        can keep the intermediate results under its own names.
    """
    # Probabilities for every class; each row sums to 1.
    class_probabilities = model.predict_proba(x_test)
    # Column 1 holds the probability of the class encoded as 1.
    positive_scores = class_probabilities[:, 1]
    false_pos, true_pos, _thresholds = roc_curve(y_test, positive_scores)
    area = auc(false_pos, true_pos)
    plt.plot(false_pos, true_pos, line_style, label=label_fmt % area)
    return class_probabilities, positive_scores, false_pos, true_pos, area


fig = plt.figure(figsize=(10, 7))

# ---- Decision tree ----
(prob_predict_y_validation_DTmodel, predictions_validation_DTmodel,
 fpr_DTmodel, tpr_DTmodel, roc_auc_DTmodel) = _draw_roc(
    DTmodel, 'b', 'DTmodel = %0.2f')

# ---- Random forest ----
(prob_predict_y_validation_RFmodel, predictions_validation_RFmodel,
 fpr_RFmodel, tpr_RFmodel, roc_auc_RFmodel) = _draw_roc(
    RFmodel, '#9932CC', 'RFmodel = %0.2f')

# ---- Logistic regression ----
(prob_predict_y_validation_LRmodel, predictions_validation_LRmodel,
 fpr_LRmodel, tpr_LRmodel, roc_auc_LRmodel) = _draw_roc(
    LRmodel, 'g', 'LRmodel = %0.2f')

# ---- KNN ----
(prob_predict_y_validation_KNmodel, predictions_validation_KNmodel,
 fpr_KNmodel, tpr_KNmodel, roc_auc_KNmodel) = _draw_roc(
    KNmodel, 'y', 'KNmodel = %0.2f')

# ---- SVM ----
(prob_predict_y_validation_SVMmodel, predictions_validation_SVMmodel,
 fpr_SVMmodel, tpr_SVMmodel, roc_auc_SVMmodel) = _draw_roc(
    SVMmodel, 'm', 'SVMmodel = %0.2f')

# ---- Settings shared by all curves ----
plt.title('ROC Validation')
plt.legend(loc='lower right')
# Chance-level diagonal; drawn after the legend and unlabelled, so it gets no legend entry.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')


In [None]:
# Build `a`, a mapping from model name to test-set accuracy in percent.
# list1 and d are parallel lists: d[i] names the model that produced list1[i].
list1 = [y_pred1, y_pred2, y_pred3, y_pred4, y_pred5]
d = ['DecTree', 'RandomForest', 'Log Regression', 'KNN', 'SuppVecMachine']

# accuracy_score takes (y_true, y_pred) in that order.
list2 = [accuracy_score(y_test, predictions) * 100 for predictions in list1]

# Pair names with scores directly instead of tracking a manual index counter.
a = dict(zip(d, list2))

In [None]:
# Print the dict `a`, which maps each model name to its accuracy in percent.
print("List of all model accuracies:\n",a)

In [None]:
# Print the key of `a` whose accuracy is highest; if several models tie,
# max() returns the first of them in the dict's iteration order.
print("the most accurate model is:", max(a, key=a.get))