In [7]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer #我们使用TF-IDF
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report 
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import hstack, csr_matrix #引入必要的库

# Keywords treated as suspicious when they occur inside a URL. Defined once at
# module level so the collection is not rebuilt for every URL processed.
_SUSPICIOUS_KEYWORDS = (
    'login', 'secure', 'account', 'verify', 'password', 'update', 'banking',
)


def extract_features(url):
    """Compute hand-crafted structural features for a single URL.

    Args:
        url: The URL to describe. Any object is accepted; it is converted
            with ``str()`` first, so e.g. ``None`` is treated as ``"None"``.

    Returns:
        A dict with these integer entries, in this order:
            url_length: number of characters in the URL.
            count_dots: number of ``.`` characters.
            count_hyphens: number of ``-`` characters.
            count_suspicious_words: how many distinct suspicious keywords
                occur (case-insensitively) as substrings of the URL. Each
                keyword is counted at most once, however often it appears.
    """
    url = str(url)
    # Lower-case once instead of once per keyword.
    lowered = url.lower()
    return {
        'url_length': len(url),
        'count_dots': url.count('.'),
        'count_hyphens': url.count('-'),
        'count_suspicious_words': sum(
            keyword in lowered for keyword in _SUSPICIOUS_KEYWORDS
        ),
    }

def _load_url_dataset(path, encoding):
    """Read a CSV that has 'url' and 'label' columns and drop unusable rows.

    Rows with a missing url or label, and rows whose url is blank after
    stripping whitespace, are removed. The same cleaning is applied to the
    training and the test file so that neither the vectorizer nor the metric
    computation receives missing values.
    """
    frame = pd.read_csv(path, encoding=encoding)
    frame = frame.dropna(subset=['url', 'label'])
    return frame[frame['url'].astype(str).str.strip() != '']


# ---- Training data -------------------------------------------------------
data = _load_url_dataset('E:/作业/杂物/数据集/111_fixed.csv', 'ISO-8859-1')
x = data['url']
y = data['label']

# Structural features (one row per URL) and TF-IDF features of the raw URL text.
xx = pd.DataFrame([extract_features(u) for u in x])
vec = TfidfVectorizer()
x_vec = vec.fit_transform(x)

# Scale the structural features into [0, 1] so that large raw values such as
# the URL length do not dominate the TF-IDF columns.
scaler = MinMaxScaler()
structured_train_scaled = scaler.fit_transform(xx)
# x_vec is sparse, so the dense scaled block is converted before stacking.
structured_train_sparse = csr_matrix(structured_train_scaled)
# Final training matrix: TF-IDF columns followed by the structural columns.
x_final = hstack([x_vec, structured_train_sparse])

bys = MultinomialNB()
bys.fit(x_final, y)

# ---- Test data -----------------------------------------------------------
# Cleaned exactly like the training set; previously a missing url or label in
# this file would have made the transform / report step fail.
data2 = _load_url_dataset('E:/作业/杂物/数据集/balanced_test_set.csv', 'gb18030')
x1 = data2['url']
y1 = data2['label']
xx1 = pd.DataFrame([extract_features(u) for u in x1])
# Reuse the vectorizer fitted on the training data (transform only, no refit).
x1_vec = vec.transform(x1)

# Reuse the scaler fitted on the training data (no refit). A test value outside
# the training min/max would scale to below 0 or above 1; MultinomialNB is meant
# for non-negative features, so clip to the range seen during training.
structured_scaled2 = np.clip(scaler.transform(xx1), 0.0, 1.0)
structured_sparse2 = csr_matrix(structured_scaled2)
x_final2 = hstack([x1_vec, structured_sparse2])

y_predict = bys.predict(x_final2)

# Per-class precision / recall / F1 on the test set.
report = classification_report(y1, y_predict)
print(f"正确率是：{report}")

正确率是：              precision    recall  f1-score   support

           0       0.91      0.68      0.77     50000
           1       0.74      0.93      0.83     50000

    accuracy                           0.80    100000
   macro avg       0.82      0.80      0.80    100000
weighted avg       0.82      0.80      0.80    100000

