##### 导入包

In [1]:
import os
import zipfile
import pandas as pd
import gc
import glob
from sqlalchemy import create_engine
from datetime import timedelta
from sklearn.model_selection import train_test_split

##### 读取数据集

In [8]:
# 读取2018年12月磁盘数据集
# 读取csv文件
smartlog_data_df = pd.read_csv('smartlog_data_201812.csv')
# 删除空值
smartlog_data_df = smartlog_data_df.dropna(axis=1, how='all')

In [9]:
#读取错误标签数据集
fault_tag_df = pd.read_csv('fault_tag_data.csv')
fault_tag_df = fault_tag_df.sort_values('fault_time')
fault_tag_df['fault_time'] = pd.to_datetime(fault_tag_df['fault_time'])
# 将同一个模型的数据归结到一起
fault_tag_A_df = fault_tag_df[fault_tag_df['model'] == 1]
fault_tag_B_df = fault_tag_df[fault_tag_df['model'] == 2]
fault_tag_A_df = fault_tag_A_df[['serial_number','fault_time']]
fault_tag_B_df = fault_tag_B_df[['serial_number','fault_time']]
fault_tag_A_df

Unnamed: 0,serial_number,fault_time
2320,disk_125207,2017-07-02
1214,disk_133302,2017-07-04
2004,disk_72870,2017-07-05
2384,disk_19440,2017-07-05
2326,disk_127633,2017-07-06
...,...,...
37,disk_114679,2018-12-31
509,disk_113846,2018-12-31
2298,disk_114247,2018-12-31
566,disk_143012,2018-12-31


In [10]:
smartlog_data_A_df = smartlog_data_df[smartlog_data_df['model'] == 1]
smartlog_data_B_df = smartlog_data_df[smartlog_data_df['model'] == 2]
start_date = pd.to_datetime('2018-12-26')
end_date = pd.to_datetime('2018-12-31')
# 筛选健康磁盘信息
smartlog_data_A_df = smartlog_data_A_df[~smartlog_data_A_df['serial_number'].isin(fault_tag_A_df['fault_time'])]
smartlog_data_B_df = smartlog_data_B_df[~smartlog_data_B_df['serial_number'].isin(fault_tag_B_df['fault_time'])]
smartlog_data_A_df['dt'] = pd.to_datetime(smartlog_data_A_df['dt'], format='%Y%m%d')
smartlog_data_B_df['dt'] = pd.to_datetime(smartlog_data_B_df['dt'], format='%Y%m%d')
smartlog_data_A_df = smartlog_data_A_df[(smartlog_data_A_df['dt'] >= start_date) & (smartlog_data_A_df['dt'] <= end_date)]
smartlog_data_B_df = smartlog_data_B_df[(smartlog_data_B_df['dt'] >= start_date) & (smartlog_data_B_df['dt'] <= end_date)]
# 将健康磁盘的数据与故障磁盘数据数目保持一直
N = 50000  # 用你想要的磁盘数量替换
unique_disks_A = smartlog_data_A_df['serial_number'].unique()[:N]
unique_disks_B = smartlog_data_B_df['serial_number'].unique()[:N]
smartlog_data_A_df = smartlog_data_A_df[smartlog_data_A_df['serial_number'].isin(unique_disks_A)]
smartlog_data_B_df = smartlog_data_B_df[smartlog_data_B_df['serial_number'].isin(unique_disks_B)]
# 选取属性进行测试
smartlog_data_A_df = smartlog_data_A_df[['serial_number', 'model', 'smart_5raw', 'smart_4raw', 'smart_192raw', 'smart_12raw', 'smart_5_normalized', 'smart_7_normalized', 'dt']]
smartlog_data_B_df = smartlog_data_B_df[['serial_number', 'model', 'smart_9_normalized', 'smart_191_normalized', 'smart_192raw', 'smart_195_normalized', 'smart_5raw', 'smart_199raw', 'smart_190_normalized', 'smart_188raw', 'smart_194_normalized', 'smart_194raw', 'smart_190raw' , 'smart_191raw', 'smart_7_normalized', 'smart_9raw', 'smart_7raw', 'smart_242raw', 'smart_241raw', 'dt']]
smartlog_data_A_df

Unnamed: 0,serial_number,model,smart_5raw,smart_4raw,smart_192raw,smart_12raw,smart_5_normalized,smart_7_normalized,dt
0,disk_59274,1,0.0,13.0,12.0,13.0,100.0,95.0,2018-12-26
5,disk_59284,1,0.0,14.0,13.0,14.0,100.0,94.0,2018-12-29
9,disk_59287,1,0.0,13.0,12.0,13.0,100.0,93.0,2018-12-29
10,disk_59289,1,0.0,13.0,12.0,13.0,100.0,96.0,2018-12-28
13,disk_59294,1,0.0,29.0,28.0,29.0,100.0,93.0,2018-12-26
...,...,...,...,...,...,...,...,...,...
4100918,disk_60284,1,0.0,12.0,11.0,12.0,100.0,93.0,2018-12-26
4100932,disk_60304,1,0.0,14.0,13.0,14.0,100.0,94.0,2018-12-28
4100935,disk_60312,1,0.0,11.0,10.0,11.0,100.0,94.0,2018-12-27
4100941,disk_60331,1,0.0,15.0,14.0,15.0,100.0,93.0,2018-12-27


In [11]:
print(len(smartlog_data_A_df))
print(len(smartlog_data_B_df))

284148
296150


In [12]:
# 清内存
del smartlog_data_df
gc.collect()

0

In [None]:
# 将数据存储到CSV文件中去
smartlog_data_A_df.to_csv('smartlog_data_A_df.csv', index=False)
smartlog_data_B_df.to_csv('smartlog_data_B_df.csv', index=False)