#### 导入包

In [1]:
import os
import zipfile
import pandas as pd
import gc
import glob
from sqlalchemy import create_engine
from datetime import timedelta
from sklearn.model_selection import train_test_split

#### 读取数据集

In [2]:
#读取错误标签数据集
fault_tag_df = pd.read_csv('fault_tag_data.csv')
fault_tag_df = fault_tag_df.sort_values('fault_time')
fault_tag_df['fault_time'] = pd.to_datetime(fault_tag_df['fault_time'])
# 将同一个模型的数据归结到一起
fault_tag_A_df = fault_tag_df[fault_tag_df['model'] == 1]
fault_tag_B_df = fault_tag_df[fault_tag_df['model'] == 2]
fault_tag_A_df = fault_tag_A_df[['serial_number','fault_time']]
fault_tag_B_df = fault_tag_B_df[['serial_number','fault_time']]
fault_tag_A_df

Unnamed: 0,serial_number,fault_time
2320,disk_125207,2017-07-02
1214,disk_133302,2017-07-04
2004,disk_72870,2017-07-05
2384,disk_19440,2017-07-05
2326,disk_127633,2017-07-06
...,...,...
37,disk_114679,2018-12-31
509,disk_113846,2018-12-31
2298,disk_114247,2018-12-31
566,disk_143012,2018-12-31


In [21]:
# 读取2017年07月磁盘数据集
# 读取csv文件
smartlog_data_201707_fault_df = pd.read_csv('smartlog_data_201707.csv')
# 删除空值
smartlog_data_201707_fault_df = smartlog_data_201707_fault_df.dropna(axis=1, how='all')
# 将dt的格式转化成时间序列，方便和故障时间进行比较
smartlog_data_201707_fault_df['dt'] = pd.to_datetime(smartlog_data_201707_fault_df['dt'], format='%Y%m%d')
# 将同一个模型的数据归结到一起
smartlog_data_201707_fault_A_df = smartlog_data_201707_fault_df[smartlog_data_201707_fault_df['model'] == 1]
smartlog_data_201707_fault_B_df = smartlog_data_201707_fault_df[smartlog_data_201707_fault_df['model'] == 2]
# 筛选发生故障的磁盘信息
smartlog_data_201707_fault_A_df = smartlog_data_201707_fault_A_df[smartlog_data_201707_fault_A_df['serial_number'].isin(fault_tag_A_df['serial_number'])]
smartlog_data_201707_fault_B_df = smartlog_data_201707_fault_B_df[smartlog_data_201707_fault_B_df['serial_number'].isin(fault_tag_B_df['serial_number'])]
# 混合数据集，方便比较
smartlog_data_201707_fault_A_df = pd.merge(smartlog_data_201707_fault_A_df, fault_tag_A_df, on='serial_number')
smartlog_data_201707_fault_B_df = pd.merge(smartlog_data_201707_fault_B_df, fault_tag_B_df, on='serial_number')
# 计算每个磁盘的最早一次观察时间
smartlog_data_201707_fault_A_df['first_observation_time'] = smartlog_data_201707_fault_A_df.groupby('serial_number')['dt'].transform('min')
smartlog_data_201707_fault_B_df['first_observation_time'] = smartlog_data_201707_fault_B_df.groupby('serial_number')['dt'].transform('min')
# 计算故障时间与最早一次观察时间的差值
smartlog_data_201707_fault_A_df['time_diff'] = (smartlog_data_201707_fault_A_df['fault_time'] - smartlog_data_201707_fault_A_df['first_observation_time']).dt.days
smartlog_data_201707_fault_B_df['time_diff'] = (smartlog_data_201707_fault_B_df['fault_time'] - smartlog_data_201707_fault_B_df['first_observation_time']).dt.days
# 筛选出那些最早的故障时间距离最后一次观察时间大于0天的磁盘的数据
smartlog_data_201707_fault_A_df = smartlog_data_201707_fault_A_df[smartlog_data_201707_fault_A_df['time_diff'] >= 0]
smartlog_data_201707_fault_B_df = smartlog_data_201707_fault_B_df[smartlog_data_201707_fault_B_df['time_diff'] >= 0]
# 选取属性进行测试
smartlog_data_201707_fault_A_df = smartlog_data_201707_fault_A_df[['serial_number', 'model', 'smart_5raw', 'smart_4raw', 'smart_192raw', 'smart_12raw', 'smart_5_normalized', 'smart_7_normalized', 'dt', 'fault_time', 'first_observation_time', 'time_diff']]
smartlog_data_201707_fault_B_df = smartlog_data_201707_fault_B_df[['serial_number', 'model', 'smart_9_normalized', 'smart_191_normalized', 'smart_192raw', 'smart_195_normalized', 'smart_5raw', 'smart_199raw', 'smart_190_normalized', 'smart_188raw', 'smart_194_normalized', 'smart_194raw', 'smart_190raw' , 'smart_191raw', 'smart_7_normalized', 'smart_9raw', 'smart_7raw', 'smart_242raw', 'smart_241raw', 'dt', 'fault_time', 'first_observation_time', 'time_diff']]
# 清除内存
del smartlog_data_201707_fault_df
gc.collect()
smartlog_data_201707_fault_A_df

Unnamed: 0,serial_number,model,smart_5raw,smart_4raw,smart_192raw,smart_12raw,smart_5_normalized,smart_7_normalized,dt,fault_time,first_observation_time,time_diff
0,disk_69492,1,0.0,5.0,4.0,5.0,100.0,90.0,2017-07-22,2017-10-27,2017-07-09,110
1,disk_69492,1,0.0,5.0,4.0,5.0,100.0,90.0,2017-07-19,2017-10-27,2017-07-09,110
2,disk_69492,1,0.0,5.0,4.0,5.0,100.0,90.0,2017-07-29,2017-10-27,2017-07-09,110
3,disk_69492,1,0.0,5.0,4.0,5.0,100.0,90.0,2017-07-16,2017-10-27,2017-07-09,110
4,disk_69492,1,0.0,5.0,4.0,5.0,100.0,90.0,2017-07-20,2017-10-27,2017-07-09,110
...,...,...,...,...,...,...,...,...,...,...,...,...
16866,disk_55811,1,0.0,11.0,10.0,11.0,100.0,93.0,2017-07-30,2018-12-06,2017-07-28,496
16867,disk_55811,1,0.0,11.0,10.0,11.0,100.0,93.0,2017-07-28,2018-12-06,2017-07-28,496
16868,disk_127134,1,0.0,5.0,4.0,5.0,100.0,90.0,2017-07-28,2018-09-08,2017-07-28,407
16869,disk_127134,1,0.0,5.0,4.0,5.0,100.0,90.0,2017-07-31,2018-09-08,2017-07-28,407


In [22]:
smartlog_data_201707_fault_B_df

Unnamed: 0,serial_number,model,smart_9_normalized,smart_191_normalized,smart_192raw,smart_195_normalized,smart_5raw,smart_199raw,smart_190_normalized,smart_188raw,...,smart_191raw,smart_7_normalized,smart_9raw,smart_7raw,smart_242raw,smart_241raw,dt,fault_time,first_observation_time,time_diff
0,disk_52458,2,98.0,100.0,12.0,100.0,0.0,0.0,70.0,0.0,...,202.0,73.0,2255.0,19019324.0,25054164.0,4.957761e+08,2017-07-09,2018-07-18,2017-07-09,374
1,disk_52458,2,97.0,100.0,12.0,100.0,0.0,0.0,69.0,0.0,...,202.0,73.0,2663.0,19788694.0,25054164.0,4.957761e+08,2017-07-26,2018-07-18,2017-07-09,374
2,disk_52458,2,98.0,100.0,12.0,100.0,0.0,0.0,70.0,0.0,...,202.0,73.0,2327.0,19155104.0,25054164.0,4.957761e+08,2017-07-12,2018-07-18,2017-07-09,374
3,disk_52458,2,98.0,100.0,12.0,100.0,0.0,0.0,69.0,0.0,...,202.0,73.0,2447.0,19381869.0,25054164.0,4.957761e+08,2017-07-17,2018-07-18,2017-07-09,374
4,disk_52458,2,98.0,100.0,12.0,100.0,0.0,0.0,70.0,0.0,...,202.0,73.0,2399.0,19291084.0,25054164.0,4.957761e+08,2017-07-15,2018-07-18,2017-07-09,374
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3559,disk_131817,2,100.0,100.0,14.0,10.0,0.0,0.0,73.0,0.0,...,49.0,71.0,77.0,13847248.0,19805210.0,9.551698e+07,2017-07-22,2018-10-26,2017-07-22,461
3560,disk_58222,2,100.0,100.0,11.0,2.0,0.0,0.0,73.0,0.0,...,2.0,72.0,242.0,14128824.0,16467152.0,5.754016e+06,2017-07-22,2018-07-03,2017-07-21,347
3561,disk_58222,2,100.0,100.0,11.0,2.0,0.0,0.0,73.0,0.0,...,2.0,71.0,218.0,14105101.0,16467152.0,5.754016e+06,2017-07-21,2018-07-03,2017-07-21,347
3562,disk_58222,2,100.0,99.0,11.0,30.0,110.0,0.0,66.0,0.0,...,2143.0,74.0,441.0,25199238.0,402472247.0,3.126624e+10,2017-07-31,2018-07-03,2017-07-21,347


In [23]:
# 查看201707每个故障磁盘有多少条数据
# 筛选总共有多少个磁盘名称
unique_disk_fault_A_201707_names = smartlog_data_201707_fault_A_df['serial_number'].unique()
unique_disk_fault_B_201707_names = smartlog_data_201707_fault_B_df['serial_number'].unique()
counts_fault_A_201707 = {}
counts_fault_B_201707 = {}
# 查找
for name in unique_disk_fault_A_201707_names:
    counts_fault_A_201707[name] = len(smartlog_data_201707_fault_A_df[smartlog_data_201707_fault_A_df['serial_number'] == name])
for name in unique_disk_fault_B_201707_names:
    counts_fault_B_201707[name] = len(smartlog_data_201707_fault_B_df[smartlog_data_201707_fault_B_df['serial_number'] == name])
# 打印
print(counts_fault_B_201707)
print(counts_fault_A_201707)


{'disk_52458': 16, 'disk_53124': 17, 'disk_53537': 17, 'disk_54194': 17, 'disk_54781': 15, 'disk_56104': 10, 'disk_57810': 19, 'disk_57894': 7, 'disk_58740': 3, 'disk_58805': 7, 'disk_58981': 7, 'disk_118008': 11, 'disk_118036': 17, 'disk_118353': 17, 'disk_118617': 14, 'disk_119050': 18, 'disk_119539': 14, 'disk_121338': 18, 'disk_121682': 17, 'disk_122368': 19, 'disk_122646': 18, 'disk_122766': 7, 'disk_122984': 15, 'disk_123887': 20, 'disk_124576': 17, 'disk_125154': 17, 'disk_126659': 17, 'disk_128734': 18, 'disk_128840': 18, 'disk_128986': 13, 'disk_129643': 5, 'disk_130816': 4, 'disk_130916': 11, 'disk_131129': 17, 'disk_132927': 5, 'disk_45353': 17, 'disk_45939': 17, 'disk_45957': 12, 'disk_46139': 12, 'disk_46148': 17, 'disk_46479': 13, 'disk_47067': 17, 'disk_47375': 13, 'disk_47510': 17, 'disk_47794': 18, 'disk_47904': 17, 'disk_47944': 13, 'disk_47990': 18, 'disk_48083': 17, 'disk_48111': 17, 'disk_48302': 18, 'disk_48786': 8, 'disk_49001': 10, 'disk_49268': 8, 'disk_49324':

In [19]:
# 读取2017年08月磁盘数据集
# 读取csv文件
smartlog_data_201708_fault_df = pd.read_csv('smartlog_data_201708.csv')
# 删除空值
smartlog_data_201708_fault_df = smartlog_data_201708_fault_df.dropna(axis=1, how='all')
# 将dt的格式转化成时间序列，方便和故障时间进行比较
smartlog_data_201708_fault_df['dt'] = pd.to_datetime(smartlog_data_201708_fault_df['dt'], format='%Y%m%d')
# 将同一个模型的数据归结到一起
smartlog_data_201708_fault_A_df = smartlog_data_201708_fault_df[smartlog_data_201708_fault_df['model'] == 1]
smartlog_data_201708_fault_B_df = smartlog_data_201708_fault_df[smartlog_data_201708_fault_df['model'] == 2]
# 筛选发生故障的磁盘信息
smartlog_data_201708_fault_A_df = smartlog_data_201708_fault_A_df[smartlog_data_201708_fault_A_df['serial_number'].isin(fault_tag_A_df['serial_number'])]
smartlog_data_201708_fault_B_df = smartlog_data_201708_fault_B_df[smartlog_data_201708_fault_B_df['serial_number'].isin(fault_tag_B_df['serial_number'])]
# 混合数据集，方便比较
smartlog_data_201708_fault_A_df = pd.merge(smartlog_data_201708_fault_A_df, fault_tag_A_df, on='serial_number')
smartlog_data_201708_fault_B_df = pd.merge(smartlog_data_201708_fault_B_df, fault_tag_B_df, on='serial_number')
# 计算每个磁盘的最早一次观察时间
smartlog_data_201708_fault_A_df['first_observation_time'] = smartlog_data_201708_fault_A_df.groupby('serial_number')['dt'].transform('min')
smartlog_data_201708_fault_B_df['first_observation_time'] = smartlog_data_201708_fault_B_df.groupby('serial_number')['dt'].transform('min')
# 计算故障时间与最早一次观察时间的差值
smartlog_data_201708_fault_A_df['time_diff'] = (smartlog_data_201708_fault_A_df['fault_time'] - smartlog_data_201708_fault_A_df['first_observation_time']).dt.days
smartlog_data_201708_fault_B_df['time_diff'] = (smartlog_data_201708_fault_B_df['fault_time'] - smartlog_data_201708_fault_B_df['first_observation_time']).dt.days
# 筛选出那些最早的故障时间距离最后一次观察时间大于0天的磁盘的数据
smartlog_data_201708_fault_A_df = smartlog_data_201708_fault_A_df[(smartlog_data_201708_fault_A_df['time_diff'] >= 0) | (smartlog_data_201708_fault_A_df['fault_time'] <= '2017-08-05')]
smartlog_data_201708_fault_B_df = smartlog_data_201708_fault_B_df[(smartlog_data_201708_fault_B_df['time_diff'] >= 0) | (smartlog_data_201708_fault_B_df['fault_time'] <= '2017-08-05')]
# 选取属性进行测试
smartlog_data_201708_fault_A_df = smartlog_data_201708_fault_A_df[['serial_number', 'model', 'smart_5raw', 'smart_4raw', 'smart_192raw', 'smart_12raw', 'smart_5_normalized', 'smart_7_normalized', 'dt', 'fault_time', 'first_observation_time', 'time_diff']]
smartlog_data_201708_fault_B_df = smartlog_data_201708_fault_B_df[['serial_number', 'model', 'smart_9_normalized', 'smart_191_normalized', 'smart_192raw', 'smart_195_normalized', 'smart_5raw', 'smart_199raw', 'smart_190_normalized', 'smart_188raw', 'smart_194_normalized', 'smart_194raw', 'smart_190raw' , 'smart_191raw', 'smart_7_normalized', 'smart_9raw', 'smart_7raw', 'smart_242raw', 'smart_241raw', 'dt', 'fault_time', 'first_observation_time', 'time_diff']]
# 清除内存
del smartlog_data_201708_fault_df
gc.collect()
smartlog_data_201708_fault_A_df


Unnamed: 0,serial_number,model,smart_5raw,smart_4raw,smart_192raw,smart_12raw,smart_5_normalized,smart_7_normalized,dt,fault_time,first_observation_time,time_diff
0,disk_100111,1,27.0,10.0,9.0,10.0,100.0,93.0,2017-08-07,2018-05-02,2017-08-01,274
1,disk_100111,1,34.0,10.0,9.0,10.0,100.0,93.0,2017-08-28,2018-05-02,2017-08-01,274
2,disk_100111,1,34.0,10.0,9.0,10.0,100.0,93.0,2017-08-31,2018-05-02,2017-08-01,274
3,disk_100111,1,27.0,10.0,9.0,10.0,100.0,93.0,2017-08-05,2018-05-02,2017-08-01,274
4,disk_100111,1,34.0,10.0,9.0,10.0,100.0,93.0,2017-08-29,2018-05-02,2017-08-01,274
...,...,...,...,...,...,...,...,...,...,...,...,...
28851,disk_138880,1,0.0,12.0,12.0,13.0,100.0,91.0,2017-08-02,2017-08-01,2017-08-01,0
28852,disk_138880,1,0.0,12.0,12.0,13.0,100.0,91.0,2017-08-01,2017-08-01,2017-08-01,0
28853,disk_91432,1,142.0,12.0,11.0,12.0,100.0,86.0,2017-08-01,2017-07-31,2017-08-01,-1
28854,disk_29894,1,0.0,6.0,5.0,6.0,100.0,81.0,2017-08-02,2017-08-03,2017-08-02,1


In [20]:
# 筛选总共有多少个磁盘名称
unique_disk_fault_A_201708_names = smartlog_data_201708_fault_A_df['serial_number'].unique()
unique_disk_fault_B_201708_names = smartlog_data_201708_fault_B_df['serial_number'].unique()
counts_fault_A_201708 = {}
counts_fault_B_201708 = {}
# 查找
for name in unique_disk_fault_A_201708_names:
    counts_fault_A_201708[name] = len(smartlog_data_201708_fault_A_df[smartlog_data_201708_fault_A_df['serial_number'] == name])
for name in unique_disk_fault_B_201708_names:
    counts_fault_B_201708[name] = len(smartlog_data_201708_fault_B_df[smartlog_data_201708_fault_B_df['serial_number'] == name])
# 打印
print(counts_fault_B_201708)
print(counts_fault_A_201708)

{'disk_49376': 31, 'disk_49447': 25, 'disk_49524': 31, 'disk_49551': 30, 'disk_49617': 27, 'disk_50389': 31, 'disk_50532': 30, 'disk_50686': 31, 'disk_50909': 30, 'disk_51173': 23, 'disk_51519': 15, 'disk_51685': 11, 'disk_51836': 31, 'disk_51931': 11, 'disk_51975': 30, 'disk_52433': 29, 'disk_52568': 29, 'disk_53716': 17, 'disk_53889': 25, 'disk_54006': 22, 'disk_54128': 22, 'disk_54194': 30, 'disk_54860': 24, 'disk_55360': 24, 'disk_56039': 30, 'disk_56104': 31, 'disk_56462': 31, 'disk_56470': 31, 'disk_56627': 29, 'disk_56701': 30, 'disk_56740': 31, 'disk_56968': 15, 'disk_57301': 10, 'disk_57314': 27, 'disk_57372': 31, 'disk_57942': 30, 'disk_57961': 30, 'disk_58222': 26, 'disk_58233': 10, 'disk_58373': 31, 'disk_58596': 29, 'disk_58740': 30, 'disk_58938': 28, 'disk_59119': 30, 'disk_59599': 15, 'disk_59652': 31, 'disk_59890': 28, 'disk_59948': 27, 'disk_60072': 29, 'disk_60275': 30, 'disk_60311': 26, 'disk_60402': 31, 'disk_60500': 27, 'disk_60555': 28, 'disk_60660': 31, 'disk_607

In [18]:
# 读取2017年09月磁盘数据集
# 读取csv文件
smartlog_data_201709_fault_df = pd.read_csv('smartlog_data_201709.csv')
# 删除空值
smartlog_data_201709_fault_df = smartlog_data_201709_fault_df.dropna(axis=1, how='all')
# 将dt的格式转化成时间序列，方便和故障时间进行比较
smartlog_data_201709_fault_df['dt'] = pd.to_datetime(smartlog_data_201709_fault_df['dt'], format='%Y%m%d')
# 将同一个模型的数据归结到一起
smartlog_data_201709_fault_A_df = smartlog_data_201709_fault_df[smartlog_data_201709_fault_df['model'] == 1]
smartlog_data_201709_fault_B_df = smartlog_data_201709_fault_df[smartlog_data_201709_fault_df['model'] == 2]
# 筛选发生故障的磁盘信息
smartlog_data_201709_fault_A_df = smartlog_data_201709_fault_A_df[smartlog_data_201709_fault_A_df['serial_number'].isin(fault_tag_A_df['serial_number'])]
smartlog_data_201709_fault_B_df = smartlog_data_201709_fault_B_df[smartlog_data_201709_fault_B_df['serial_number'].isin(fault_tag_B_df['serial_number'])]
# 混合数据集，方便比较
smartlog_data_201709_fault_A_df = pd.merge(smartlog_data_201709_fault_A_df, fault_tag_A_df, on='serial_number')
smartlog_data_201709_fault_B_df = pd.merge(smartlog_data_201709_fault_B_df, fault_tag_B_df, on='serial_number')
# 计算每个磁盘的最早一次观察时间
smartlog_data_201709_fault_A_df['first_observation_time'] = smartlog_data_201709_fault_A_df.groupby('serial_number')['dt'].transform('min')
smartlog_data_201709_fault_B_df['first_observation_time'] = smartlog_data_201709_fault_B_df.groupby('serial_number')['dt'].transform('min')
# 计算故障时间与最早一次观察时间的差值
smartlog_data_201709_fault_A_df['time_diff'] = (smartlog_data_201709_fault_A_df['fault_time'] - smartlog_data_201709_fault_A_df['first_observation_time']).dt.days
smartlog_data_201709_fault_B_df['time_diff'] = (smartlog_data_201709_fault_B_df['fault_time'] - smartlog_data_201709_fault_B_df['first_observation_time']).dt.days
# 筛选出那些最早的故障时间距离最后一次观察时间大于0天的磁盘的数据
smartlog_data_201709_fault_A_df = smartlog_data_201709_fault_A_df[(smartlog_data_201709_fault_A_df['time_diff'] >= 0) | (smartlog_data_201709_fault_A_df['fault_time'] <= '2017-09-05')]
smartlog_data_201709_fault_B_df = smartlog_data_201709_fault_B_df[(smartlog_data_201709_fault_B_df['time_diff'] >= 0) | (smartlog_data_201709_fault_B_df['fault_time'] <= '2017-09-05')]
# 选取属性进行测试
smartlog_data_201709_fault_A_df = smartlog_data_201709_fault_A_df[['serial_number', 'model', 'smart_5raw', 'smart_4raw', 'smart_192raw', 'smart_12raw', 'smart_5_normalized', 'smart_7_normalized', 'dt', 'fault_time', 'first_observation_time', 'time_diff']]
smartlog_data_201709_fault_B_df = smartlog_data_201709_fault_B_df[['serial_number', 'model', 'smart_9_normalized', 'smart_191_normalized', 'smart_192raw', 'smart_195_normalized', 'smart_5raw', 'smart_199raw', 'smart_190_normalized', 'smart_188raw', 'smart_194_normalized', 'smart_194raw', 'smart_190raw' , 'smart_191raw', 'smart_7_normalized', 'smart_9raw', 'smart_7raw', 'smart_242raw', 'smart_241raw', 'dt', 'fault_time', 'first_observation_time', 'time_diff']]
# 清除内存
del smartlog_data_201709_fault_df
gc.collect()
smartlog_data_201709_fault_A_df

Unnamed: 0,serial_number,model,smart_5raw,smart_4raw,smart_192raw,smart_12raw,smart_5_normalized,smart_7_normalized,dt,fault_time,first_observation_time,time_diff
0,disk_68474,1,796.0,11.0,10.0,11.0,96.0,91.0,2017-09-23,2018-04-22,2017-09-01,233
1,disk_68474,1,770.0,11.0,10.0,11.0,96.0,90.0,2017-09-18,2018-04-22,2017-09-01,233
2,disk_68474,1,728.0,11.0,10.0,11.0,96.0,90.0,2017-09-05,2018-04-22,2017-09-01,233
3,disk_68474,1,774.0,11.0,10.0,11.0,96.0,90.0,2017-09-19,2018-04-22,2017-09-01,233
4,disk_68474,1,762.0,11.0,10.0,11.0,96.0,90.0,2017-09-15,2018-04-22,2017-09-01,233
...,...,...,...,...,...,...,...,...,...,...,...,...
24648,disk_48558,1,0.0,26.0,21.0,23.0,100.0,93.0,2017-09-02,2017-09-04,2017-09-01,3
24649,disk_48558,1,0.0,26.0,21.0,23.0,100.0,93.0,2017-09-01,2017-09-04,2017-09-01,3
24650,disk_138123,1,0.0,34.0,33.0,34.0,100.0,81.0,2017-09-26,2018-03-21,2017-09-26,176
24651,disk_138123,1,0.0,34.0,33.0,34.0,100.0,81.0,2017-09-27,2018-03-21,2017-09-26,176


In [17]:
# 读取2017年10月磁盘数据集
# 读取csv文件
smartlog_data_201710_fault_df = pd.read_csv('smartlog_data_201710.csv')
# 删除空值
smartlog_data_201710_fault_df = smartlog_data_201710_fault_df.dropna(axis=1, how='all')
# 将dt的格式转化成时间序列，方便和故障时间进行比较
smartlog_data_201710_fault_df['dt'] = pd.to_datetime(smartlog_data_201710_fault_df['dt'], format='%Y%m%d')
# 将同一个模型的数据归结到一起
smartlog_data_201710_fault_A_df = smartlog_data_201710_fault_df[smartlog_data_201710_fault_df['model'] == 1]
smartlog_data_201710_fault_B_df = smartlog_data_201710_fault_df[smartlog_data_201710_fault_df['model'] == 2]
# 筛选发生故障的磁盘信息
smartlog_data_201710_fault_A_df = smartlog_data_201710_fault_A_df[smartlog_data_201710_fault_A_df['serial_number'].isin(fault_tag_A_df['serial_number'])]
smartlog_data_201710_fault_B_df = smartlog_data_201710_fault_B_df[smartlog_data_201710_fault_B_df['serial_number'].isin(fault_tag_B_df['serial_number'])]
# 混合数据集，方便比较
smartlog_data_201710_fault_A_df = pd.merge(smartlog_data_201710_fault_A_df, fault_tag_A_df, on='serial_number')
smartlog_data_201710_fault_B_df = pd.merge(smartlog_data_201710_fault_B_df, fault_tag_B_df, on='serial_number')
# 计算每个磁盘的最早一次观察时间
smartlog_data_201710_fault_A_df['first_observation_time'] = smartlog_data_201710_fault_A_df.groupby('serial_number')['dt'].transform('min')
smartlog_data_201710_fault_B_df['first_observation_time'] = smartlog_data_201710_fault_B_df.groupby('serial_number')['dt'].transform('min')
# 计算故障时间与最早一次观察时间的差值
smartlog_data_201710_fault_A_df['time_diff'] = (smartlog_data_201710_fault_A_df['fault_time'] - smartlog_data_201710_fault_A_df['first_observation_time']).dt.days
smartlog_data_201710_fault_B_df['time_diff'] = (smartlog_data_201710_fault_B_df['fault_time'] - smartlog_data_201710_fault_B_df['first_observation_time']).dt.days
# 筛选出那些最早的故障时间距离最后一次观察时间大于0天的磁盘的数据
smartlog_data_201710_fault_A_df = smartlog_data_201710_fault_A_df[(smartlog_data_201710_fault_A_df['time_diff'] >= 0) | (smartlog_data_201710_fault_A_df['fault_time'] <= '2017-10-05')]
smartlog_data_201710_fault_B_df = smartlog_data_201710_fault_B_df[(smartlog_data_201710_fault_B_df['time_diff'] >= 0) | (smartlog_data_201710_fault_B_df['fault_time'] <= '2017-10-05')]
# 选取属性进行测试
smartlog_data_201710_fault_A_df = smartlog_data_201710_fault_A_df[['serial_number', 'model', 'smart_5raw', 'smart_4raw', 'smart_192raw', 'smart_12raw', 'smart_5_normalized', 'smart_7_normalized', 'dt', 'fault_time', 'first_observation_time', 'time_diff']]
smartlog_data_201710_fault_B_df = smartlog_data_201710_fault_B_df[['serial_number', 'model', 'smart_9_normalized', 'smart_191_normalized', 'smart_192raw', 'smart_195_normalized', 'smart_5raw', 'smart_199raw', 'smart_190_normalized', 'smart_188raw', 'smart_194_normalized', 'smart_194raw', 'smart_190raw' , 'smart_191raw', 'smart_7_normalized', 'smart_9raw', 'smart_7raw', 'smart_242raw', 'smart_241raw', 'dt', 'fault_time', 'first_observation_time', 'time_diff']]
# 清除内存
del smartlog_data_201710_fault_df
gc.collect()
smartlog_data_201710_fault_A_df


Unnamed: 0,serial_number,model,smart_5raw,smart_4raw,smart_192raw,smart_12raw,smart_5_normalized,smart_7_normalized,dt,fault_time,first_observation_time,time_diff
0,disk_100265,1,0.0,23.0,22.0,23.0,100.0,84.0,2017-10-01,2018-11-01,2017-10-01,396
1,disk_100265,1,0.0,23.0,22.0,23.0,100.0,84.0,2017-10-23,2018-11-01,2017-10-01,396
2,disk_100265,1,0.0,23.0,22.0,23.0,100.0,84.0,2017-10-02,2018-11-01,2017-10-01,396
3,disk_100265,1,0.0,23.0,22.0,23.0,100.0,84.0,2017-10-03,2018-11-01,2017-10-01,396
4,disk_100265,1,0.0,23.0,22.0,23.0,100.0,84.0,2017-10-17,2018-11-01,2017-10-01,396
...,...,...,...,...,...,...,...,...,...,...,...,...
25886,disk_19626,1,0.0,20.0,19.0,20.0,100.0,85.0,2017-10-07,2018-09-20,2017-10-05,350
25887,disk_19626,1,0.0,20.0,19.0,20.0,100.0,85.0,2017-10-08,2018-09-20,2017-10-05,350
25888,disk_19626,1,0.0,20.0,19.0,20.0,100.0,85.0,2017-10-06,2018-09-20,2017-10-05,350
25889,disk_19626,1,0.0,20.0,19.0,20.0,100.0,85.0,2017-10-05,2018-09-20,2017-10-05,350


In [16]:
# 读取2017年11月磁盘数据集
# 读取csv文件
smartlog_data_201711_fault_df = pd.read_csv('smartlog_data_201711.csv')
# 删除空值
smartlog_data_201711_fault_df = smartlog_data_201711_fault_df.dropna(axis=1, how='all')
# 将dt的格式转化成时间序列，方便和故障时间进行比较
smartlog_data_201711_fault_df['dt'] = pd.to_datetime(smartlog_data_201711_fault_df['dt'], format='%Y%m%d')
# 将同一个模型的数据归结到一起
smartlog_data_201711_fault_A_df = smartlog_data_201711_fault_df[smartlog_data_201711_fault_df['model'] == 1]
smartlog_data_201711_fault_B_df = smartlog_data_201711_fault_df[smartlog_data_201711_fault_df['model'] == 2]
# 筛选发生故障的磁盘信息
smartlog_data_201711_fault_A_df = smartlog_data_201711_fault_A_df[smartlog_data_201711_fault_A_df['serial_number'].isin(fault_tag_A_df['serial_number'])]
smartlog_data_201711_fault_B_df = smartlog_data_201711_fault_B_df[smartlog_data_201711_fault_B_df['serial_number'].isin(fault_tag_B_df['serial_number'])]
# 混合数据集，方便比较
smartlog_data_201711_fault_A_df = pd.merge(smartlog_data_201711_fault_A_df, fault_tag_A_df, on='serial_number')
smartlog_data_201711_fault_B_df = pd.merge(smartlog_data_201711_fault_B_df, fault_tag_B_df, on='serial_number')
# 计算每个磁盘的最早一次观察时间
smartlog_data_201711_fault_A_df['first_observation_time'] = smartlog_data_201711_fault_A_df.groupby('serial_number')['dt'].transform('min')
smartlog_data_201711_fault_B_df['first_observation_time'] = smartlog_data_201711_fault_B_df.groupby('serial_number')['dt'].transform('min')
# 计算故障时间与最早一次观察时间的差值
smartlog_data_201711_fault_A_df['time_diff'] = (smartlog_data_201711_fault_A_df['fault_time'] - smartlog_data_201711_fault_A_df['first_observation_time']).dt.days
smartlog_data_201711_fault_B_df['time_diff'] = (smartlog_data_201711_fault_B_df['fault_time'] - smartlog_data_201711_fault_B_df['first_observation_time']).dt.days
# 筛选出那些最早的故障时间距离最后一次观察时间大于0天的磁盘的数据
smartlog_data_201711_fault_A_df = smartlog_data_201711_fault_A_df[(smartlog_data_201711_fault_A_df['time_diff'] >= 0) | (smartlog_data_201711_fault_A_df['fault_time'] <= '2017-11-05')]
smartlog_data_201711_fault_B_df = smartlog_data_201711_fault_B_df[(smartlog_data_201711_fault_B_df['time_diff'] >= 0) | (smartlog_data_201711_fault_B_df['fault_time'] <= '2017-11-05')]
# 选取属性进行测试
smartlog_data_201711_fault_A_df = smartlog_data_201711_fault_A_df[['serial_number', 'model', 'smart_5raw', 'smart_4raw', 'smart_192raw', 'smart_12raw', 'smart_5_normalized', 'smart_7_normalized', 'dt', 'fault_time', 'first_observation_time', 'time_diff']]
smartlog_data_201711_fault_B_df = smartlog_data_201711_fault_B_df[['serial_number', 'model', 'smart_9_normalized', 'smart_191_normalized', 'smart_192raw', 'smart_195_normalized', 'smart_5raw', 'smart_199raw', 'smart_190_normalized', 'smart_188raw', 'smart_194_normalized', 'smart_194raw', 'smart_190raw' , 'smart_191raw', 'smart_7_normalized', 'smart_9raw', 'smart_7raw', 'smart_242raw', 'smart_241raw', 'dt', 'fault_time', 'first_observation_time', 'time_diff']]
# 清除内存
del smartlog_data_201711_fault_df
gc.collect()
smartlog_data_201711_fault_A_df

Unnamed: 0,serial_number,model,smart_5raw,smart_4raw,smart_192raw,smart_12raw,smart_5_normalized,smart_7_normalized,dt,fault_time,first_observation_time,time_diff
0,disk_17921,1,18.0,13.0,12.0,13.0,100.0,87.0,2017-11-09,2018-12-10,2017-11-01,404
1,disk_17921,1,21.0,13.0,12.0,13.0,100.0,87.0,2017-11-27,2018-12-10,2017-11-01,404
2,disk_17921,1,19.0,13.0,12.0,13.0,100.0,87.0,2017-11-17,2018-12-10,2017-11-01,404
3,disk_17921,1,19.0,13.0,12.0,13.0,100.0,87.0,2017-11-24,2018-12-10,2017-11-01,404
4,disk_17921,1,18.0,13.0,12.0,13.0,100.0,87.0,2017-11-08,2018-12-10,2017-11-01,404
...,...,...,...,...,...,...,...,...,...,...,...,...
24697,disk_20993,1,0.0,41.0,40.0,42.0,100.0,82.0,2017-11-03,2017-11-03,2017-11-01,2
24698,disk_20993,1,0.0,41.0,40.0,42.0,100.0,82.0,2017-11-01,2017-11-03,2017-11-01,2
24699,disk_96363,1,0.0,20.0,19.0,20.0,100.0,87.0,2017-11-01,2017-11-01,2017-11-01,0
24700,disk_96363,1,0.0,21.0,20.0,20.0,100.0,87.0,2017-11-02,2017-11-01,2017-11-01,0


In [15]:
# 读取2017年12月磁盘数据集
# 读取csv文件
smartlog_data_201712_fault_df = pd.read_csv('smartlog_data_201712.csv')
# 删除空值
smartlog_data_201712_fault_df = smartlog_data_201712_fault_df.dropna(axis=1, how='all')
# 将dt的格式转化成时间序列，方便和故障时间进行比较
smartlog_data_201712_fault_df['dt'] = pd.to_datetime(smartlog_data_201712_fault_df['dt'], format='%Y%m%d')
# 将同一个模型的数据归结到一起
smartlog_data_201712_fault_A_df = smartlog_data_201712_fault_df[smartlog_data_201712_fault_df['model'] == 1]
smartlog_data_201712_fault_B_df = smartlog_data_201712_fault_df[smartlog_data_201712_fault_df['model'] == 2]
# 筛选发生故障的磁盘信息
smartlog_data_201712_fault_A_df = smartlog_data_201712_fault_A_df[smartlog_data_201712_fault_A_df['serial_number'].isin(fault_tag_A_df['serial_number'])]
smartlog_data_201712_fault_B_df = smartlog_data_201712_fault_B_df[smartlog_data_201712_fault_B_df['serial_number'].isin(fault_tag_B_df['serial_number'])]
# 混合数据集，方便比较
smartlog_data_201712_fault_A_df = pd.merge(smartlog_data_201712_fault_A_df, fault_tag_A_df, on='serial_number')
smartlog_data_201712_fault_B_df = pd.merge(smartlog_data_201712_fault_B_df, fault_tag_B_df, on='serial_number')
# 计算每个磁盘的最早一次观察时间
smartlog_data_201712_fault_A_df['first_observation_time'] = smartlog_data_201712_fault_A_df.groupby('serial_number')['dt'].transform('min')
smartlog_data_201712_fault_B_df['first_observation_time'] = smartlog_data_201712_fault_B_df.groupby('serial_number')['dt'].transform('min')
# 计算故障时间与最早一次观察时间的差值
smartlog_data_201712_fault_A_df['time_diff'] = (smartlog_data_201712_fault_A_df['fault_time'] - smartlog_data_201712_fault_A_df['first_observation_time']).dt.days
smartlog_data_201712_fault_B_df['time_diff'] = (smartlog_data_201712_fault_B_df['fault_time'] - smartlog_data_201712_fault_B_df['first_observation_time']).dt.days
# 筛选出那些最早的故障时间距离最后一次观察时间大于0天的磁盘的数据
smartlog_data_201712_fault_A_df = smartlog_data_201712_fault_A_df[(smartlog_data_201712_fault_A_df['time_diff'] >= 0) | (smartlog_data_201712_fault_A_df['fault_time'] <= '2017-12-05')]
smartlog_data_201712_fault_B_df = smartlog_data_201712_fault_B_df[(smartlog_data_201712_fault_B_df['time_diff'] >= 0) | (smartlog_data_201712_fault_B_df['fault_time'] <= '2017-12-05')]
# 选取属性进行测试
smartlog_data_201712_fault_A_df = smartlog_data_201712_fault_A_df[['serial_number', 'model', 'smart_5raw', 'smart_4raw', 'smart_192raw', 'smart_12raw', 'smart_5_normalized', 'smart_7_normalized', 'dt', 'fault_time', 'first_observation_time', 'time_diff']]
smartlog_data_201712_fault_B_df = smartlog_data_201712_fault_B_df[['serial_number', 'model', 'smart_9_normalized', 'smart_191_normalized', 'smart_192raw', 'smart_195_normalized', 'smart_5raw', 'smart_199raw', 'smart_190_normalized', 'smart_188raw', 'smart_194_normalized', 'smart_194raw', 'smart_190raw' , 'smart_191raw', 'smart_7_normalized', 'smart_9raw', 'smart_7raw', 'smart_242raw', 'smart_241raw', 'dt', 'fault_time', 'first_observation_time', 'time_diff']]
# 清除内存
del smartlog_data_201712_fault_df
gc.collect()
smartlog_data_201712_fault_A_df

Unnamed: 0,serial_number,model,smart_5raw,smart_4raw,smart_192raw,smart_12raw,smart_5_normalized,smart_7_normalized,dt,fault_time,first_observation_time,time_diff
0,disk_124744,1,0.0,10.0,7.0,8.0,100.0,69.0,2017-12-06,2018-07-06,2017-12-06,212
1,disk_124744,1,0.0,10.0,7.0,8.0,100.0,72.0,2017-12-17,2018-07-06,2017-12-06,212
2,disk_124744,1,0.0,10.0,7.0,8.0,100.0,74.0,2017-12-28,2018-07-06,2017-12-06,212
3,disk_124744,1,0.0,10.0,7.0,8.0,100.0,74.0,2017-12-29,2018-07-06,2017-12-06,212
4,disk_124744,1,0.0,10.0,7.0,8.0,100.0,73.0,2017-12-22,2018-07-06,2017-12-06,212
...,...,...,...,...,...,...,...,...,...,...,...,...
26613,disk_3955,1,186.0,19.0,6.0,19.0,99.0,84.0,2017-12-01,2018-02-12,2017-12-01,73
26614,disk_3955,1,223.0,19.0,6.0,19.0,99.0,84.0,2017-12-04,2018-02-12,2017-12-01,73
26615,disk_3955,1,203.0,19.0,6.0,19.0,99.0,84.0,2017-12-03,2018-02-12,2017-12-01,73
26616,disk_3955,1,192.0,19.0,6.0,19.0,99.0,84.0,2017-12-02,2018-02-12,2017-12-01,73


In [14]:
# 读取2018年01月磁盘数据集
# 读取csv文件
smartlog_data_201801_fault_df = pd.read_csv('smartlog_data_201801.csv')
# 删除空值
smartlog_data_201801_fault_df = smartlog_data_201801_fault_df.dropna(axis=1, how='all')
# 将dt的格式转化成时间序列，方便和故障时间进行比较
smartlog_data_201801_fault_df['dt'] = pd.to_datetime(smartlog_data_201801_fault_df['dt'], format='%Y%m%d')
# 将同一个模型的数据归结到一起
smartlog_data_201801_fault_A_df = smartlog_data_201801_fault_df[smartlog_data_201801_fault_df['model'] == 1]
smartlog_data_201801_fault_B_df = smartlog_data_201801_fault_df[smartlog_data_201801_fault_df['model'] == 2]
# 筛选发生故障的磁盘信息
smartlog_data_201801_fault_A_df = smartlog_data_201801_fault_A_df[smartlog_data_201801_fault_A_df['serial_number'].isin(fault_tag_A_df['serial_number'])]
smartlog_data_201801_fault_B_df = smartlog_data_201801_fault_B_df[smartlog_data_201801_fault_B_df['serial_number'].isin(fault_tag_B_df['serial_number'])]
# 混合数据集，方便比较
smartlog_data_201801_fault_A_df = pd.merge(smartlog_data_201801_fault_A_df, fault_tag_A_df, on='serial_number')
smartlog_data_201801_fault_B_df = pd.merge(smartlog_data_201801_fault_B_df, fault_tag_B_df, on='serial_number')
# 计算每个磁盘的最早一次观察时间
smartlog_data_201801_fault_A_df['first_observation_time'] = smartlog_data_201801_fault_A_df.groupby('serial_number')['dt'].transform('min')
smartlog_data_201801_fault_B_df['first_observation_time'] = smartlog_data_201801_fault_B_df.groupby('serial_number')['dt'].transform('min')
# 计算故障时间与最早一次观察时间的差值
smartlog_data_201801_fault_A_df['time_diff'] = (smartlog_data_201801_fault_A_df['fault_time'] - smartlog_data_201801_fault_A_df['first_observation_time']).dt.days
smartlog_data_201801_fault_B_df['time_diff'] = (smartlog_data_201801_fault_B_df['fault_time'] - smartlog_data_201801_fault_B_df['first_observation_time']).dt.days
# 筛选出那些最早的故障时间距离最后一次观察时间大于0天的磁盘的数据
smartlog_data_201801_fault_A_df = smartlog_data_201801_fault_A_df[(smartlog_data_201801_fault_A_df['time_diff'] >= 0) | (smartlog_data_201801_fault_A_df['fault_time'] <= '2018-01-05')]
smartlog_data_201801_fault_B_df = smartlog_data_201801_fault_B_df[(smartlog_data_201801_fault_B_df['time_diff'] >= 0) | (smartlog_data_201801_fault_B_df['fault_time'] <= '2018-01-05')]
# 选取属性进行测试
smartlog_data_201801_fault_A_df = smartlog_data_201801_fault_A_df[['serial_number', 'model', 'smart_5raw', 'smart_4raw', 'smart_192raw', 'smart_12raw', 'smart_5_normalized', 'smart_7_normalized', 'dt', 'fault_time', 'first_observation_time', 'time_diff']]
smartlog_data_201801_fault_B_df = smartlog_data_201801_fault_B_df[['serial_number', 'model', 'smart_9_normalized', 'smart_191_normalized', 'smart_192raw', 'smart_195_normalized', 'smart_5raw', 'smart_199raw', 'smart_190_normalized', 'smart_188raw', 'smart_194_normalized', 'smart_194raw', 'smart_190raw' , 'smart_191raw', 'smart_7_normalized', 'smart_9raw', 'smart_7raw', 'smart_242raw', 'smart_241raw', 'dt', 'fault_time', 'first_observation_time', 'time_diff']]
# 清除内存
del smartlog_data_201801_fault_df
gc.collect()
smartlog_data_201801_fault_A_df

Unnamed: 0,serial_number,model,smart_5raw,smart_4raw,smart_192raw,smart_12raw,smart_5_normalized,smart_7_normalized,dt,fault_time,first_observation_time,time_diff
0,disk_50434,1,1.0,8.0,7.0,8.0,100.0,93.0,2018-01-29,2018-10-30,2018-01-01,302
1,disk_50434,1,1.0,8.0,7.0,8.0,100.0,93.0,2018-01-02,2018-10-30,2018-01-01,302
2,disk_50434,1,1.0,8.0,7.0,8.0,100.0,93.0,2018-01-20,2018-10-30,2018-01-01,302
3,disk_50434,1,1.0,8.0,7.0,8.0,100.0,93.0,2018-01-17,2018-10-30,2018-01-01,302
4,disk_50434,1,1.0,8.0,7.0,8.0,100.0,93.0,2018-01-01,2018-10-30,2018-01-01,302
...,...,...,...,...,...,...,...,...,...,...,...,...
26825,disk_18624,1,0.0,27.0,26.0,27.0,100.0,86.0,2018-01-05,2018-01-04,2018-01-02,2
26826,disk_18624,1,0.0,16.0,15.0,16.0,100.0,86.0,2018-01-04,2018-01-04,2018-01-02,2
26827,disk_18624,1,0.0,16.0,15.0,16.0,100.0,86.0,2018-01-02,2018-01-04,2018-01-02,2
26828,disk_18624,1,0.0,16.0,15.0,16.0,100.0,86.0,2018-01-03,2018-01-04,2018-01-02,2


In [13]:
# 读取2018年02月磁盘数据集
# 读取csv文件
smartlog_data_201802_fault_df = pd.read_csv('smartlog_data_201802.csv')
# 删除空值
smartlog_data_201802_fault_df = smartlog_data_201802_fault_df.dropna(axis=1, how='all')
# 将dt的格式转化成时间序列，方便和故障时间进行比较
smartlog_data_201802_fault_df['dt'] = pd.to_datetime(smartlog_data_201802_fault_df['dt'], format='%Y%m%d')
# 将同一个模型的数据归结到一起
smartlog_data_201802_fault_A_df = smartlog_data_201802_fault_df[smartlog_data_201802_fault_df['model'] == 1]
smartlog_data_201802_fault_B_df = smartlog_data_201802_fault_df[smartlog_data_201802_fault_df['model'] == 2]
# 筛选发生故障的磁盘信息
smartlog_data_201802_fault_A_df = smartlog_data_201802_fault_A_df[smartlog_data_201802_fault_A_df['serial_number'].isin(fault_tag_A_df['serial_number'])]
smartlog_data_201802_fault_B_df = smartlog_data_201802_fault_B_df[smartlog_data_201802_fault_B_df['serial_number'].isin(fault_tag_B_df['serial_number'])]
# 混合数据集，方便比较
smartlog_data_201802_fault_A_df = pd.merge(smartlog_data_201802_fault_A_df, fault_tag_A_df, on='serial_number')
smartlog_data_201802_fault_B_df = pd.merge(smartlog_data_201802_fault_B_df, fault_tag_B_df, on='serial_number')
# 计算每个磁盘的最早一次观察时间
smartlog_data_201802_fault_A_df['first_observation_time'] = smartlog_data_201802_fault_A_df.groupby('serial_number')['dt'].transform('min')
smartlog_data_201802_fault_B_df['first_observation_time'] = smartlog_data_201802_fault_B_df.groupby('serial_number')['dt'].transform('min')
# 计算故障时间与最早一次观察时间的差值
smartlog_data_201802_fault_A_df['time_diff'] = (smartlog_data_201802_fault_A_df['fault_time'] - smartlog_data_201802_fault_A_df['first_observation_time']).dt.days
smartlog_data_201802_fault_B_df['time_diff'] = (smartlog_data_201802_fault_B_df['fault_time'] - smartlog_data_201802_fault_B_df['first_observation_time']).dt.days
# 筛选出那些最早的故障时间距离最后一次观察时间大于0天的磁盘的数据
smartlog_data_201802_fault_A_df = smartlog_data_201802_fault_A_df[(smartlog_data_201802_fault_A_df['time_diff'] >= 0) | (smartlog_data_201802_fault_A_df['fault_time'] <= '2018-02-05')]
smartlog_data_201802_fault_B_df = smartlog_data_201802_fault_B_df[(smartlog_data_201802_fault_B_df['time_diff'] >= 0) | (smartlog_data_201802_fault_B_df['fault_time'] <= '2018-02-05')]
# 选取属性进行测试
smartlog_data_201802_fault_A_df = smartlog_data_201802_fault_A_df[['serial_number', 'model', 'smart_5raw', 'smart_4raw', 'smart_192raw', 'smart_12raw', 'smart_5_normalized', 'smart_7_normalized', 'dt', 'fault_time', 'first_observation_time', 'time_diff']]
smartlog_data_201802_fault_B_df = smartlog_data_201802_fault_B_df[['serial_number', 'model', 'smart_9_normalized', 'smart_191_normalized', 'smart_192raw', 'smart_195_normalized', 'smart_5raw', 'smart_199raw', 'smart_190_normalized', 'smart_188raw', 'smart_194_normalized', 'smart_194raw', 'smart_190raw' , 'smart_191raw', 'smart_7_normalized', 'smart_9raw', 'smart_7raw', 'smart_242raw', 'smart_241raw', 'dt', 'fault_time', 'first_observation_time', 'time_diff']]
# 清除内存
del smartlog_data_201802_fault_df
gc.collect()
smartlog_data_201802_fault_A_df

Unnamed: 0,serial_number,model,smart_5raw,smart_4raw,smart_192raw,smart_12raw,smart_5_normalized,smart_7_normalized,dt,fault_time,first_observation_time,time_diff
0,disk_25828,1,0.0,9.0,8.0,9.0,100.0,86.0,2018-02-07,2018-05-31,2018-02-01,119
1,disk_25828,1,0.0,9.0,8.0,9.0,100.0,86.0,2018-02-13,2018-05-31,2018-02-01,119
2,disk_25828,1,0.0,9.0,8.0,9.0,100.0,86.0,2018-02-10,2018-05-31,2018-02-01,119
3,disk_25828,1,0.0,9.0,8.0,9.0,100.0,86.0,2018-02-04,2018-05-31,2018-02-01,119
4,disk_25828,1,0.0,9.0,8.0,9.0,100.0,86.0,2018-02-17,2018-05-31,2018-02-01,119
...,...,...,...,...,...,...,...,...,...,...,...,...
22831,disk_5066,1,0.0,26.0,22.0,26.0,100.0,87.0,2018-02-01,2018-05-04,2018-02-01,92
22832,disk_5066,1,0.0,26.0,22.0,26.0,100.0,87.0,2018-02-02,2018-05-04,2018-02-01,92
22833,disk_5066,1,0.0,26.0,22.0,26.0,100.0,87.0,2018-02-04,2018-05-04,2018-02-01,92
22834,disk_32617,1,164.0,90.0,85.0,89.0,99.0,93.0,2018-02-01,2018-02-01,2018-02-01,0


In [12]:
# 读取2018年03月磁盘数据集
# 读取csv文件
smartlog_data_201803_fault_df = pd.read_csv('smartlog_data_201803.csv')
# 删除空值
smartlog_data_201803_fault_df = smartlog_data_201803_fault_df.dropna(axis=1, how='all')
# 将dt的格式转化成时间序列，方便和故障时间进行比较
smartlog_data_201803_fault_df['dt'] = pd.to_datetime(smartlog_data_201803_fault_df['dt'], format='%Y%m%d')
# 将同一个模型的数据归结到一起
smartlog_data_201803_fault_A_df = smartlog_data_201803_fault_df[smartlog_data_201803_fault_df['model'] == 1]
smartlog_data_201803_fault_B_df = smartlog_data_201803_fault_df[smartlog_data_201803_fault_df['model'] == 2]
# 筛选发生故障的磁盘信息
smartlog_data_201803_fault_A_df = smartlog_data_201803_fault_A_df[smartlog_data_201803_fault_A_df['serial_number'].isin(fault_tag_A_df['serial_number'])]
smartlog_data_201803_fault_B_df = smartlog_data_201803_fault_B_df[smartlog_data_201803_fault_B_df['serial_number'].isin(fault_tag_B_df['serial_number'])]
# 混合数据集，方便比较
smartlog_data_201803_fault_A_df = pd.merge(smartlog_data_201803_fault_A_df, fault_tag_A_df, on='serial_number')
smartlog_data_201803_fault_B_df = pd.merge(smartlog_data_201803_fault_B_df, fault_tag_B_df, on='serial_number')
# 计算每个磁盘的最早一次观察时间
smartlog_data_201803_fault_A_df['first_observation_time'] = smartlog_data_201803_fault_A_df.groupby('serial_number')['dt'].transform('min')
smartlog_data_201803_fault_B_df['first_observation_time'] = smartlog_data_201803_fault_B_df.groupby('serial_number')['dt'].transform('min')
# 计算故障时间与最早一次观察时间的差值
smartlog_data_201803_fault_A_df['time_diff'] = (smartlog_data_201803_fault_A_df['fault_time'] - smartlog_data_201803_fault_A_df['first_observation_time']).dt.days
smartlog_data_201803_fault_B_df['time_diff'] = (smartlog_data_201803_fault_B_df['fault_time'] - smartlog_data_201803_fault_B_df['first_observation_time']).dt.days
# 筛选出那些最早的故障时间距离最后一次观察时间大于0天的磁盘的数据
smartlog_data_201803_fault_A_df = smartlog_data_201803_fault_A_df[(smartlog_data_201803_fault_A_df['time_diff'] >= 0) | (smartlog_data_201803_fault_A_df['fault_time'] <= '2018-03-05')]
smartlog_data_201803_fault_B_df = smartlog_data_201803_fault_B_df[(smartlog_data_201803_fault_B_df['time_diff'] >= 0) | (smartlog_data_201803_fault_B_df['fault_time'] <= '2018-03-05')]
# 选取属性进行测试
smartlog_data_201803_fault_A_df = smartlog_data_201803_fault_A_df[['serial_number', 'model', 'smart_5raw', 'smart_4raw', 'smart_192raw', 'smart_12raw', 'smart_5_normalized', 'smart_7_normalized', 'dt', 'fault_time', 'first_observation_time', 'time_diff']]
smartlog_data_201803_fault_B_df = smartlog_data_201803_fault_B_df[['serial_number', 'model', 'smart_9_normalized', 'smart_191_normalized', 'smart_192raw', 'smart_195_normalized', 'smart_5raw', 'smart_199raw', 'smart_190_normalized', 'smart_188raw', 'smart_194_normalized', 'smart_194raw', 'smart_190raw' , 'smart_191raw', 'smart_7_normalized', 'smart_9raw', 'smart_7raw', 'smart_242raw', 'smart_241raw', 'dt', 'fault_time', 'first_observation_time', 'time_diff']]
# 清除内存
del smartlog_data_201803_fault_df
gc.collect()
smartlog_data_201803_fault_A_df

Unnamed: 0,serial_number,model,smart_5raw,smart_4raw,smart_192raw,smart_12raw,smart_5_normalized,smart_7_normalized,dt,fault_time,first_observation_time,time_diff
0,disk_131238,1,0.0,7.0,6.0,7.0,100.0,89.0,2018-03-09,2018-08-13,2018-03-01,165
1,disk_131238,1,0.0,7.0,6.0,7.0,100.0,89.0,2018-03-18,2018-08-13,2018-03-01,165
2,disk_131238,1,0.0,7.0,6.0,7.0,100.0,89.0,2018-03-13,2018-08-13,2018-03-01,165
3,disk_131238,1,0.0,7.0,6.0,7.0,100.0,89.0,2018-03-31,2018-08-13,2018-03-01,165
4,disk_131238,1,0.0,7.0,6.0,7.0,100.0,89.0,2018-03-14,2018-08-13,2018-03-01,165
...,...,...,...,...,...,...,...,...,...,...,...,...
23507,disk_127609,1,0.0,24.0,22.0,23.0,100.0,87.0,2018-03-05,2018-03-06,2018-03-01,5
23508,disk_13671,1,0.0,25.0,20.0,25.0,100.0,84.0,2018-03-01,2018-11-17,2018-03-01,261
23509,disk_119072,1,61.0,15.0,14.0,15.0,100.0,95.0,2018-03-02,2018-03-02,2018-03-01,1
23510,disk_119072,1,52.0,15.0,14.0,15.0,100.0,95.0,2018-03-01,2018-03-02,2018-03-01,1


In [11]:
# 读取2018年04月磁盘数据集
# 读取csv文件
smartlog_data_201804_fault_df = pd.read_csv('smartlog_data_201804.csv')
# 删除空值
smartlog_data_201804_fault_df = smartlog_data_201804_fault_df.dropna(axis=1, how='all')
# 将dt的格式转化成时间序列，方便和故障时间进行比较
smartlog_data_201804_fault_df['dt'] = pd.to_datetime(smartlog_data_201804_fault_df['dt'], format='%Y%m%d')
# 将同一个模型的数据归结到一起
smartlog_data_201804_fault_A_df = smartlog_data_201804_fault_df[smartlog_data_201804_fault_df['model'] == 1]
smartlog_data_201804_fault_B_df = smartlog_data_201804_fault_df[smartlog_data_201804_fault_df['model'] == 2]
# 筛选发生故障的磁盘信息
smartlog_data_201804_fault_A_df = smartlog_data_201804_fault_A_df[smartlog_data_201804_fault_A_df['serial_number'].isin(fault_tag_A_df['serial_number'])]
smartlog_data_201804_fault_B_df = smartlog_data_201804_fault_B_df[smartlog_data_201804_fault_B_df['serial_number'].isin(fault_tag_B_df['serial_number'])]
# 混合数据集，方便比较
smartlog_data_201804_fault_A_df = pd.merge(smartlog_data_201804_fault_A_df, fault_tag_A_df, on='serial_number')
smartlog_data_201804_fault_B_df = pd.merge(smartlog_data_201804_fault_B_df, fault_tag_B_df, on='serial_number')
# 计算每个磁盘的最早一次观察时间
smartlog_data_201804_fault_A_df['first_observation_time'] = smartlog_data_201804_fault_A_df.groupby('serial_number')['dt'].transform('min')
smartlog_data_201804_fault_B_df['first_observation_time'] = smartlog_data_201804_fault_B_df.groupby('serial_number')['dt'].transform('min')
# 计算故障时间与最早一次观察时间的差值
smartlog_data_201804_fault_A_df['time_diff'] = (smartlog_data_201804_fault_A_df['fault_time'] - smartlog_data_201804_fault_A_df['first_observation_time']).dt.days
smartlog_data_201804_fault_B_df['time_diff'] = (smartlog_data_201804_fault_B_df['fault_time'] - smartlog_data_201804_fault_B_df['first_observation_time']).dt.days
# 筛选出那些最早的故障时间距离最后一次观察时间大于0天的磁盘的数据
smartlog_data_201804_fault_A_df = smartlog_data_201804_fault_A_df[(smartlog_data_201804_fault_A_df['time_diff'] >= 0) | (smartlog_data_201804_fault_A_df['fault_time'] <= '2018-04-05')]
smartlog_data_201804_fault_B_df = smartlog_data_201804_fault_B_df[(smartlog_data_201804_fault_B_df['time_diff'] >= 0) | (smartlog_data_201804_fault_B_df['fault_time'] <= '2018-04-05')]
# 选取属性进行测试
smartlog_data_201804_fault_A_df = smartlog_data_201804_fault_A_df[['serial_number', 'model', 'smart_5raw', 'smart_4raw', 'smart_192raw', 'smart_12raw', 'smart_5_normalized', 'smart_7_normalized', 'dt', 'fault_time', 'first_observation_time', 'time_diff']]
smartlog_data_201804_fault_B_df = smartlog_data_201804_fault_B_df[['serial_number', 'model', 'smart_9_normalized', 'smart_191_normalized', 'smart_192raw', 'smart_195_normalized', 'smart_5raw', 'smart_199raw', 'smart_190_normalized', 'smart_188raw', 'smart_194_normalized', 'smart_194raw', 'smart_190raw' , 'smart_191raw', 'smart_7_normalized', 'smart_9raw', 'smart_7raw', 'smart_242raw', 'smart_241raw', 'dt', 'fault_time', 'first_observation_time', 'time_diff']]
# 清除内存
del smartlog_data_201804_fault_df
gc.collect()
smartlog_data_201804_fault_A_df

Unnamed: 0,serial_number,model,smart_5raw,smart_4raw,smart_192raw,smart_12raw,smart_5_normalized,smart_7_normalized,dt,fault_time,first_observation_time,time_diff
0,disk_115590,1,0.0,23.0,20.0,21.0,100.0,93.0,2018-04-16,2018-10-16,2018-04-01,198
1,disk_115590,1,0.0,23.0,20.0,21.0,100.0,93.0,2018-04-11,2018-10-16,2018-04-01,198
2,disk_115590,1,0.0,23.0,20.0,21.0,100.0,93.0,2018-04-27,2018-10-16,2018-04-01,198
3,disk_115590,1,0.0,23.0,20.0,21.0,100.0,93.0,2018-04-09,2018-10-16,2018-04-01,198
4,disk_115590,1,0.0,23.0,20.0,21.0,100.0,93.0,2018-04-28,2018-10-16,2018-04-01,198
...,...,...,...,...,...,...,...,...,...,...,...,...
20813,disk_68722,1,16.0,13.0,12.0,13.0,100.0,95.0,2018-04-02,2018-04-03,2018-04-01,2
20814,disk_23869,1,3308.0,15.0,1.0,1.0,80.0,81.0,2018-04-01,2018-03-31,2018-04-01,-1
20815,disk_80868,1,87.0,19.0,18.0,19.0,100.0,88.0,2018-04-29,2018-06-28,2018-04-28,61
20816,disk_80868,1,87.0,19.0,18.0,19.0,100.0,88.0,2018-04-28,2018-06-28,2018-04-28,61


In [10]:
# 读取2018年05月磁盘数据集
# 读取csv文件
smartlog_data_201805_fault_df = pd.read_csv('smartlog_data_201805.csv')
# 删除空值
smartlog_data_201805_fault_df = smartlog_data_201805_fault_df.dropna(axis=1, how='all')
# 将dt的格式转化成时间序列，方便和故障时间进行比较
smartlog_data_201805_fault_df['dt'] = pd.to_datetime(smartlog_data_201805_fault_df['dt'], format='%Y%m%d')
# 将同一个模型的数据归结到一起
smartlog_data_201805_fault_A_df = smartlog_data_201805_fault_df[smartlog_data_201805_fault_df['model'] == 1]
smartlog_data_201805_fault_B_df = smartlog_data_201805_fault_df[smartlog_data_201805_fault_df['model'] == 2]
# 筛选发生故障的磁盘信息
smartlog_data_201805_fault_A_df = smartlog_data_201805_fault_A_df[smartlog_data_201805_fault_A_df['serial_number'].isin(fault_tag_A_df['serial_number'])]
smartlog_data_201805_fault_B_df = smartlog_data_201805_fault_B_df[smartlog_data_201805_fault_B_df['serial_number'].isin(fault_tag_B_df['serial_number'])]
# 混合数据集，方便比较
smartlog_data_201805_fault_A_df = pd.merge(smartlog_data_201805_fault_A_df, fault_tag_A_df, on='serial_number')
smartlog_data_201805_fault_B_df = pd.merge(smartlog_data_201805_fault_B_df, fault_tag_B_df, on='serial_number')
# 计算每个磁盘的最早一次观察时间
smartlog_data_201805_fault_A_df['first_observation_time'] = smartlog_data_201805_fault_A_df.groupby('serial_number')['dt'].transform('min')
smartlog_data_201805_fault_B_df['first_observation_time'] = smartlog_data_201805_fault_B_df.groupby('serial_number')['dt'].transform('min')
# 计算故障时间与最早一次观察时间的差值
smartlog_data_201805_fault_A_df['time_diff'] = (smartlog_data_201805_fault_A_df['fault_time'] - smartlog_data_201805_fault_A_df['first_observation_time']).dt.days
smartlog_data_201805_fault_B_df['time_diff'] = (smartlog_data_201805_fault_B_df['fault_time'] - smartlog_data_201805_fault_B_df['first_observation_time']).dt.days
# 筛选出那些最早的故障时间距离最后一次观察时间大于0天的磁盘的数据
smartlog_data_201805_fault_A_df = smartlog_data_201805_fault_A_df[(smartlog_data_201805_fault_A_df['time_diff'] >= 0) | (smartlog_data_201805_fault_A_df['fault_time'] <= '2018-05-05')]
smartlog_data_201805_fault_B_df = smartlog_data_201805_fault_B_df[(smartlog_data_201805_fault_B_df['time_diff'] >= 0) | (smartlog_data_201805_fault_B_df['fault_time'] <= '2018-05-05')]
# 选取属性进行测试
smartlog_data_201805_fault_A_df = smartlog_data_201805_fault_A_df[['serial_number', 'model', 'smart_5raw', 'smart_4raw', 'smart_192raw', 'smart_12raw', 'smart_5_normalized', 'smart_7_normalized', 'dt', 'fault_time', 'first_observation_time', 'time_diff']]
smartlog_data_201805_fault_B_df = smartlog_data_201805_fault_B_df[['serial_number', 'model', 'smart_9_normalized', 'smart_191_normalized', 'smart_192raw', 'smart_195_normalized', 'smart_5raw', 'smart_199raw', 'smart_190_normalized', 'smart_188raw', 'smart_194_normalized', 'smart_194raw', 'smart_190raw' , 'smart_191raw', 'smart_7_normalized', 'smart_9raw', 'smart_7raw', 'smart_242raw', 'smart_241raw', 'dt', 'fault_time', 'first_observation_time', 'time_diff']]
# 清除内存
del smartlog_data_201805_fault_df
gc.collect()
smartlog_data_201805_fault_A_df

Unnamed: 0,serial_number,model,smart_5raw,smart_4raw,smart_192raw,smart_12raw,smart_5_normalized,smart_7_normalized,dt,fault_time,first_observation_time,time_diff
0,disk_99865,1,0.0,19.0,18.0,19.0,100.0,90.0,2018-05-09,2018-11-24,2018-05-01,207
1,disk_99865,1,0.0,19.0,18.0,19.0,100.0,90.0,2018-05-20,2018-11-24,2018-05-01,207
2,disk_99865,1,0.0,19.0,18.0,19.0,100.0,90.0,2018-05-29,2018-11-24,2018-05-01,207
3,disk_99865,1,0.0,19.0,18.0,19.0,100.0,90.0,2018-05-08,2018-11-24,2018-05-01,207
4,disk_99865,1,0.0,19.0,18.0,19.0,100.0,90.0,2018-05-25,2018-11-24,2018-05-01,207
...,...,...,...,...,...,...,...,...,...,...,...,...
19865,disk_123239,1,0.0,21.0,18.0,19.0,100.0,94.0,2018-05-12,2018-05-11,2018-05-01,10
19866,disk_123239,1,0.0,21.0,18.0,19.0,100.0,94.0,2018-05-01,2018-05-11,2018-05-01,10
19867,disk_123239,1,0.0,21.0,18.0,19.0,100.0,94.0,2018-05-09,2018-05-11,2018-05-01,10
19868,disk_123239,1,0.0,21.0,18.0,19.0,100.0,94.0,2018-05-06,2018-05-11,2018-05-01,10


In [9]:
# 读取2018年06月磁盘数据集
# 读取csv文件
smartlog_data_201806_fault_df = pd.read_csv('smartlog_data_201806.csv')
# 删除空值
smartlog_data_201806_fault_df = smartlog_data_201806_fault_df.dropna(axis=1, how='all')
# 将dt的格式转化成时间序列，方便和故障时间进行比较
smartlog_data_201806_fault_df['dt'] = pd.to_datetime(smartlog_data_201806_fault_df['dt'], format='%Y%m%d')
# 将同一个模型的数据归结到一起
smartlog_data_201806_fault_A_df = smartlog_data_201806_fault_df[smartlog_data_201806_fault_df['model'] == 1]
smartlog_data_201806_fault_B_df = smartlog_data_201806_fault_df[smartlog_data_201806_fault_df['model'] == 2]
# 筛选发生故障的磁盘信息
smartlog_data_201806_fault_A_df = smartlog_data_201806_fault_A_df[smartlog_data_201806_fault_A_df['serial_number'].isin(fault_tag_A_df['serial_number'])]
smartlog_data_201806_fault_B_df = smartlog_data_201806_fault_B_df[smartlog_data_201806_fault_B_df['serial_number'].isin(fault_tag_B_df['serial_number'])]
# 混合数据集，方便比较
smartlog_data_201806_fault_A_df = pd.merge(smartlog_data_201806_fault_A_df, fault_tag_A_df, on='serial_number')
smartlog_data_201806_fault_B_df = pd.merge(smartlog_data_201806_fault_B_df, fault_tag_B_df, on='serial_number')
# 计算每个磁盘的最早一次观察时间
smartlog_data_201806_fault_A_df['first_observation_time'] = smartlog_data_201806_fault_A_df.groupby('serial_number')['dt'].transform('min')
smartlog_data_201806_fault_B_df['first_observation_time'] = smartlog_data_201806_fault_B_df.groupby('serial_number')['dt'].transform('min')
# 计算故障时间与最早一次观察时间的差值
smartlog_data_201806_fault_A_df['time_diff'] = (smartlog_data_201806_fault_A_df['fault_time'] - smartlog_data_201806_fault_A_df['first_observation_time']).dt.days
smartlog_data_201806_fault_B_df['time_diff'] = (smartlog_data_201806_fault_B_df['fault_time'] - smartlog_data_201806_fault_B_df['first_observation_time']).dt.days
# 筛选出那些最早的故障时间距离最后一次观察时间大于0天的磁盘的数据
smartlog_data_201806_fault_A_df = smartlog_data_201806_fault_A_df[(smartlog_data_201806_fault_A_df['time_diff'] >= 0) | (smartlog_data_201806_fault_A_df['fault_time'] <= '2018-06-05')]
smartlog_data_201806_fault_B_df = smartlog_data_201806_fault_B_df[(smartlog_data_201806_fault_B_df['time_diff'] >= 0) | (smartlog_data_201806_fault_B_df['fault_time'] <= '2018-06-05')]
# 选取属性进行测试
smartlog_data_201806_fault_A_df = smartlog_data_201806_fault_A_df[['serial_number', 'model', 'smart_5raw', 'smart_4raw', 'smart_192raw', 'smart_12raw', 'smart_5_normalized', 'smart_7_normalized', 'dt', 'fault_time', 'first_observation_time', 'time_diff']]
smartlog_data_201806_fault_B_df = smartlog_data_201806_fault_B_df[['serial_number', 'model', 'smart_9_normalized', 'smart_191_normalized', 'smart_192raw', 'smart_195_normalized', 'smart_5raw', 'smart_199raw', 'smart_190_normalized', 'smart_188raw', 'smart_194_normalized', 'smart_194raw', 'smart_190raw' , 'smart_191raw', 'smart_7_normalized', 'smart_9raw', 'smart_7raw', 'smart_242raw', 'smart_241raw', 'dt', 'fault_time', 'first_observation_time', 'time_diff']]
# 清除内存
del smartlog_data_201806_fault_df
gc.collect()
smartlog_data_201806_fault_A_df

Unnamed: 0,serial_number,model,smart_5raw,smart_4raw,smart_192raw,smart_12raw,smart_5_normalized,smart_7_normalized,dt,fault_time,first_observation_time,time_diff
0,disk_58278,1,0.0,11.0,10.0,11.0,100.0,93.0,2018-06-22,2018-12-30,2018-06-01,212
1,disk_58278,1,0.0,11.0,10.0,11.0,100.0,93.0,2018-06-29,2018-12-30,2018-06-01,212
2,disk_58278,1,0.0,11.0,10.0,11.0,100.0,93.0,2018-06-03,2018-12-30,2018-06-01,212
3,disk_58278,1,0.0,11.0,10.0,11.0,100.0,93.0,2018-06-19,2018-12-30,2018-06-01,212
4,disk_58278,1,0.0,11.0,10.0,11.0,100.0,93.0,2018-06-28,2018-12-30,2018-06-01,212
...,...,...,...,...,...,...,...,...,...,...,...,...
17727,disk_76330,1,0.0,18.0,16.0,16.0,100.0,86.0,2018-06-03,2018-06-05,2018-06-01,4
17728,disk_76330,1,0.0,18.0,16.0,16.0,100.0,86.0,2018-06-04,2018-06-05,2018-06-01,4
17729,disk_76330,1,0.0,18.0,16.0,16.0,100.0,86.0,2018-06-01,2018-06-05,2018-06-01,4
17730,disk_43071,1,41.0,7.0,6.0,7.0,100.0,93.0,2018-06-01,2018-05-31,2018-06-01,-1


In [8]:
# 读取2018年07月磁盘数据集
# 读取csv文件
smartlog_data_201807_fault_df = pd.read_csv('smartlog_data_201807.csv')
# 删除空值
smartlog_data_201807_fault_df = smartlog_data_201807_fault_df.dropna(axis=1, how='all')
# 将dt的格式转化成时间序列，方便和故障时间进行比较
smartlog_data_201807_fault_df['dt'] = pd.to_datetime(smartlog_data_201807_fault_df['dt'], format='%Y%m%d')
# 将同一个模型的数据归结到一起
smartlog_data_201807_fault_A_df = smartlog_data_201807_fault_df[smartlog_data_201807_fault_df['model'] == 1]
smartlog_data_201807_fault_B_df = smartlog_data_201807_fault_df[smartlog_data_201807_fault_df['model'] == 2]
# 筛选发生故障的磁盘信息
smartlog_data_201807_fault_A_df = smartlog_data_201807_fault_A_df[smartlog_data_201807_fault_A_df['serial_number'].isin(fault_tag_A_df['serial_number'])]
smartlog_data_201807_fault_B_df = smartlog_data_201807_fault_B_df[smartlog_data_201807_fault_B_df['serial_number'].isin(fault_tag_B_df['serial_number'])]
# 混合数据集，方便比较
smartlog_data_201807_fault_A_df = pd.merge(smartlog_data_201807_fault_A_df, fault_tag_A_df, on='serial_number')
smartlog_data_201807_fault_B_df = pd.merge(smartlog_data_201807_fault_B_df, fault_tag_B_df, on='serial_number')
# 计算每个磁盘的最早一次观察时间
smartlog_data_201807_fault_A_df['first_observation_time'] = smartlog_data_201807_fault_A_df.groupby('serial_number')['dt'].transform('min')
smartlog_data_201807_fault_B_df['first_observation_time'] = smartlog_data_201807_fault_B_df.groupby('serial_number')['dt'].transform('min')
# 计算故障时间与最早一次观察时间的差值
smartlog_data_201807_fault_A_df['time_diff'] = (smartlog_data_201807_fault_A_df['fault_time'] - smartlog_data_201807_fault_A_df['first_observation_time']).dt.days
smartlog_data_201807_fault_B_df['time_diff'] = (smartlog_data_201807_fault_B_df['fault_time'] - smartlog_data_201807_fault_B_df['first_observation_time']).dt.days
# 筛选出那些最早的故障时间距离最后一次观察时间大于0天的磁盘的数据
smartlog_data_201807_fault_A_df = smartlog_data_201807_fault_A_df[(smartlog_data_201807_fault_A_df['time_diff'] >= 0) | (smartlog_data_201807_fault_A_df['fault_time'] <= '2018-07-05')]
smartlog_data_201807_fault_B_df = smartlog_data_201807_fault_B_df[(smartlog_data_201807_fault_B_df['time_diff'] >= 0) | (smartlog_data_201807_fault_B_df['fault_time'] <= '2018-07-05')]
# 选取属性进行测试
smartlog_data_201807_fault_A_df = smartlog_data_201807_fault_A_df[['serial_number', 'model', 'smart_5raw', 'smart_4raw', 'smart_192raw', 'smart_12raw', 'smart_5_normalized', 'smart_7_normalized', 'dt', 'fault_time', 'first_observation_time', 'time_diff']]
smartlog_data_201807_fault_B_df = smartlog_data_201807_fault_B_df[['serial_number', 'model', 'smart_9_normalized', 'smart_191_normalized', 'smart_192raw', 'smart_195_normalized', 'smart_5raw', 'smart_199raw', 'smart_190_normalized', 'smart_188raw', 'smart_194_normalized', 'smart_194raw', 'smart_190raw' , 'smart_191raw', 'smart_7_normalized', 'smart_9raw', 'smart_7raw', 'smart_242raw', 'smart_241raw', 'dt', 'fault_time', 'first_observation_time', 'time_diff']]
# 清除内存
del smartlog_data_201807_fault_df
gc.collect()
smartlog_data_201807_fault_A_df

Unnamed: 0,serial_number,model,smart_5raw,smart_4raw,smart_192raw,smart_12raw,smart_5_normalized,smart_7_normalized,dt,fault_time,first_observation_time,time_diff
0,disk_105541,1,0.0,12.0,11.0,12.0,100.0,88.0,2018-07-04,2018-11-18,2018-07-01,140
1,disk_105541,1,0.0,12.0,11.0,12.0,100.0,88.0,2018-07-14,2018-11-18,2018-07-01,140
2,disk_105541,1,0.0,12.0,11.0,12.0,100.0,88.0,2018-07-05,2018-11-18,2018-07-01,140
3,disk_105541,1,0.0,12.0,11.0,12.0,100.0,88.0,2018-07-03,2018-11-18,2018-07-01,140
4,disk_105541,1,0.0,12.0,11.0,12.0,100.0,88.0,2018-07-07,2018-11-18,2018-07-01,140
...,...,...,...,...,...,...,...,...,...,...,...,...
15704,disk_5200,1,0.0,26.0,23.0,26.0,100.0,89.0,2018-07-03,2018-07-05,2018-07-01,4
15705,disk_5200,1,0.0,26.0,23.0,26.0,100.0,89.0,2018-07-02,2018-07-05,2018-07-01,4
15706,disk_21558,1,0.0,20.0,19.0,20.0,100.0,87.0,2018-07-01,2018-06-30,2018-07-01,-1
15707,disk_67177,1,16074.0,14.0,13.0,14.0,2.0,90.0,2018-07-02,2018-07-02,2018-07-01,1


In [7]:
# 读取2018年08月磁盘数据集
# 读取csv文件
smartlog_data_201808_fault_df = pd.read_csv('smartlog_data_201808.csv')
# 删除空值
smartlog_data_201808_fault_df = smartlog_data_201808_fault_df.dropna(axis=1, how='all')
# 将dt的格式转化成时间序列，方便和故障时间进行比较
smartlog_data_201808_fault_df['dt'] = pd.to_datetime(smartlog_data_201808_fault_df['dt'], format='%Y%m%d')
# 将同一个模型的数据归结到一起
smartlog_data_201808_fault_A_df = smartlog_data_201808_fault_df[smartlog_data_201808_fault_df['model'] == 1]
smartlog_data_201808_fault_B_df = smartlog_data_201808_fault_df[smartlog_data_201808_fault_df['model'] == 2]
# 筛选发生故障的磁盘信息
smartlog_data_201808_fault_A_df = smartlog_data_201808_fault_A_df[smartlog_data_201808_fault_A_df['serial_number'].isin(fault_tag_A_df['serial_number'])]
smartlog_data_201808_fault_B_df = smartlog_data_201808_fault_B_df[smartlog_data_201808_fault_B_df['serial_number'].isin(fault_tag_B_df['serial_number'])]
# 混合数据集，方便比较
smartlog_data_201808_fault_A_df = pd.merge(smartlog_data_201808_fault_A_df, fault_tag_A_df, on='serial_number')
smartlog_data_201808_fault_B_df = pd.merge(smartlog_data_201808_fault_B_df, fault_tag_B_df, on='serial_number')
# 计算每个磁盘的最早一次观察时间
smartlog_data_201808_fault_A_df['first_observation_time'] = smartlog_data_201808_fault_A_df.groupby('serial_number')['dt'].transform('min')
smartlog_data_201808_fault_B_df['first_observation_time'] = smartlog_data_201808_fault_B_df.groupby('serial_number')['dt'].transform('min')
# 计算故障时间与最早一次观察时间的差值
smartlog_data_201808_fault_A_df['time_diff'] = (smartlog_data_201808_fault_A_df['fault_time'] - smartlog_data_201808_fault_A_df['first_observation_time']).dt.days
smartlog_data_201808_fault_B_df['time_diff'] = (smartlog_data_201808_fault_B_df['fault_time'] - smartlog_data_201808_fault_B_df['first_observation_time']).dt.days
# 筛选出那些最早的故障时间距离最后一次观察时间大于0天的磁盘的数据
smartlog_data_201808_fault_A_df = smartlog_data_201808_fault_A_df[(smartlog_data_201808_fault_A_df['time_diff'] >= 0) | (smartlog_data_201808_fault_A_df['fault_time'] <= '2018-08-05')]
smartlog_data_201808_fault_B_df = smartlog_data_201808_fault_B_df[(smartlog_data_201808_fault_B_df['time_diff'] >= 0) | (smartlog_data_201808_fault_B_df['fault_time'] <= '2018-08-05')]
# 选取属性进行测试
smartlog_data_201808_fault_A_df = smartlog_data_201808_fault_A_df[['serial_number', 'model', 'smart_5raw', 'smart_4raw', 'smart_192raw', 'smart_12raw', 'smart_5_normalized', 'smart_7_normalized', 'dt', 'fault_time', 'first_observation_time', 'time_diff']]
smartlog_data_201808_fault_B_df = smartlog_data_201808_fault_B_df[['serial_number', 'model', 'smart_9_normalized', 'smart_191_normalized', 'smart_192raw', 'smart_195_normalized', 'smart_5raw', 'smart_199raw', 'smart_190_normalized', 'smart_188raw', 'smart_194_normalized', 'smart_194raw', 'smart_190raw' , 'smart_191raw', 'smart_7_normalized', 'smart_9raw', 'smart_7raw', 'smart_242raw', 'smart_241raw', 'dt', 'fault_time', 'first_observation_time', 'time_diff']]
# 清除内存
del smartlog_data_201808_fault_df
gc.collect()
smartlog_data_201808_fault_A_df

Unnamed: 0,serial_number,model,smart_5raw,smart_4raw,smart_192raw,smart_12raw,smart_5_normalized,smart_7_normalized,dt,fault_time,first_observation_time,time_diff
0,disk_120668,1,0.0,114.0,104.0,106.0,100.0,93.0,2018-08-15,2018-09-05,2018-08-01,35
1,disk_120668,1,0.0,114.0,104.0,106.0,100.0,93.0,2018-08-28,2018-09-05,2018-08-01,35
2,disk_120668,1,0.0,114.0,104.0,106.0,100.0,93.0,2018-08-27,2018-09-05,2018-08-01,35
3,disk_120668,1,0.0,114.0,104.0,106.0,100.0,93.0,2018-08-12,2018-09-05,2018-08-01,35
4,disk_120668,1,0.0,114.0,104.0,106.0,100.0,93.0,2018-08-03,2018-09-05,2018-08-01,35
...,...,...,...,...,...,...,...,...,...,...,...,...
13602,disk_33031,1,1.0,13.0,12.0,13.0,100.0,94.0,2018-08-01,2018-08-01,2018-08-01,0
13603,disk_33031,1,1.0,13.0,12.0,13.0,100.0,94.0,2018-08-01,2018-08-01,2018-08-01,0
13604,disk_33031,1,1.0,13.0,12.0,13.0,100.0,94.0,2018-08-02,2018-08-01,2018-08-01,0
13605,disk_33031,1,1.0,13.0,12.0,13.0,100.0,94.0,2018-08-02,2018-08-01,2018-08-01,0


In [6]:
# 读取2018年09月磁盘数据集
# 读取csv文件
smartlog_data_201809_fault_df = pd.read_csv('smartlog_data_201809.csv')
# 删除空值
smartlog_data_201809_fault_df = smartlog_data_201809_fault_df.dropna(axis=1, how='all')
# 将dt的格式转化成时间序列，方便和故障时间进行比较
smartlog_data_201809_fault_df['dt'] = pd.to_datetime(smartlog_data_201809_fault_df['dt'], format='%Y%m%d')
# 将同一个模型的数据归结到一起
smartlog_data_201809_fault_A_df = smartlog_data_201809_fault_df[smartlog_data_201809_fault_df['model'] == 1]
smartlog_data_201809_fault_B_df = smartlog_data_201809_fault_df[smartlog_data_201809_fault_df['model'] == 2]
# 筛选发生故障的磁盘信息
smartlog_data_201809_fault_A_df = smartlog_data_201809_fault_A_df[smartlog_data_201809_fault_A_df['serial_number'].isin(fault_tag_A_df['serial_number'])]
smartlog_data_201809_fault_B_df = smartlog_data_201809_fault_B_df[smartlog_data_201809_fault_B_df['serial_number'].isin(fault_tag_B_df['serial_number'])]
# 混合数据集，方便比较
smartlog_data_201809_fault_A_df = pd.merge(smartlog_data_201809_fault_A_df, fault_tag_A_df, on='serial_number')
smartlog_data_201809_fault_B_df = pd.merge(smartlog_data_201809_fault_B_df, fault_tag_B_df, on='serial_number')
# 计算每个磁盘的最早一次观察时间
smartlog_data_201809_fault_A_df['first_observation_time'] = smartlog_data_201809_fault_A_df.groupby('serial_number')['dt'].transform('min')
smartlog_data_201809_fault_B_df['first_observation_time'] = smartlog_data_201809_fault_B_df.groupby('serial_number')['dt'].transform('min')
# 计算故障时间与最早一次观察时间的差值
smartlog_data_201809_fault_A_df['time_diff'] = (smartlog_data_201809_fault_A_df['fault_time'] - smartlog_data_201809_fault_A_df['first_observation_time']).dt.days
smartlog_data_201809_fault_B_df['time_diff'] = (smartlog_data_201809_fault_B_df['fault_time'] - smartlog_data_201809_fault_B_df['first_observation_time']).dt.days
# 筛选出那些最早的故障时间距离最后一次观察时间大于0天的磁盘的数据
smartlog_data_201809_fault_A_df = smartlog_data_201809_fault_A_df[(smartlog_data_201809_fault_A_df['time_diff'] >= 0) | (smartlog_data_201809_fault_A_df['fault_time'] <= '2018-09-05')]
smartlog_data_201809_fault_B_df = smartlog_data_201809_fault_B_df[(smartlog_data_201809_fault_B_df['time_diff'] >= 0) | (smartlog_data_201809_fault_B_df['fault_time'] <= '2018-09-05')]
# 选取属性进行测试
smartlog_data_201809_fault_A_df = smartlog_data_201809_fault_A_df[['serial_number', 'model', 'smart_5raw', 'smart_4raw', 'smart_192raw', 'smart_12raw', 'smart_5_normalized', 'smart_7_normalized', 'dt', 'fault_time', 'first_observation_time', 'time_diff']]
smartlog_data_201809_fault_B_df = smartlog_data_201809_fault_B_df[['serial_number', 'model', 'smart_9_normalized', 'smart_191_normalized', 'smart_192raw', 'smart_195_normalized', 'smart_5raw', 'smart_199raw', 'smart_190_normalized', 'smart_188raw', 'smart_194_normalized', 'smart_194raw', 'smart_190raw' , 'smart_191raw', 'smart_7_normalized', 'smart_9raw', 'smart_7raw', 'smart_242raw', 'smart_241raw', 'dt', 'fault_time', 'first_observation_time', 'time_diff']]
# 清除内存
del smartlog_data_201809_fault_df
gc.collect()
smartlog_data_201809_fault_A_df

Unnamed: 0,serial_number,model,smart_5raw,smart_4raw,smart_192raw,smart_12raw,smart_5_normalized,smart_7_normalized,dt,fault_time,first_observation_time,time_diff
0,disk_16037,1,0.0,20.0,19.0,20.0,100.0,90.0,2018-09-23,2018-12-26,2018-09-01,116
1,disk_16037,1,0.0,20.0,19.0,20.0,100.0,90.0,2018-09-06,2018-12-26,2018-09-01,116
2,disk_16037,1,0.0,20.0,19.0,20.0,100.0,90.0,2018-09-17,2018-12-26,2018-09-01,116
3,disk_16037,1,0.0,20.0,19.0,20.0,100.0,90.0,2018-09-08,2018-12-26,2018-09-01,116
4,disk_16037,1,0.0,20.0,19.0,20.0,100.0,90.0,2018-09-15,2018-12-26,2018-09-01,116
...,...,...,...,...,...,...,...,...,...,...,...,...
10457,disk_39472,1,69.0,9.0,7.0,8.0,100.0,82.0,2018-09-02,2018-09-02,2018-09-01,1
10458,disk_39472,1,69.0,9.0,7.0,8.0,100.0,82.0,2018-09-03,2018-09-02,2018-09-01,1
10459,disk_39472,1,69.0,9.0,7.0,8.0,100.0,94.0,2018-09-01,2018-09-02,2018-09-01,1
10460,disk_52300,1,0.0,19.0,16.0,18.0,100.0,91.0,2018-09-01,2018-08-31,2018-09-01,-1


In [5]:
# 读取2018年10月磁盘数据集
# 读取csv文件
smartlog_data_201810_fault_df = pd.read_csv('smartlog_data_201810.csv')
# 删除空值
smartlog_data_201810_fault_df = smartlog_data_201810_fault_df.dropna(axis=1, how='all')
# 将dt的格式转化成时间序列，方便和故障时间进行比较
smartlog_data_201810_fault_df['dt'] = pd.to_datetime(smartlog_data_201810_fault_df['dt'], format='%Y%m%d')
# 将同一个模型的数据归结到一起
smartlog_data_201810_fault_A_df = smartlog_data_201810_fault_df[smartlog_data_201810_fault_df['model'] == 1]
smartlog_data_201810_fault_B_df = smartlog_data_201810_fault_df[smartlog_data_201810_fault_df['model'] == 2]
# 筛选发生故障的磁盘信息
smartlog_data_201810_fault_A_df = smartlog_data_201810_fault_A_df[smartlog_data_201810_fault_A_df['serial_number'].isin(fault_tag_A_df['serial_number'])]
smartlog_data_201810_fault_B_df = smartlog_data_201810_fault_B_df[smartlog_data_201810_fault_B_df['serial_number'].isin(fault_tag_B_df['serial_number'])]
# 混合数据集，方便比较
smartlog_data_201810_fault_A_df = pd.merge(smartlog_data_201810_fault_A_df, fault_tag_A_df, on='serial_number')
smartlog_data_201810_fault_B_df = pd.merge(smartlog_data_201810_fault_B_df, fault_tag_B_df, on='serial_number')
# 计算每个磁盘的最早一次观察时间
smartlog_data_201810_fault_A_df['first_observation_time'] = smartlog_data_201810_fault_A_df.groupby('serial_number')['dt'].transform('min')
smartlog_data_201810_fault_B_df['first_observation_time'] = smartlog_data_201810_fault_B_df.groupby('serial_number')['dt'].transform('min')
# 计算故障时间与最早一次观察时间的差值
smartlog_data_201810_fault_A_df['time_diff'] = (smartlog_data_201810_fault_A_df['fault_time'] - smartlog_data_201810_fault_A_df['first_observation_time']).dt.days
smartlog_data_201810_fault_B_df['time_diff'] = (smartlog_data_201810_fault_B_df['fault_time'] - smartlog_data_201810_fault_B_df['first_observation_time']).dt.days
# 筛选出那些最早的故障时间距离最后一次观察时间大于0天的磁盘的数据
smartlog_data_201810_fault_A_df = smartlog_data_201810_fault_A_df[(smartlog_data_201810_fault_A_df['time_diff'] >= 0) | (smartlog_data_201810_fault_A_df['fault_time'] <= '2018-10-05')]
smartlog_data_201810_fault_B_df = smartlog_data_201810_fault_B_df[(smartlog_data_201810_fault_B_df['time_diff'] >= 0) | (smartlog_data_201810_fault_B_df['fault_time'] <= '2018-10-05')]
# 选取属性进行测试
smartlog_data_201810_fault_A_df = smartlog_data_201810_fault_A_df[['serial_number', 'model', 'smart_5raw', 'smart_4raw', 'smart_192raw', 'smart_12raw', 'smart_5_normalized', 'smart_7_normalized', 'dt', 'fault_time', 'first_observation_time', 'time_diff']]
smartlog_data_201810_fault_B_df = smartlog_data_201810_fault_B_df[['serial_number', 'model', 'smart_9_normalized', 'smart_191_normalized', 'smart_192raw', 'smart_195_normalized', 'smart_5raw', 'smart_199raw', 'smart_190_normalized', 'smart_188raw', 'smart_194_normalized', 'smart_194raw', 'smart_190raw' , 'smart_191raw', 'smart_7_normalized', 'smart_9raw', 'smart_7raw', 'smart_242raw', 'smart_241raw', 'dt', 'fault_time', 'first_observation_time', 'time_diff']]
# 清除内存
del smartlog_data_201810_fault_df
gc.collect()
smartlog_data_201810_fault_A_df

Unnamed: 0,serial_number,model,smart_5raw,smart_4raw,smart_192raw,smart_12raw,smart_5_normalized,smart_7_normalized,dt,fault_time,first_observation_time,time_diff
0,disk_105394,1,0.0,12.0,11.0,12.0,100.0,89.0,2018-10-11,2018-12-23,2018-10-01,83
1,disk_105394,1,0.0,12.0,11.0,12.0,100.0,89.0,2018-10-31,2018-12-23,2018-10-01,83
2,disk_105394,1,0.0,12.0,11.0,12.0,100.0,89.0,2018-10-30,2018-12-23,2018-10-01,83
3,disk_105394,1,0.0,12.0,11.0,12.0,100.0,89.0,2018-10-17,2018-12-23,2018-10-01,83
4,disk_105394,1,0.0,12.0,11.0,12.0,100.0,89.0,2018-10-19,2018-12-23,2018-10-01,83
...,...,...,...,...,...,...,...,...,...,...,...,...
8203,disk_49300,1,0.0,20.0,19.0,20.0,100.0,95.0,2018-10-10,2018-10-11,2018-10-03,8
8204,disk_49300,1,0.0,20.0,19.0,20.0,100.0,95.0,2018-10-07,2018-10-11,2018-10-03,8
8205,disk_49300,1,0.0,20.0,19.0,20.0,100.0,95.0,2018-10-06,2018-10-11,2018-10-03,8
8206,disk_49300,1,0.0,20.0,19.0,20.0,100.0,95.0,2018-10-03,2018-10-11,2018-10-03,8


In [4]:
# 读取2018年11月磁盘数据集
# 读取csv文件
smartlog_data_201811_fault_df = pd.read_csv('smartlog_data_201811.csv')
# Remove null values
smartlog_data_201811_fault_df = smartlog_data_201811_fault_df.dropna(axis=1, how='all')
# Convert the format of dt into a time series, which is convenient for comparison with the failure time
smartlog_data_201811_fault_df['dt'] = pd.to_datetime(smartlog_data_201811_fault_df['dt'], format='%Y%m%d')
# Bring together data from the same model
smartlog_data_201811_fault_A_df = smartlog_data_201811_fault_df[smartlog_data_201811_fault_df['model'] == 1]
smartlog_data_201811_fault_B_df = smartlog_data_201811_fault_df[smartlog_data_201811_fault_df['model'] == 2]
# Filter failed disk information
smartlog_data_201811_fault_A_df = smartlog_data_201811_fault_A_df[smartlog_data_201811_fault_A_df['serial_number'].isin(fault_tag_A_df['serial_number'])]
smartlog_data_201811_fault_B_df = smartlog_data_201811_fault_B_df[smartlog_data_201811_fault_B_df['serial_number'].isin(fault_tag_B_df['serial_number'])]
# Mixed datasets for easy comparison
smartlog_data_201811_fault_A_df = pd.merge(smartlog_data_201811_fault_A_df, fault_tag_A_df, on='serial_number')
smartlog_data_201811_fault_B_df = pd.merge(smartlog_data_201811_fault_B_df, fault_tag_B_df, on='serial_number')
# Compute the earliest observed time for each disk
smartlog_data_201811_fault_A_df['first_observation_time'] = smartlog_data_201811_fault_A_df.groupby('serial_number')['dt'].transform('min')
smartlog_data_201811_fault_B_df['first_observation_time'] = smartlog_data_201811_fault_B_df.groupby('serial_number')['dt'].transform('min')
# Calculate the difference between the failure time and the earliest observed time
smartlog_data_201811_fault_A_df['time_diff'] = (smartlog_data_201811_fault_A_df['fault_time'] - smartlog_data_201811_fault_A_df['first_observation_time']).dt.days
smartlog_data_201811_fault_B_df['time_diff'] = (smartlog_data_201811_fault_B_df['fault_time'] - smartlog_data_201811_fault_B_df['first_observation_time']).dt.days
# Filter out the data of those disks whose earliest failure time is greater than 0 days from the last observation time
smartlog_data_201811_fault_A_df = smartlog_data_201811_fault_A_df[(smartlog_data_201811_fault_A_df['time_diff'] >= 0) | 
                                                                  (smartlog_data_201811_fault_A_df['fault_time'] <= '2018-11-05')]
smartlog_data_201811_fault_B_df = smartlog_data_201811_fault_B_df[(smartlog_data_201811_fault_B_df['time_diff'] >= 0) | 
                                                                  (smartlog_data_201811_fault_B_df['fault_time'] <= '2018-11-05')]
# Select attribute
smartlog_data_201811_fault_A_df = smartlog_data_201811_fault_A_df[['serial_number', 'model', 'smart_5raw', 'smart_4raw', 'smart_192raw', 'smart_12raw', 
                                                                   'smart_5_normalized', 'smart_7_normalized', 'dt', 'fault_time', 'first_observation_time', 
                                                                   'time_diff']]
smartlog_data_201811_fault_B_df = smartlog_data_201811_fault_B_df[['serial_number', 'model', 'smart_9_normalized', 'smart_191_normalized', 'smart_192raw', 
                                                                   'smart_195_normalized', 'smart_5raw', 'smart_199raw', 'smart_190_normalized', 'smart_188raw', 
                                                                   'smart_194_normalized', 'smart_194raw', 'smart_190raw' , 'smart_191raw', 'smart_7_normalized', 
                                                                   'smart_9raw', 'smart_7raw', 'smart_242raw', 'smart_241raw', 'dt', 'fault_time', 
                                                                   'first_observation_time', 'time_diff']]
# Clear memory
del smartlog_data_201811_fault_df
gc.collect()
smartlog_data_201811_fault_A_df

Unnamed: 0,serial_number,model,smart_5raw,smart_4raw,smart_192raw,smart_12raw,smart_5_normalized,smart_7_normalized,dt,fault_time,first_observation_time,time_diff
0,disk_147809,1,0.0,25.0,23.0,24.0,100.0,90.0,2018-11-22,2018-12-28,2018-11-01,57
1,disk_147809,1,0.0,25.0,23.0,24.0,100.0,90.0,2018-11-19,2018-12-28,2018-11-01,57
2,disk_147809,1,0.0,25.0,23.0,24.0,100.0,90.0,2018-11-24,2018-12-28,2018-11-01,57
3,disk_147809,1,0.0,25.0,23.0,24.0,100.0,90.0,2018-11-03,2018-12-28,2018-11-01,57
4,disk_147809,1,0.0,25.0,23.0,24.0,100.0,90.0,2018-11-04,2018-12-28,2018-11-01,57
...,...,...,...,...,...,...,...,...,...,...,...,...
4709,disk_107404,1,0.0,21.0,16.0,17.0,100.0,95.0,2018-11-03,2018-11-19,2018-11-03,16
4710,disk_129666,1,112.0,17.0,16.0,16.0,98.0,84.0,2018-11-01,2018-10-31,2018-11-01,-1
4711,disk_144254,1,661.0,22.0,21.0,22.0,96.0,89.0,2018-11-01,2018-10-31,2018-11-01,-1
4712,disk_93666,1,2.0,22.0,21.0,22.0,100.0,89.0,2018-11-01,2018-10-31,2018-11-01,-1


In [3]:
# 读取2018年12月磁盘数据集
# 读取csv文件
smartlog_data_201812_fault_df = pd.read_csv('smartlog_data_201812.csv')
# 删除空值
smartlog_data_201812_fault_df = smartlog_data_201812_fault_df.dropna(axis=1, how='all')
# 将dt的格式转化成时间序列，方便和故障时间进行比较
smartlog_data_201812_fault_df['dt'] = pd.to_datetime(smartlog_data_201812_fault_df['dt'], format='%Y%m%d')
# 将同一个模型的数据归结到一起
smartlog_data_201812_fault_A_df = smartlog_data_201812_fault_df[smartlog_data_201812_fault_df['model'] == 1]
smartlog_data_201812_fault_B_df = smartlog_data_201812_fault_df[smartlog_data_201812_fault_df['model'] == 2]
# 筛选发生故障的磁盘信息
smartlog_data_201812_fault_A_df = smartlog_data_201812_fault_A_df[smartlog_data_201812_fault_A_df['serial_number'].isin(fault_tag_A_df['serial_number'])]
smartlog_data_201812_fault_B_df = smartlog_data_201812_fault_B_df[smartlog_data_201812_fault_B_df['serial_number'].isin(fault_tag_B_df['serial_number'])]
# 混合数据集，方便比较
smartlog_data_201812_fault_A_df = pd.merge(smartlog_data_201812_fault_A_df, fault_tag_A_df, on='serial_number')
smartlog_data_201812_fault_B_df = pd.merge(smartlog_data_201812_fault_B_df, fault_tag_B_df, on='serial_number')
# 计算每个磁盘的最早一次观察时间
smartlog_data_201812_fault_A_df['first_observation_time'] = smartlog_data_201812_fault_A_df.groupby('serial_number')['dt'].transform('min')
smartlog_data_201812_fault_B_df['first_observation_time'] = smartlog_data_201812_fault_B_df.groupby('serial_number')['dt'].transform('min')
# 计算故障时间与最早一次观察时间的差值
smartlog_data_201812_fault_A_df['time_diff'] = (smartlog_data_201812_fault_A_df['fault_time'] - smartlog_data_201812_fault_A_df['first_observation_time']).dt.days
smartlog_data_201812_fault_B_df['time_diff'] = (smartlog_data_201812_fault_B_df['fault_time'] - smartlog_data_201812_fault_B_df['first_observation_time']).dt.days
# 筛选出那些最早的故障时间距离最后一次观察时间大于0天的磁盘的数据
smartlog_data_201812_fault_A_df = smartlog_data_201812_fault_A_df[(smartlog_data_201812_fault_A_df['time_diff'] >= 0) | 
                                                                  (smartlog_data_201812_fault_A_df['fault_time'] <= '2018-12-05')]
smartlog_data_201812_fault_B_df = smartlog_data_201812_fault_B_df[(smartlog_data_201812_fault_B_df['time_diff'] >= 0) | 
                                                                  (smartlog_data_201812_fault_B_df['fault_time'] <= '2018-12-05')]
# 选取属性进行测试
smartlog_data_201812_fault_A_df = smartlog_data_201812_fault_A_df[['serial_number', 'model', 'smart_5raw', 'smart_4raw', 'smart_192raw', 
                                                                   'smart_12raw', 'smart_5_normalized', 'smart_7_normalized', 'dt', 'fault_time', 
                                                                   'first_observation_time', 'time_diff']]
smartlog_data_201812_fault_B_df = smartlog_data_201812_fault_B_df[['serial_number', 'model', 'smart_9_normalized', 'smart_191_normalized', 'smart_192raw', 
                                                                   'smart_195_normalized', 'smart_5raw', 'smart_199raw', 'smart_190_normalized', 'smart_188raw', 
                                                                   'smart_194_normalized', 'smart_194raw', 'smart_190raw' , 'smart_191raw', 'smart_7_normalized', 
                                                                   'smart_9raw', 'smart_7raw', 'smart_242raw', 'smart_241raw', 'dt', 'fault_time', 
                                                                   'first_observation_time', 'time_diff']]
# 清除内存
del smartlog_data_201812_fault_df
gc.collect()
smartlog_data_201812_fault_A_df

Unnamed: 0,serial_number,model,smart_5raw,smart_4raw,smart_192raw,smart_12raw,smart_5_normalized,smart_7_normalized,dt,fault_time,first_observation_time,time_diff
0,disk_61777,1,0.0,9.0,8.0,9.0,100.0,88.0,2018-12-16,2018-12-30,2018-12-09,21
1,disk_61777,1,0.0,9.0,8.0,9.0,100.0,88.0,2018-12-21,2018-12-30,2018-12-09,21
2,disk_61777,1,0.0,9.0,8.0,9.0,100.0,88.0,2018-12-22,2018-12-30,2018-12-09,21
3,disk_61777,1,0.0,9.0,8.0,9.0,100.0,88.0,2018-12-11,2018-12-30,2018-12-09,21
4,disk_61777,1,0.0,9.0,8.0,9.0,100.0,88.0,2018-12-19,2018-12-30,2018-12-09,21
...,...,...,...,...,...,...,...,...,...,...,...,...
2394,disk_73603,1,429.0,27.0,25.0,26.0,97.0,84.0,2018-12-04,2018-12-04,2018-12-01,3
2395,disk_73603,1,428.0,27.0,25.0,26.0,97.0,84.0,2018-12-02,2018-12-04,2018-12-01,3
2396,disk_35598,1,18.0,9.0,8.0,9.0,100.0,95.0,2018-12-01,2018-12-02,2018-12-01,1
2397,disk_35598,1,66.0,9.0,8.0,9.0,100.0,89.0,2018-12-02,2018-12-02,2018-12-01,1


In [24]:
# 合并所有A类型故障磁盘数据
smartlog_data_fault_A_df = pd.concat([smartlog_data_201707_fault_A_df, smartlog_data_201708_fault_A_df, smartlog_data_201709_fault_A_df, smartlog_data_201710_fault_A_df, smartlog_data_201711_fault_A_df, smartlog_data_201712_fault_A_df, smartlog_data_201801_fault_A_df, smartlog_data_201802_fault_A_df, smartlog_data_201803_fault_A_df, smartlog_data_201804_fault_A_df, smartlog_data_201805_fault_A_df, smartlog_data_201806_fault_A_df, smartlog_data_201807_fault_A_df, smartlog_data_201808_fault_A_df, smartlog_data_201809_fault_A_df, smartlog_data_201810_fault_A_df, smartlog_data_201811_fault_A_df, smartlog_data_201812_fault_A_df])
# 合并所有B类型故障磁盘数据
smartlog_data_fault_B_df = pd.concat([smartlog_data_201707_fault_B_df, smartlog_data_201708_fault_B_df, smartlog_data_201709_fault_B_df, smartlog_data_201710_fault_B_df, smartlog_data_201711_fault_B_df, smartlog_data_201712_fault_B_df, smartlog_data_201801_fault_B_df, smartlog_data_201802_fault_B_df, smartlog_data_201803_fault_B_df, smartlog_data_201804_fault_B_df, smartlog_data_201805_fault_B_df, smartlog_data_201806_fault_B_df, smartlog_data_201807_fault_B_df, smartlog_data_201808_fault_B_df, smartlog_data_201809_fault_B_df, smartlog_data_201810_fault_B_df, smartlog_data_201811_fault_B_df, smartlog_data_201812_fault_B_df])

In [25]:
print(len(smartlog_data_fault_A_df))
print(len(smartlog_data_fault_B_df))

334285
243725


#### 将数据保存到CSV文件中去

In [26]:
smartlog_data_fault_A_df.to_csv('smartlog_data_fault_A_df.csv', index=False)
smartlog_data_fault_B_df.to_csv('smartlog_data_fault_B_df.csv', index=False)