In [1]:
from Chemometrics.import_common import *
%matplotlib inline
# Use the SimHei font for sans-serif text so Chinese labels can be rendered in
# matplotlib figures (assumes the font is installed on this machine).
plt.rcParams['font.sans-serif'] = 'SimHei'

In [2]:
# Load the per-part labels. Later cells read `y.index` (part names) and
# `y.loc[name]` (stored as each part's 'Level').
# NOTE(review): `pload` comes from the star import above; presumably a pickle
# loader -- confirm, and only load files from a trusted source.
y = pload('./data/零部件气味等级.p')

In [3]:
# Seed NumPy's global RNG so the randomly generated placeholder values below are
# reproducible. Note that gen_representative() reseeds the global RNG to 1 again
# every time it is called.
np.random.seed(1)

In [4]:
def rename(a):
    """Keep the four compound columns of a raw sheet and give them English names.

    Parameters
    ----------
    a : pandas.DataFrame
        Raw sheet containing at least the columns '英文名称', 'CAS 编号',
        '中文名称' and '浓度值（ug/m3）'.

    Returns
    -------
    pandas.DataFrame
        A new frame with columns 'EnglishName', 'CAS', 'Name', 'Concentration'
        (in that order). The input frame is not modified.

    Raises
    ------
    KeyError
        If any of the four source columns is missing.
    """
    column_map = {
        '英文名称': 'EnglishName',
        'CAS 编号': 'CAS',
        '中文名称': 'Name',
        '浓度值（ug/m3）': 'Concentration',
    }
    # Selecting only the mapped columns drops everything else, including the
    # serial-number column of the raw sheet.
    selected = a.loc[:, list(column_map)]
    # Missing values are intentionally left untouched for now.
    return selected.rename(columns=column_map)
def complete_columns(a):
    """Pad a compound table with placeholder columns and de-duplicate it by CAS.

    The real values of 'MatchingDegree', 'Odt', 'Npt', 'SmellType' and
    'Pleasantness' are not available, so they are filled with mock data:
    standard-normal random numbers, except 'MatchingDegree', which is the
    constant integer 2.

    Parameters
    ----------
    a : pandas.DataFrame
        Compound table containing a 'CAS' column.

    Returns
    -------
    pandas.DataFrame
        `a` with the five placeholder columns appended, keeping only the first
        row for every CAS value. The input frame is not modified.
    """
    _columns = ['MatchingDegree','Odt','Npt','SmellType','Pleasantness']
    # Random numbers are still drawn for every placeholder column (including
    # 'MatchingDegree', which is overwritten below) so the values produced for
    # the other columns under a given seed stay the same as before.
    placeholders = pd.DataFrame(
        np.random.randn(len(a), len(_columns)),
        index=a.index,
        columns=_columns,
    )
    # Whole-column assignment replaces the float column, so the dtype is
    # integer on every pandas version. Writing through `.loc[:, col]` may set
    # values in place and leave the column as float 2.0.
    placeholders['MatchingDegree'] = 2
    carpart = pd.concat([a, placeholders], axis=1)
    return carpart.drop_duplicates('CAS')

In [5]:
def gen_TrainData(y,dfs):
    """Build one training record per part listed in `y`.

    Parameters
    ----------
    y : pandas object indexed by part name
        `y.loc[name]` is stored as the record's 'Level'.
    dfs : mapping of part name -> pandas.DataFrame
        Compound table of each part.

    Returns
    -------
    list of dict
        One dict per entry of `y.index`, with keys 'Name', 'Id', 'Level' and
        'Compounds' (in that order). 'Id' is a random integer in [0, 1000)
        drawn from NumPy's global RNG; 'Compounds' is the part's table as a
        list of row dicts.
    """
    records = []
    for part_name in y.index:
        compounds = dfs[part_name]
        # A fresh dict per part; the literal is evaluated top to bottom, so the
        # random Id is drawn before the level lookup.
        records.append({
            'Name': part_name,
            'Id': np.random.randint(1000),
            'Level': y.loc[part_name],
            'Compounds': compounds.to_dict(orient='records'),
        })
    return records
def gen_TrainParams(y,dfs,do_fmt=True):
    """Assemble a mock training-parameter payload.

    Parameters
    ----------
    y : pandas object indexed by part name
        Labels forwarded to gen_TrainData.
    dfs : mapping of part name -> pandas.DataFrame
        Compound table of each part.
    do_fmt : bool, default True
        When true, every part is aligned on the common pattern with fmt()
        before the training data is generated.

    Returns
    -------
    dict
        Keys: 'CommonPattern', 'TrainData', 'Preprocessing', 'CrossValidation',
        'CrossValidationK', 'NumberSimulation', 'NumberLatentVariable',
        'NumberNeighbors', 'ModelName'. The numeric settings and the model
        name suffix are random placeholders from NumPy's global RNG.
    """
    cp = common_pattern(gen_representative(dfs))
    # NOTE(review): common_pattern() returns a frame indexed by CAS and
    # to_dict(orient='records') does not emit the index, so these records carry
    # no CAS value -- confirm this is what the consumer expects.
    common_records = cp.to_dict(orient='records')
    # Alignment is the default; pass do_fmt=False to keep the parts as they are.
    parts = fmt(cp, dfs) if do_fmt else dfs
    # The literal is evaluated top to bottom, so the random draws happen in the
    # order the keys are listed.
    return {
        'CommonPattern': common_records,
        'TrainData': gen_TrainData(y, parts),
        'Preprocessing': '针对变量均一化',
        'CrossValidation': 'K重交叉验证',
        'CrossValidationK': np.random.randint(10, 20),
        'NumberSimulation': np.random.randint(2, 10),
        'NumberLatentVariable': np.random.randint(1, 10),
        'NumberNeighbors': np.random.randint(1, 10),
        'ModelName': 'model' + str(np.random.randint(10)),
    }

def carpart(y,dfs):
    """Build one record per part listed in `y`, without the level label.

    Parameters
    ----------
    y : pandas object indexed by part name
        Only `y.index` is used.
    dfs : mapping of part name -> pandas.DataFrame
        Compound table of each part.

    Returns
    -------
    list of dict
        One independent dict per entry of `y.index`, with keys 'Id' (random
        integer in [0, 1000) from NumPy's global RNG), 'Name' and 'Compounds'
        (the part's table as a list of row dicts).
    """
    TrainData = []
    for name in y.index:
        a = dfs[name]
        # Build a new dict for every part. Appending one shared dict made every
        # list entry alias the same object, so all entries showed the last part.
        TrainData.append({
            'Id': np.random.randint(1000),
            'Name': name,
            'Compounds': a.to_dict(orient='records'),
        })
    return TrainData

In [6]:
def common_pattern(rps):
    """Merge representative parts into one compound table keyed by CAS.

    Parameters
    ----------
    rps : mapping of part name -> pandas.DataFrame
        Representative parts; every frame needs 'CAS' and 'Concentration'
        columns.

    Returns
    -------
    pandas.DataFrame
        One row per distinct CAS (index named 'CAS'). Non-concentration columns
        come from the first occurrence of the CAS; 'Concentration' is the mean
        over all occurrences.
    """
    a = pd.concat(rps).reset_index(drop=True)
    cp = a.drop_duplicates('CAS').set_index('CAS')
    # Average only the Concentration column. Taking the mean of the whole
    # group-by result also tries to average the text columns, which raises on
    # recent pandas versions.
    mean_concentration = a.groupby('CAS')['Concentration'].mean()
    # Assignment aligns on the CAS index.
    cp['Concentration'] = mean_concentration
    return cp

In [7]:
def gen_representative(dfs):
    """Pick up to five parts at random to act as representatives.

    Parameters
    ----------
    dfs : mapping of part name -> object
        All available parts.

    Returns
    -------
    dict
        The picked part names mapped to their entries in `dfs`.

    Notes
    -----
    The global NumPy RNG is reseeded to 1 on every call, so the selection is
    always the same for a given `dfs`, and later random draws are affected too.
    """
    np.random.seed(1)
    part_names = list(dfs)
    n_picks = min(len(part_names), 5)
    representative = {}
    # NOTE(review): np.random.choice samples with replacement by default, so a
    # name can be drawn twice and fewer than `n_picks` distinct parts may be
    # returned -- confirm whether replace=False was intended.
    for picked in np.random.choice(part_names, n_picks):
        representative[picked] = dfs[picked]
    return representative

In [8]:
# Align every part on the CAS numbers of the common pattern
def fmt(cp,dfs):
    """Re-index every part's compound table on the common pattern's CAS values.

    Parameters
    ----------
    cp : pandas.DataFrame
        Common pattern, indexed by CAS.
    dfs : mapping of part name -> pandas.DataFrame
        Compound table of each part; every frame needs a 'CAS' column.

    Returns
    -------
    dict
        Part name -> frame with exactly one row per CAS in `cp.index`, in that
        order. Compounds of the common pattern that the part does not contain
        are filled with 0 in every column; compounds of the part that are not
        in the common pattern are dropped.
    """
    aligned = {}
    for part_name, part in dfs.items():
        by_cas = part.drop_duplicates('CAS').set_index('CAS')
        # reindex() inserts NaN rows for CAS values the part lacks. Label
        # selection with .loc raises KeyError for missing labels on current
        # pandas versions.
        padded = by_cas.reindex(cp.index).fillna(0).reset_index()
        aligned[part_name] = fill_withcp(padded, cp)
    return aligned
def fill_withcp(a,cp):
    """Add 'Concentration' / 'Pleasantness' from the common pattern when absent.

    Parameters
    ----------
    a : pandas.DataFrame
        Compound table of one part; modified in place.
    cp : pandas.DataFrame
        Common pattern, indexed by CAS.

    Returns
    -------
    pandas.DataFrame
        `a` itself. Columns that `a` already has are left untouched; a missing
        column is taken from `cp`, looked up by the 'CAS' column of `a` when
        there is one and aligned on the index otherwise.
    """
    cols = ['Concentration','Pleasantness']
    for c in cols:
        # The previous test (`c not in cols`) could never be true, so nothing
        # was ever filled. Test against the frame's own columns instead.
        if c in a.columns or c not in cp.columns:
            continue
        if 'CAS' in a.columns:
            # `a` usually has a fresh integer index while `cp` is indexed by
            # CAS, so look the values up through the CAS column.
            a[c] = a['CAS'].map(cp[c])
        else:
            a[c] = cp[c]
    return a

In [9]:
#生成对齐后的数据

In [10]:
# Read every sheet of the workbook into a dict of frames keyed by sheet name.
# The keyword is `sheet_name`; the old spelling `sheetname` is no longer
# accepted by pandas.
dfs = pd.read_excel('./中汽中心/data/15种零部件全谱数据.xlsx',sheet_name=None)
# Normalise the column names and add the placeholder columns for every part.
dfs = {a:complete_columns(rename(dfs[a])) for a in dfs}
# Build the payload without aligning the parts on the common pattern.
TrainParams = gen_TrainParams(y,dfs,False)

In [11]:
# Read the generated data back after a JSON round trip (un-aligned variant).
import json
TrainParams = gen_TrainParams(y,dfs,False)
# json.loads + DataFrame avoids passing a literal JSON string to pd.read_json,
# which recent pandas versions deprecate.
pd.DataFrame(json.loads(json.dumps(TrainParams['TrainData'][1]['Compounds']))).shape
# Aligning on the common pattern removes compounds (compare with the next cell).

(50, 9)

In [12]:
# Rebuild the payload with alignment on the common pattern, then count the
# compounds of the second part that still have a positive concentration.
TrainParams = gen_TrainParams(y,dfs,True)
# json.loads + DataFrame avoids passing a literal JSON string to pd.read_json,
# which recent pandas versions deprecate.
int((pd.DataFrame(json.loads(json.dumps(TrainParams['TrainData'][1]['Compounds'])))['Concentration'] > 0).sum())

35

In [13]:
# Serialise the whole parameter dict to XML. When the cells run in order,
# TrainParams still contains 'CommonPattern' at this point.
# NOTE(review): attr_type=False presumably omits the per-element type
# attributes -- confirm against the dicttoxml documentation.
import dicttoxml
xml = dicttoxml.dicttoxml(TrainParams,attr_type=False)

In [14]:
# Write the XML payload to disk as UTF-8 text; `xml` is decoded first because
# the file is opened in text mode.
with open("./data/car_trainparams.xml",'w',encoding='utf8') as f:
    f.write(xml.decode('utf-8'))

In [15]:
# Remove 'CommonPattern' before the JSON export below. The membership test makes
# the cell safe to re-run, so no blanket try/except is needed. A bare `except`
# would also hide unrelated errors such as an undefined TrainParams.
if 'CommonPattern' in TrainParams:
    del TrainParams['CommonPattern']

In [16]:
# Export the remaining parameters as JSON; ensure_ascii=False keeps the Chinese
# text readable in the file instead of \u escapes.
with open('./data/trainparams.json','w',encoding='utf8') as f:
    json.dump(TrainParams,f,ensure_ascii=False)

In [17]:
# Keep only the training records for the next reshaping step. This references
# the same list object as TrainParams['TrainData'], not a copy.
yada_api = {'TrainData':TrainParams['TrainData']}

In [18]:
# Inspect the keys of one training record.
yada_api['TrainData'][0].keys()

dict_keys(['Name', 'Id', 'Level', 'Compounds'])

In [19]:
# Reshape every training record into the nested layout
# {'Id', 'Name', 'CarPartCompound': [{'Id', 'Level', 'Compounds'}]}.
from copy import deepcopy
traindata  = []
# Fields every exported compound must carry, in output order.
keys = ['CAS', 'MatchingDegree', 'Name', 'Concentration', 'Odt', 'Npt', 'CharacterAndSmell', 'HealthHazard', 'TouchLimit', 'MainFunction', 'EnglishName']
# Iterate over the records themselves rather than a hard-coded range(15), so the
# cell works for any number of parts.
for i, part in enumerate(yada_api['TrainData']):
    # Everything except the part name goes into the nested compound record.
    carpartcmp = {k: part[k] for k in part if k != 'Name'}
    # Compounds whose Name is 0 are skipped (fmt() fills missing rows with 0).
    # For each kept compound, a field that is absent, or is 'EnglishName' /
    # 'Name', gets the field name itself as a placeholder value.
    carpartcmp['Compounds'] = [
        {k: (cmp[k] if (k in cmp) and (k not in ['EnglishName','Name']) else k) for k in keys}
        for cmp in carpartcmp['Compounds']
        if cmp['Name'] != 0
    ]
    carpartcmp['Id'] = str(i)
    traindata.append({'Id':str(np.random.randint(1,1000)),'Name':part['Name'],'CarPartCompound':[deepcopy(carpartcmp)]})

In [20]:
# Rebind TrainParams to the reshaped payload; the earlier parameter dict is no
# longer reachable under this name.
TrainParams = {'TrainData': traindata}

In [21]:
# Write the reshaped payload to the working directory; the `with` block closes
# the file even if json.dump raises.
with open('parts_15_data_long.json','w',encoding='utf-8') as f:
    json.dump(TrainParams,f,ensure_ascii=False)

In [22]:
# Inspect the top-level keys of one reshaped record.
TrainParams['TrainData'][0].keys()

dict_keys(['Id', 'Name', 'CarPartCompound'])

In [23]:
# `carpartcmp` is the loop variable left over from the reshaping cell, i.e. the
# nested record of the last part processed.
carpartcmp.keys()

dict_keys(['Id', 'Level', 'Compounds'])

In [24]:
# Inspect the fields of the first exported compound of the last part.
carpartcmp['Compounds'][0].keys()

dict_keys(['CAS', 'MatchingDegree', 'Name', 'Concentration', 'Odt', 'Npt', 'CharacterAndSmell', 'HealthHazard', 'TouchLimit', 'MainFunction', 'EnglishName'])

dict