### 1. import modules

In [16]:
import pandas as pd
import numpy as np
import networkx as nx
import pickle
import os
import json
from ylib import ylog
from tqdm import *
import matplotlib.pyplot as plt
import cx_Oracle as cx
from collections import defaultdict 
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup

np.set_printoptions(suppress=True)
%precision %.5g
%matplotlib inline 
import logging
from gensim import corpora, models, similarities
from gensim.matutils import jaccard
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD


### 2. database connection configuration

In [7]:
os.environ['NLS_LANG'] = 'SIMPLIFIED CHINESE_CHINA.UTF8'
conn_kbms = cx.connect('risk_matrix/risk_matrix@192.168.4.30:1521/orcl')
conn_patient = cx.connect('MMAPV41/MMAPV411556@192.168.4.32:1521/orcl')
ylog.set_level(logging.DEBUG)
ylog.console_on()
ylog.filelog_on("topics")

### 3. load data

In [15]:
# 解析详情页
def parse_detail(html):
    doc = pq(html)
    title = doc('.rich_media_title').text()
    content = doc('.rich_media_content').text()
    wechat_name = doc('#js_profile_qrcode > div > strong').text()
    nickname = doc('.rich_media_meta_text').text()
    wechat = doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
    date = None
    return {
        'title':title,
        'wechat_name': wechat_name,
        'content':content,
        'date':date,
        'nickname':nickname,
        'wechat':wechat
    }


In [37]:
dir_path = './data/'

In [35]:
ls_articles = []
for root, dirs, files in os.walk(dir_path):
    print(root, "has", len(files))
    for name in files:
        file_path = os.path.join(root, name)
        with open(file_path) as fp:
            article_content = fp.read()
        article = parse_detail(article_content)
        ls_articles.append(article)
    # print(sum([os.path.getsize(os.path.join(root, name)) for name in files]), '\s')
    # print("bytes in", len(files), "non-directory files")


./data/ has 0
./data/40秒 has 10
./data/Chihiro Quantitative Research has 11
./data/PureDelta has 10
./data/cfuwxd has 10
./data/codeMakeMoney has 1
./data/万孟岩 has 15
./data/东京交易厅 has 11
./data/人民币交易与研究 has 29
./data/付鹏的财经世界 has 10
./data/全球资产观察 has 10
./data/关关楼西 has 10
./data/冲浪团 has 10
./data/冷眼局中人 has 11
./data/加州分析员 has 10
./data/半夏投资 has 10
./data/华尔街情报圈 has 10
./data/博策远略 has 18
./data/向小田科技观察 has 10
./data/外汇头条 has 13
./data/宏观大类资产配置研究 has 13
./data/定投十年赚十倍 has 17
./data/岭峰资本 has 10
./data/川流不息d has 13
./data/川流不息skl has 6
./data/市场观察家 has 10
./data/扑克投资家 has 72
./data/投研帮 has 19
./data/新全球资产配置 has 10
./data/方哥谈基 has 10
./data/月风投资笔记 has 10
./data/期权交易策略 has 10
./data/李超宏观研究与资产配置 has 14
./data/沧海一土狗 has 10
./data/深圳天谷资产管理有限公司 has 10
./data/淳臻投资 has 24
./data/混沌巡洋舰 has 10
./data/湖畔公寓28号 has 10
./data/私募工场 has 73
./data/筹码 has 10
./data/聚宽量化实验室 has 16
./data/聪明投资者 has 11
./data/量化投资与机器学习 has 16
./data/量化投资大家学 has 10
./data/金融学前沿论文速递 has 10
./data/长赢指数投资 has 11
./data/靠门三思 has 10
.

In [38]:
df_articles = pd.DataFrame.from_records(ls_articles)


In [39]:
df_articles

Unnamed: 0,content,date,nickname,title,wechat,wechat_name
0,话说这几天本来在写卡尔·伊坎，但实在是被一些媒体的傻X报道给气到了，今天我必须要吐个槽。\n...,,原创： 认识最疯狂的天才,别被傻X媒体骗了，索罗斯不像他们说的那样 | 华尔街黑历史（三）,sishimiao,40秒
1,昨天在《Institutional Investor》上看到一篇文章，和你分享一下。\n\n...,,原创： 认识最疯狂的天才,"散户们, 你们的末日到了｜ 华尔街黑历史（四）",sishimiao,40秒
2,这一次，索罗斯大爷又说对了。\n\n\n在脱欧公投前，他就在《卫报》上发公开信警告英国佬不要...,,原创： 认识最疯狂的天才,一场规模空前的傻X秀｜华尔街黑历史（番外篇）,sishimiao,40秒
3,事先声明：本人和文中提及的商业机构或人物均无利益关系。\n\n\n最近经常有人让我推荐经济学...,,原创： 认识最疯狂的天才,没有宋鸿兵，就没有经济学,sishimiao,40秒
4,我这人最不喜欢转别人文章，但今天必须要破一次例。没别的原因，就是太激动了。\n\n\n要知道...,,认识最疯狂的天才,那个要把100万人送上火星的男人，究竟牛逼在哪儿？,sishimiao,40秒
5,刚才看了下留言和评论，发现很多同学看了昨天那篇，都被伊隆·马斯克的先进事迹感动得不要不要的，...,,原创： 认识最疯狂的天才,做企业家的，哪有什么白莲花？,sishimiao,40秒
6,这是我前几天给红杉资本写的一篇约稿，感觉内容还行，所以就转过来啦。本文首发于红杉汇（id:S...,,认识最疯狂的天才,驯服贝克汉姆和C罗的人,sishimiao,40秒
7,在这个星球上，你应该很难找到比尤瑟夫·卡巴杰（Youssef Kabbaj）更能忽悠的人了。...,,原创： 认识最疯狂的天才,就是这小子，把卡扎菲给忽悠瘸了 | 华尔街黑历史（五）,sishimiao,40秒
8,我看后台有很多同学嫌我发文太少不够看。好啊，今天我就发一篇8000字的，有本事你就看完它。\...,,认识最疯狂的天才,这是人类历史上最贵的一场撕逼大战 | 华尔街黑历史（六）,sishimiao,40秒
9,很多同学留言说上次聊Ackman那篇太长了，那我今天发篇短的。\n\n\n本文的主角是YC掌...,,认识最疯狂的天才,这小子正在和马斯克一起拯救世界，顺便统治一下硅谷,sishimiao,40秒


In [36]:
article

{'title': '人体所需七大类营养素与健康的关系',
 'wechat_name': '全球资产配置专家88',
 'content': '1.水：没有它七天就会死!\n2.蛋白质：伤口的自动缝合机!\n3.脂类：身体的能源库!\n4.碳水化合物：热能最主要的来源!\n5.维生素：防止疾病的重要物质!\n6.矿物质：营养的活跃分子!\n7.膳食纤维：杜绝消化道癌!\n\n\n\n\n\n一、概述：\n水是地球上最常见的物质之一，是包括人类在内所有生命生存的重要资源，也是生物体最重要的组成部分。水在生命演化中起到了重要的作用。水是一切生命所必需的物质，是饮食中的基本成份，在生命活动中有重要生理功能。\n\n\n二、作用：\n1.人体构造的主要成份，水占成人体重的50%～60%;\n2.营养物质的溶剂和运输的载体;\n3.调节体温和润滑组织。\n\n\n\n\n蛋白质\n\n\n一.概述\n蛋白质是一切生命的物质基础，约占人体总重的20%，占总固体量的45%，是构成和制造肌肉、血液、皮肤、骨骼等多种身体组织的主要物质，没有蛋白质就没有生命。\n\n\n蛋白质在人体内是一个动态平衡状态。人体内的蛋白质每天都处在不断分解和合成之中，每天约有3%的蛋白质被更新，几乎一个月内全身的蛋白质就换新一遍。每天摄入的蛋白质又不能储存，所以每天供应足够的蛋白质是非常重要的。\n\n\n蛋白质是由碳氢氧氮组成的含氮化物，基本结构是氨基酸。构成人体的氨基酸有22种，其中有9种是人体自身不能合成的，必须从饮食中摄取，称为必需氨基酸。其它13种为非必需氨基酸。氨基酸的不同组合构成人体不同种类的蛋白质。\n\n\n蛋白质作为能量代谢时，因含氮元素而不能被完全氧化，会产生尿酸、肌酐、尿素等废物经肾脏排出体外。肾功能有病变者，控制蛋白质的摄入量。\n\n\n二.作用：\n\n\n1.制造和修护人体组织。\n构成人体的肌肉、血液、皮肤、骨骼、头发、指甲等人体各种组织和器官，制造新组织，修护坏组织，如帮助伤口愈合。\n\n\n2.构成人体内多种重要生理作用的物质，如酶、激素、抗体、血红蛋白等。\n酶在人体内主要起崔化作用，参与人体的各种化学反应。\n\n\n激素在人体内主要起着重要的调节作用，促进和控制身体各种腺体、器官的活动信息。如甲状腺负责新陈代谢等。\n\n\n抗体制造免疫细胞，β淋巴细胞和T

In [33]:
ls_articles[6]

{'title': '',
 'wechat_name': '',
 'content': '',
 'date': None,
 'nickname': '',
 'wechat': ''}

In [13]:
df_patient = pd.read_pickle('../group_fraud_detection/data/patient.pkl')

### mapping

In [167]:
df_vi_mapping = pd.read_sql('select id, item_name from kbms.kbms_vflc_items', conn_kbms)
df_dr_mapping = pd.read_sql('select id, product_name from kbms.kbms_drug_instructions', conn_kbms)


In [172]:
df_vi_mapping.head()

Unnamed: 0,ID,ITEM_NAME
0,VI188,脑脊液白蛋白测定
1,VI189,脑脊液IgG测定
2,VI190,β2微球蛋白测定
3,VI4372,类胰岛素生长因子结合蛋白3(IGFBP-3)
4,VI192,尿CTx测定


In [173]:
df_dr_mapping.head()

Unnamed: 0,ID,PRODUCT_NAME
0,DR1493,复方和血丸
1,DR1154,复方银杏叶颗粒
2,DR1424,苍鹅鼻炎片
3,DR3927,金龙舒胆胶囊
4,DR3960,金刚口服液


In [174]:
dict_vi = df_vi_mapping.set_index('ID')['ITEM_NAME'].to_dict()
dict_dr = df_dr_mapping.set_index('ID')['PRODUCT_NAME'].to_dict()


#### preview patient data

In [14]:
print('patient number: ', df_patient.shape[0])
print('hospital number: ', df_patient['hos_id'].unique().shape[0])
df_patient.head()

patient number:  427632
hospital number:  136


Unnamed: 0,person_id,hos_id,med_clinic_id,hos_lv,sex,age,disease,admission_date,discharge_date,length_of_stay,amount,items_number,med_code,item_code
0,10105032,9583046520,22088875048,1,1,62,M51,2018-07-11 08:37:06,2018-07-20 08:26:24,9,4402.41,10591.0,"DE,DR1143,DR1789,DR2285,DR2948,DR868","VI1098,VI1139,VI1145,VI1147,VI174,VI1740,VI175..."
1,10107064,955225894,21822777641,3,0,83,K56,2018-06-04 10:15:53,2018-06-18 10:11:21,14,54236.63,2171.0,"DL236,DR1238,DR1453,DR159,DR1724,DR1752,DR1804...","VI101,VI108,VI1086,VI1099,VI1129,VI114,VI1140,..."
2,10207046,18535093389,20667948170,2,1,56,M51,2018-01-05 10:34:00,2018-01-09 08:16:01,4,3841.25,242.0,"DR1421,DR3011,DR85125,DR92591","VI108,VI1099,VI114,VI1147,VI122,VI157,VI174,VI..."
3,10501010,955225602,20961464065,3,0,61,N18,2018-02-13 08:30:43,2018-03-08 09:06:08,23,39949.7,2974.0,"DL236,DR102870,DR1194,DR1323,DR1377,DR1809,DR1...","VI108,VI1099,VI114,VI1145,VI1147,VI122,VI137,V..."
4,10501010,955225636,20910734968,2,0,61,I63,2018-02-06 23:00:00,2018-02-12 13:33:01,6,10832.87,666.5,"DL2417,DR1296,DR1377,DR1563,DR1717,DR1798,DR20...","VI108,VI1099,VI114,VI122,VI137,VI157,VI174,VI1..."


#### map code to name

In [189]:
def translate_name(x, dic):
    ls = x.split(',')
    ls_new = [dic[x.strip()].replace(' ', '') if x in dic else x for x in ls]
    return ','.join(ls_new)

In [190]:
df_patient['med_name'] = df_patient['med_code'].apply(lambda x:translate_name(x, dict_dr))
df_patient['item_name'] = df_patient['item_code'].apply(lambda x:translate_name(x, dict_vi))

In [184]:
df_patient = df_patient.groupby(['hos_id', 'disease']).filter(lambda x:x['person_id'].unique().size>=3)

In [185]:
df_patient = df_patient.set_index(['hos_id', 'disease'])

KeyError: 'hos_id'

In [186]:
df_patient.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,person_id,med_clinic_id,hos_lv,sex,age,admission_date,discharge_date,length_of_stay,amount,items_number,med_code,item_code,treatment_code,med_name,item_name
hos_id,disease,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
9583046520,M51,10105032,22088875048,1,1,62,2018-07-11 08:37:06,2018-07-20 08:26:24,9,4402.41,10591.0,"DE,DR1143,DR1789,DR2285,DR2948,DR868","VI1098,VI1139,VI1145,VI1147,VI174,VI1740,VI175...","DE,DR1143,DR1789,DR2285,DR2948,DR868,VI1098,VI...","DE,腰痹通胶囊,云南白药气雾剂,甘露醇注射液,甲钴胺片,双氯芬酸钠缓释片","血细胞分析,尿液分析,粪便常规,隐血试验,血清总蛋白测定,常规心电图检查,血清白蛋白测定,类..."
955225894,K56,10107064,21822777641,3,0,83,2018-06-04 10:15:53,2018-06-18 10:11:21,14,54236.63,2171.0,"DL236,DR1238,DR1453,DR159,DR1724,DR1752,DR1804...","VI101,VI108,VI1086,VI1099,VI1129,VI114,VI1140,...","DL236,DR1238,DR1453,DR159,DR1724,DR1752,DR1804...","DL236,间苯三酚注射液,地塞米松磷酸钠注射液,枸橼酸舒芬太尼注射液,铜绿假单胞菌注射液,...","血浆凝血酶原时间测定(PT),活化部分凝血活酶时间测定(APTT),红细胞比积测定(HCT)..."
18535093389,M51,10207046,20667948170,2,1,56,2018-01-05 10:34:00,2018-01-09 08:16:01,4,3841.25,242.0,"DR1421,DR3011,DR85125,DR92591","VI108,VI1099,VI114,VI1147,VI122,VI157,VI174,VI...","DR1421,DR3011,DR85125,DR92591,VI108,VI1099,VI1...","维生素B6注射液,维生素C注射液,氯化钠注射液,腰痛宁胶囊","活化部分凝血活酶时间测定(APTT),血细胞分析,血浆纤维蛋白原测定,隐血试验,凝血酶时间测..."
955225602,N18,10501010,20961464065,3,0,61,2018-02-13 08:30:43,2018-03-08 09:06:08,23,39949.7,2974.0,"DL236,DR102870,DR1194,DR1323,DR1377,DR1809,DR1...","VI108,VI1099,VI114,VI1145,VI1147,VI122,VI137,V...","DL236,DR102870,DR1194,DR1323,DR1377,DR1809,DR1...","DL236,阿莫西林克拉维酸钾分散片(4:1),注射用青霉素钠,双环醇片,硫酸氢氯吡格雷片,...","活化部分凝血活酶时间测定(APTT),血细胞分析,血浆纤维蛋白原测定,粪便常规,隐血试验,凝..."
955225636,I63,10501010,20910734968,2,0,61,2018-02-06 23:00:00,2018-02-12 13:33:01,6,10832.87,666.5,"DL2417,DR1296,DR1377,DR1563,DR1717,DR1798,DR20...","VI108,VI1099,VI114,VI122,VI137,VI157,VI174,VI1...","DL2417,DR1296,DR1377,DR1563,DR1717,DR1798,DR20...","DL2417,酒石酸美托洛尔片,硫酸氢氯吡格雷片,盐酸倍他司汀片,注射用还原型谷胱甘肽,丙戊...","活化部分凝血活酶时间测定(APTT),血细胞分析,血浆纤维蛋白原测定,凝血酶时间测定(TT)..."


In [25]:
idx = pd.IndexSlice

In [192]:
df_patient['treatment_name'] = df_patient['med_name'] + ',' + df_patient['item_name']
df_sample = df_patient.loc[idx[:,['I63']], :]
df_sample.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,person_id,med_clinic_id,hos_lv,sex,age,admission_date,discharge_date,length_of_stay,amount,items_number,med_code,item_code,treatment_code,med_name,item_name,treatment_name
hos_id,disease,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
955225636,I63,10501010,20910734968,2,0,61,2018-02-06 23:00:00,2018-02-12 13:33:01,6,10832.87,666.5,"DL2417,DR1296,DR1377,DR1563,DR1717,DR1798,DR20...","VI108,VI1099,VI114,VI122,VI137,VI157,VI174,VI1...","DL2417,酒石酸美托洛尔片,硫酸氢氯吡格雷片,盐酸倍他司汀片,注射用还原型谷胱甘肽,丙戊...","DL2417,酒石酸美托洛尔片,硫酸氢氯吡格雷片,盐酸倍他司汀片,注射用还原型谷胱甘肽,丙戊...","活化部分凝血活酶时间测定(APTT),血细胞分析,血浆纤维蛋白原测定,凝血酶时间测定(TT)...","DL2417,酒石酸美托洛尔片,硫酸氢氯吡格雷片,盐酸倍他司汀片,注射用还原型谷胱甘肽,丙戊..."
955225898,I63,11000199,20583022603,3,0,78,2017-12-28 10:37:57,2018-01-03 06:34:21,6,3922.53,468.37,"DE,DR2562,DR387,DR726,DR783","VI2111,VI4211,VI4236,VI4254,VI4290,VI4336,VI43...","DE,奥氮平片,碳酸氢钠注射液,非那雄胺片,苯磺酸氨氯地平片,精神科A类量表测查,气压治疗,...","DE,奥氮平片,碳酸氢钠注射液,非那雄胺片,苯磺酸氨氯地平片","精神科A类量表测查,气压治疗,运动疗法,偏瘫肢体综合训练,药棒穴位按摩治疗,中药特殊调配,煎...","DE,奥氮平片,碳酸氢钠注射液,非那雄胺片,苯磺酸氨氯地平片,精神科A类量表测查,气压治疗,..."
955225893,I63,11000230,22480637602,3,0,81,2018-09-02 08:47:51,2018-09-08 00:00:00,6,11608.45,1203.0,"DE,DR130,DR1377,DR1602,DR2134,DR2202,DR2493,DR...","VI108,VI1099,VI1129,VI114,VI1147,VI122,VI157,V...","DE,瑞舒伐他汀钙片,硫酸氢氯吡格雷片,盐酸曲美他嗪片,缬沙坦胶囊,银杏酮酯分散片,塞来昔布...","DE,瑞舒伐他汀钙片,硫酸氢氯吡格雷片,盐酸曲美他嗪片,缬沙坦胶囊,银杏酮酯分散片,塞来昔布...","活化部分凝血活酶时间测定(APTT),血细胞分析,尿沉渣定量,血浆纤维蛋白原测定,隐血试验,...","DE,瑞舒伐他汀钙片,硫酸氢氯吡格雷片,盐酸曲美他嗪片,缬沙坦胶囊,银杏酮酯分散片,塞来昔布..."
955248958,I63,11000260,22980488615,1,0,73,2018-10-31 10:09:22,2018-11-14 08:00:00,14,17805.75,2373.0,"DR1797,DR2375,DR2729,DR36,DR649,DR71596,DR8512...","VI108,VI1099,VI1129,VI114,VI1145,VI122,VI157,V...","盐酸左氧氟沙星滴眼液,丹参酮ⅡA磺酸钠注射液,天麻素注射液,阿司匹林肠溶片,马来酸左旋氨氯地...","盐酸左氧氟沙星滴眼液,丹参酮ⅡA磺酸钠注射液,天麻素注射液,阿司匹林肠溶片,马来酸左旋氨氯地...","活化部分凝血活酶时间测定(APTT),血细胞分析,尿沉渣定量,血浆纤维蛋白原测定,粪便常规,...","盐酸左氧氟沙星滴眼液,丹参酮ⅡA磺酸钠注射液,天麻素注射液,阿司匹林肠溶片,马来酸左旋氨氯地..."
955225898,I63,11000599,22817051117,3,0,66,2018-10-12 08:35:42,2018-10-24 06:58:09,12,14360.92,1252.89,"DE,DL4317,DR1377,DR1699,DR199,DR1996,DR215,DR2...","VI108,VI1099,VI1129,VI1139,VI114,VI1140,VI122,...","DE,DL4317,硫酸氢氯吡格雷片,维生素B6片,丹红注射液,聚乙二醇4000散,阿卡波糖...","DE,DL4317,硫酸氢氯吡格雷片,维生素B6片,丹红注射液,聚乙二醇4000散,阿卡波糖...","活化部分凝血活酶时间测定(APTT),血细胞分析,尿沉渣定量,尿液分析,血浆纤维蛋白原测定,...","DE,DL4317,硫酸氢氯吡格雷片,维生素B6片,丹红注射液,聚乙二醇4000散,阿卡波糖..."


#### cut items

In [193]:
df_sample['treatment_name'] = df_sample['treatment_name'].apply(lambda x:x.replace(',', ' ')).values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [194]:
df_sample['treatment_name'].head()

hos_id     disease
955225636  I63        DL2417 酒石酸美托洛尔片 硫酸氢氯吡格雷片 盐酸倍他司汀片 注射用还原型谷胱甘肽 丙戊...
955225898  I63        DE 奥氮平片 碳酸氢钠注射液 非那雄胺片 苯磺酸氨氯地平片 精神科A类量表测查 气压治疗 ...
955225893  I63        DE 瑞舒伐他汀钙片 硫酸氢氯吡格雷片 盐酸曲美他嗪片 缬沙坦胶囊 银杏酮酯分散片 塞来昔布...
955248958  I63        盐酸左氧氟沙星滴眼液 丹参酮ⅡA磺酸钠注射液 天麻素注射液 阿司匹林肠溶片 马来酸左旋氨氯地...
955225898  I63        DE DL4317 硫酸氢氯吡格雷片 维生素B6片 丹红注射液 聚乙二醇4000散 阿卡波糖...
Name: treatment_name, dtype: object

### 4. topic modeling

In [195]:
# 只从文本中提取1000个最重要的特征关键词，然后停止。
n_features = 1000

#### 关键词提取和向量转换

In [196]:
tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                max_features=n_features,
                                stop_words='english',
                                lowercase=False,
                                max_df = 0.5,
                                min_df = 10)
tf = tf_vectorizer.fit_transform(df_sample['treatment_name'])

In [54]:
print(tf.shape)  # (NO_DOCUMENTS, NO_FEATURES)

(24082, 1000)


#### 把文章粗略划分成5个大类

In [202]:
n_topics = 20
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=50,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)


In [203]:
lda.fit(tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=50.0,
                          max_doc_update_iter=100, max_iter=50,
                          mean_change_tol=0.001, n_components=20, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

####  Build a Latent Semantic Indexing Model

In [204]:
lsi_model = TruncatedSVD(n_components=n_topics)
lsi_Z = lsi_model.fit_transform(tf)
print(lsi_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

(24082, 20)


#### 主题没有一个确定的名称，而是用一系列关键词刻画的。我们定义以下的函数，把每个主题里面的前若干个关键词显示出来：

In [143]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()


In [144]:
n_top_words = 20

In [145]:
tf_feature_names = tf_vectorizer.get_feature_names()

In [146]:
print_top_words(lda, tf_feature_names, n_top_words)


Topic #0:
vi334 vi492 vi494 vi444 vi471 vi333 dr1377 vi4447 vi4462 vi856 vi596 vi487 vi417 vi489 vi308 vi896 vi874 vi4389 vi880 vi4254
Topic #1:
vi1139 vi1140 vi332 vi330 vi4454 vi292 vi397 dr36 vi333 vi760 vi4455 dr262 vi592 vi392 vi2662 dr2595 dr2375 dr2729 vi4475 vi389
Topic #2:
vi342 vi407 vi4447 vi612 vi471 vi494 vi492 vi444 vi4460 vi2669 vi2667 vi4974 vi2752 vi2672 vi487 vi489 vi856 vi645 vi896 vi1757



### visualizing

In [205]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [206]:
data = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [207]:
pyLDAvis.show(data)


Note: if you're in the IPython notebook, pyLDAvis.show() is not the best command
      to use. Consider using pyLDAvis.display(), or pyLDAvis.enable_notebook().
      See more information at http://pyLDAvis.github.io/quickstart.html .

You must interrupt the kernel to end this command

Serving to http://127.0.0.1:8889/    [Ctrl-C to exit]


127.0.0.1 - - [03/Jul/2019 17:56:43] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [03/Jul/2019 17:56:43] "GET /LDAvis.css HTTP/1.1" 200 -
127.0.0.1 - - [03/Jul/2019 17:56:43] "GET /d3.js HTTP/1.1" 200 -
127.0.0.1 - - [03/Jul/2019 17:56:43] "GET /LDAvis.js HTTP/1.1" 200 -



stopping Server...


### 5. find the documents similarity matrix

In [76]:
text = df_sample.iloc[-2:, -1].values

In [77]:
text

array(['DR2285 DR2375 DR2595 DR262 DR36 DR85125 DR885 DS2845 VI1099 VI1129 VI1139 VI1140 VI1145 VI1147 VI1739 VI174 VI175 VI179 VI2664 VI291 VI303 VI304 VI306 VI309 VI310 VI314 VI321 VI330 VI332 VI347 VI349 VI351 VI357 VI360 VI363 VI367 VI386 VI389 VI394 VI398 VI414 VI416 VI420 VI4454 VI4463 VI4494 VI570 VI613 VI635 VI639 VI640',
       'DR1372 DR1377 DR1453 DR1851 DR2285 DR2439 DR2951 DR3311 DR3362 DR685 DR85125 DR85564 VI1099 VI1129 VI1145 VI1147 VI174 VI1740 VI175 VI2653 VI285 VI291 VI294 VI302 VI307 VI308 VI322 VI328 VI329 VI331 VI333 VI334 VI336 VI347 VI349 VI351 VI357 VI360 VI363 VI367 VI374 VI385 VI415 VI417 VI421 VI436 VI4369 VI4389 VI4450 VI4462 VI4464 VI4465 VI5184 VI5390 VI5424 VI5536 VI5569 VI570 VI592 VI612 VI622 VI633 VI635 VI639 VI640 VI645 VI815 VI820 VI852 VI896'],
      dtype=object)

In [68]:
np.array(text).reshape(-1, 1)

array([['DL2417 DR1296 DR1377 DR1563 DR1717 DR1798 DR2004 DR2064 DR2106 DR2562 DR262 DR2815 DR292 DR3099 DR3253 DR3299 DR3300 DR333 DR3342 DR598 DR607 DR729 DR85125 DR85539 DR88628 VI108 VI1099 VI114 VI122 VI137 VI157 VI174 VI175 VI1765 VI179 VI1914 VI2381 VI2385 VI248 VI262 VI2644 VI2657 VI2663 VI2667 VI2669 VI2672 VI2679 VI2689 VI2720 VI2730 VI2732 VI2735 VI2752 VI2755 VI2760 VI2762 VI2770 VI2777 VI2805 VI2823 VI2839 VI2840 VI285 VI2859 VI291 VI292 VI294 VI303 VI304 VI306 VI309 VI310 VI314 VI321 VI328 VI329 VI330 VI331 VI332 VI333 VI334 VI336 VI342 VI343 VI347 VI349 VI357 VI358 VI360 VI363 VI367 VI385 VI394 VI408 VI414 VI415 VI416 VI417 VI420 VI427 VI4389 VI4393 VI444 VI4441 VI4454 VI4455 VI4462 VI4463 VI4464 VI4465 VI452 VI471 VI487 VI4883 VI489 VI492 VI494 VI4974 VI4988 VI5153 VI5429 VI5569 VI570 VI589 VI596 VI612 VI629 VI633 VI635 VI639 VI640 VI645 VI815 VI820 VI852 VI853 VI856 VI866 VI872 VI874 VI880 VI884 VI885 VI99']],
      dtype='<U927')

In [153]:
doc_top_prob = lda.transform(tf)

In [154]:
df_doc_top_prob = pd.DataFrame(index=df_sample.index, data=doc_top_prob)

In [160]:
df_doc_top_prob.std(axis=1).mean()

0.45693440157777304

In [156]:
# topic for one hospital
df_doc_top_prob.loc[idx['955225901',:], :].idxmax(axis=1).value_counts()

0    1105
dtype: int64

In [92]:
# topics for all hospitals
df_doc_top_prob.idxmax(axis=1).value_counts()

3    8900
4    4216
2    4124
0    3924
1    2918
dtype: int64

In [134]:
df_sample.groupby('hos_id').count().iloc[:, 0].reset_index(name='count') \
                             .sort_values(['count'], ascending=False)

Unnamed: 0,hos_id,count
79,955225892,2501
80,955225893,1937
47,2554396377,1631
83,955225901,1105
95,955248958,1036
76,955225874,925
82,955225898,875
40,2063436818,835
59,955225602,829
54,5723622765,822


In [116]:
# get the max of value counts
s_max_value_counts = df_doc_top_prob.idxmax(axis=1).groupby('hos_id').apply(lambda x:x.value_counts().index[0])
s_max_value_counts

hos_id
10597212433    3
10601481112    4
10719115947    1
10722491513    3
10916307828    3
1108597310     0
11756385832    0
1301546966     3
1328140101     0
15318919475    3
16193847389    0
17184816822    3
1762022643     3
17847046864    2
18535093389    0
1930063369     3
2027291605     3
20580083304    3
2062675187     0
2063102484     3
2063105192     4
2063109125     3
2063111963     3
2063126974     3
2063134865     3
2063139045     3
2063152969     0
2063155039     3
2063180308     4
2063182425     3
              ..
955225781      3
955225782      3
955225821      3
955225874      1
955225883      3
955225884      3
955225892      4
955225893      0
955225894      2
955225898      2
955225901      3
955225902      4
955225903      3
955225908      3
955225912      3
955225913      3
955225914      3
955225915      3
955225917      3
955225918      1
955225954      3
955248956      3
955248958      3
955248959      3
955248960      3
955248962      3
955248963      3
9552489

In [125]:
s_max_value_counts.value_counts()
# total hospital number = 135
# total I63 hospital number = 103

3    73
0    10
4     8
2     7
1     5
dtype: int64