In [1]:
import pandas as pd
from nltk.corpus import stopwords
import re
import requests
from bs4 import BeautifulSoup

import warnings
warnings.filterwarnings('ignore')

In [2]:
concept_df = pd.read_csv('концепты.tsv', sep='\t')
concept_df.head()

Unnamed: 0,CUI,CONCEPT,SNOMED_CODE
0,C0000765,"Excessive body weight gain,Excessive weight gain",224994002
1,C0701811,"Poor short-term memory,Poor short-term memory",247592009
2,C0002170,"Alopecia,Loss of hair",278040002
3,C0549622,"Sexual Dysfunction,Sexual disorder",231532002
4,C0027497,"Nausea,Nausea",422587007


In [3]:
concept_df['CONCEPT'] = concept_df['CONCEPT'].str.lower()
concept_df['CONCEPT'] = concept_df['CONCEPT'].str.split(',')
concept_df['CONCEPT'] = concept_df['CONCEPT'].apply(lambda x: [item for item in x if str(item) not in stopwords.words('english')])
concept_df['CONCEPT'] = concept_df['CONCEPT'].apply(lambda x: set(x))
concept_df['CUI'] = concept_df['CUI'].apply(lambda x: re.sub(' ', '', x))

concept_df.head()

Unnamed: 0,CUI,CONCEPT,SNOMED_CODE
0,C0000765,"{excessive weight gain, excessive body weight ...",224994002
1,C0701811,{poor short-term memory},247592009
2,C0002170,"{alopecia, loss of hair}",278040002
3,C0549622,"{sexual dysfunction, sexual disorder}",231532002
4,C0027497,{nausea},422587007


In [5]:
def description(cui):
    try:
        pre_url = 'http://sideeffects.embl.de/se/'
        url = pre_url + cui

        resp = requests.get(url)
        soup = BeautifulSoup(resp.text,'html.parser')

        res = soup.find("div",{"class":"boxDiv"}).get_text()
        res = re.findall(r':(.+?)\n', res)

        res = set(res[0].split())
        res = [item for item in res if item not in stopwords.words('english')]

        return res
    
    except AttributeError:
        res_1 = concept_df['CONCEPT'][concept_df.loc[concept_df['CUI'] == cui].index[0]]
        return res_1

In [6]:
concept_df['DESCRIPTION'] = concept_df['CUI'].apply(lambda x: description(x))

In [7]:
concept_df.head(10)

Unnamed: 0,CUI,CONCEPT,SNOMED_CODE,DESCRIPTION
0,C0000765,"{excessive weight gain, excessive body weight ...",224994002,"{excessive weight gain, excessive body weight ..."
1,C0701811,{poor short-term memory},247592009,"[loss,, short, short-term, memory, short-, los..."
2,C0002170,"{alopecia, loss of hair}",278040002,"[bald, autoimmune, disorders, result, entire, ..."
3,C0549622,"{sexual dysfunction, sexual disorder}",231532002,"[function,, decrease, Sexual, dysfunc-, sexual..."
4,C0027497,{nausea},422587007,"[various, stomach, unpleasant, sea, usually, e..."
5,C0344232,"{blurred vision, blurring of visual image}",111516008,"[Encyclopedia), Blurred, resulting, loss, visi..."
6,C0037316,{sleep deprivation},130989002,{sleep deprivation}
7,C0424000,{feeling suicidal},225457007,"[A, minority, suicide, engage, factor, common,..."
8,C0424565,{cannot sleep at all},248255005,{cannot sleep at all}
9,C1971624,{loss of appetite},79890006,"[appetite,, lost, appetite, loss, lossof, abse..."


In [8]:
col_list = ['CUI', 'DESCRIPTION']
dictionary_df = concept_df[col_list]

dictionary_df.head()

Unnamed: 0,CUI,DESCRIPTION
0,C0000765,"{excessive weight gain, excessive body weight ..."
1,C0701811,"[loss,, short, short-term, memory, short-, los..."
2,C0002170,"[bald, autoimmune, disorders, result, entire, ..."
3,C0549622,"[function,, decrease, Sexual, dysfunc-, sexual..."
4,C0027497,"[various, stomach, unpleasant, sea, usually, e..."


In [9]:
dictionary_df['DESCRIPTION'] = dictionary_df['DESCRIPTION'].apply(lambda x: ','.join(x))
dictionary_df.head()

Unnamed: 0,CUI,DESCRIPTION
0,C0000765,"excessive weight gain,excessive body weight gain"
1,C0701811,"loss,,short,short-term,memory,short-,loss,term"
2,C0002170,"bald,autoimmune,disorders,result,entire,pullin..."
3,C0549622,"function,,decrease,Sexual,dysfunc-,sexual,decr..."
4,C0027497,"various,stomach,unpleasant,sea,usually,early,s..."


In [10]:
dictionary_df.to_csv('dictionary.csv', sep='\t', index=False)

In [11]:
diction = {}

with open("dictionary.txt") as f:
    for line in f:
        (key, val) = line.split('\t')
        val = re.sub('\n', '', val)
        diction[key] = val.split(',')
        
# diction

In [12]:
complaints = pd.read_csv('побочные_эффекты.txt', delimiter='\t')
complaints.head()

Unnamed: 0,id,Text
0,1,"extreme weight gain, short-term memory loss, h..."
1,2,COMPLETELY DESTROYED SEXUALLY FUNCTIONING .
2,3,Just TWO tablets of Lexapro 10mg completely de...
3,4,It's called PSSD: post-SSRI sexual dysfunction.
4,5,And there is a chance that it will give you PS...


In [13]:
complaints['Text'] = complaints['Text'].str.lower()
complaints['Text'] = complaints['Text'].str.split(',')
complaints['Text'] = complaints['Text'].apply(lambda x: [item for item in x if str(item) not in stopwords.words('english')])
complaints['Text'] = complaints['Text'].apply(lambda x: ''.join(x))

complaints.head()

Unnamed: 0,id,Text
0,1,"[extreme weight gain, short-term memory loss,..."
1,2,[completely destroyed sexually functioning .]
2,3,[just two tablets of lexapro 10mg completely d...
3,4,[it's called pssd: post-ssri sexual dysfunction.]
4,5,[and there is a chance that it will give you p...


In [14]:
def mapping(text, values):

    res = []

    for word in text:
        for k in values:
            for v in values[k]:
                if word in v:
                    res.append(k)
                    
    res = set(res)

    return ', '.join(res)

In [16]:
complaints['Text'] = complaints['Text'].apply(lambda x: mapping(x, diction))
complaints.head()

Unnamed: 0,id,Text
0,1,"C0027497, C0701812, C0149931, C1456784, C03147..."
1,2,"C0027497, C0701812, C0149931, C1456784, C03147..."
2,3,"C0027497, C0701812, C1456784, C0149931, C03147..."
3,4,"C0027497, C0701812, C0149931, C1456784, C03147..."
4,5,"C0027497, C0701812, C0149931, C1456784, C03147..."


In [17]:
complaints.to_csv('tychina_3.csv', sep='\t', index=False)