In [1]:
import numpy as np
import nltk
import gensim
import re
import logging
import csv
import sklearn.ensemble
import sklearn.metrics
import sklearn.model_selection

In [33]:
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ville\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
stopwords = nltk.corpus.stopwords.words('spanish')

In [4]:
def normalizer(text):
    text = text.lower()
    text = re.sub(r'[^A-Za-zñáéíóú]', ' ', text)
    text = re.sub('á', 'a', text)
    text = re.sub('é', 'e', text)
    text = re.sub('í', 'i', text)
    text = re.sub('ó', 'o', text)
    text = re.sub('ú', 'u', text)
    return text

In [5]:
def vectorizer(text, model):
    vectors = []
    for i in text:
        try:
            vectors.append(model.wv[i])
        except:
            pass
    return(np.mean(vectors,axis=0))

In [6]:
sentences = []
with open('corpus.txt') as file:
    for line in file:
        sentences.append((line.rstrip()))

In [7]:
normalized_sentences = [normalizer(sentence) for sentence in sentences]

In [8]:
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in normalized_sentences]

In [9]:
without_stopwords_sentences = []
for sentence in tokenized_sentences:
    without_stopwords_sentence = [word for word in sentence if word not in stopwords]
    without_stopwords_sentences.append(without_stopwords_sentence)

In [10]:
model = gensim.models.Word2Vec(without_stopwords_sentences)

2019-06-02 23:35:18,330 : INFO : collecting all words and their counts
2019-06-02 23:35:18,331 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-06-02 23:35:18,336 : INFO : PROGRESS: at sentence #10000, processed 26393 words, keeping 2174 word types
2019-06-02 23:35:18,341 : INFO : PROGRESS: at sentence #20000, processed 53049 words, keeping 2795 word types
2019-06-02 23:35:18,346 : INFO : PROGRESS: at sentence #30000, processed 80100 words, keeping 3189 word types
2019-06-02 23:35:18,351 : INFO : PROGRESS: at sentence #40000, processed 107473 words, keeping 3528 word types
2019-06-02 23:35:18,356 : INFO : PROGRESS: at sentence #50000, processed 134970 words, keeping 3783 word types
2019-06-02 23:35:18,361 : INFO : PROGRESS: at sentence #60000, processed 162957 words, keeping 3986 word types
2019-06-02 23:35:18,366 : INFO : PROGRESS: at sentence #70000, processed 191225 words, keeping 4140 word types
2019-06-02 23:35:18,371 : INFO : PROGRESS: at sentence #

In [11]:
model.most_similar('cancer')

  """Entry point for launching an IPython kernel.
2019-06-02 23:35:24,679 : INFO : precomputing L2-norms of word weight vectors


[('avanzado', 0.884432315826416),
 ('fallecido', 0.8637561202049255),
 ('adenocarcinoma', 0.862040638923645),
 ('metastasis', 0.8620170950889587),
 ('adenoma', 0.8476477861404419),
 ('infiltrante', 0.8386375904083252),
 ('cervix', 0.8291419744491577),
 ('indeterminado', 0.8254727721214294),
 ('significado', 0.8203879594802856),
 ('fundus', 0.8197342157363892)]

In [12]:
diagnostics = []
specialties = []
with open('data.csv', encoding='utf-8') as file:
    data = csv.DictReader(file)
    for row in data:
        diagnostics.append(row['diagnostic'])
        specialties.append(row['specialty'])

In [13]:
diagnostics_normalized = [normalizer(diagnostic) for diagnostic in diagnostics]

In [14]:
diagnostics_tokenized = [nltk.word_tokenize(diagnostic) for diagnostic in diagnostics_normalized]

In [15]:
diagnostics_wihout_stopwords = []
for diagnostic in diagnostics_tokenized:
    diagnostic_wihout_stopwords = [word for word in diagnostic if word not in stopwords]
    diagnostics_wihout_stopwords.append(diagnostic_wihout_stopwords)

In [17]:
diagnostics_matrix = np.zeros((len(diagnostics_wihout_stopwords), len(model['cancer'])))

  """Entry point for launching an IPython kernel.


In [18]:
for i,diagnostic in enumerate(diagnostics_wihout_stopwords):
    vector = vectorizer(diagnostic,model)
    diagnostics_matrix[i,] = vector

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [19]:
specialties_vector = np.zeros((len(specialties), 1))

In [20]:
for i,specialty in enumerate(specialties):
    if specialty == 'OFTALMOLOGIA':
        specialties_vector[i,] = 1
    else:
        specialties_vector[i,] = 2

In [21]:
data_matrix = np.concatenate([diagnostics_matrix,specialties_vector], axis=1)

In [22]:
data_matrix_without_nan = data_matrix[~np.isnan(data_matrix).any(axis=1)]

In [23]:
diagnostics_train, diagnostics_test, specialties_train, specialties_test = sklearn.model_selection.train_test_split(
    data_matrix_without_nan[:,:100],
    data_matrix_without_nan[:,100],
    test_size=0.33,
    random_state=42
)

In [24]:
classifier = sklearn.ensemble.RandomForestClassifier()

In [25]:
classifier.fit(diagnostics_train,specialties_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [26]:
predictions = classifier.predict(diagnostics_test)
print(sklearn.metrics.classification_report(predictions, specialties_test))

              precision    recall  f1-score   support

         1.0       0.78      0.98      0.87      5621
         2.0       0.98      0.82      0.89      8534

   micro avg       0.88      0.88      0.88     14155
   macro avg       0.88      0.90      0.88     14155
weighted avg       0.90      0.88      0.88     14155



In [30]:
def specialtyClassifier(diagnostic):
    try:
        stringNorm = normalizer(diagnostic)
        stringTokenized = nltk.word_tokenize(stringNorm)
        stringVec = vectorizer(stringTokenized,model)
        result = classifier.predict(stringVec.reshape(1, -1))[0]
        return(result)
    except:
        return(np.nan)

In [32]:
specialtyClassifier('vicio de refracción')

1.0