# iDENTify Experiment Notebook
###### By: Jason Park

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

### Experiment Settings
> variables to change on your computer

In [2]:
raw_test_data = 'drugsComTest_raw.tsv'
raw_train_data = 'drugsComTrain_raw.tsv'
percent_training = 0.8

### Data Collection
Kaggle Data: <https://www.kaggle.com/datasets/jessicali9530/kuc-hackathon-winter-2018?select=drugsComTrain_raw.csv>  
UCI Data: <https://archive.ics.uci.edu/ml/datasets/Drug+Review+Dataset+%28Drugs.com%29>  

In [3]:
test_data = pd.read_csv(raw_test_data, delimiter='\t')
train_data = pd.read_csv(raw_train_data, delimiter='\t')

### Feature Extraction
##### are there any reviews that mention the word "dentist"

In [4]:
test_data_reviews = test_data['review']
drugName = test_data['drugName']
containsDentist = []
r = 0
for i in test_data_reviews:
    if "root canal" in i:
        containsDentist.append([drugName[r], i])
    r += 1
containsDentist

[['Keflex',
  '"I was prescribed Keflex for an infection that occurred after root canal went bad. Evidently the previous dentist had not been able to successfully remove all of the nerve and pulp and over time an infection resulted.\r\n\r\nKeflex was well tolerated by me and helped to reduce the swelling within the gum until I was able to have a root canal retreatment in order to remove the rest of the nerve and pulp.\r\n\r\nThe rating that I provided is due to the fact that this medication required 4 doses daily which I felt there must have been some other medication available that required less commitment. Still, it did work and I would take it again if prescribed."'],
 ['Acetaminophen / oxycodone',
  '"I have an infected tooth with a dead nerve that needed a root canal. I got half the root canal done and need to wait for the infection to clear up. I&#039;m taking major antibiotics but it&#039;s still taking a while and I&#039;m in a lot of pain. The percocet has helped immensely. I 

### Drug Names to Numbers

In [5]:
test_drugs = [drug for drug in test_data['drugName']]
train_drugs = [drug for drug in train_data['drugName']]
all_drugs = test_drugs + train_drugs
drug_dict = {}
i = 0
for drug in set(all_drugs):
    drug_dict[drug] = i
    i += 1

### Columns to create

In [6]:
ideal_columns = 'Person Drug Name Rating Age Gender Ethnicity dental procedure Will adverse drug reaction occur Details'.split('\t')
ideal_columns

['Person Drug Name Rating Age Gender Ethnicity dental procedure Will adverse drug reaction occur Details']

In [7]:
ideal_columns = 'Person Drug Name Rating Age Gender Ethnicity dental procedure Will adverse drug reaction occur Details'.split('\t')
ideal_columns
def create_identify_data_set(row):
    if 'bleeding' in row['review']:
        occur = 1
    else:
        occur = 0
    return {
        'Drug Name': drug_dict[row['drugName']],
        'Rating': row['rating'],
        'Will adverse drug reaction occur': occur
    }

In [8]:
arr = []
for index, row in train_data.iterrows():
    arr.append(create_identify_data_set(row))

identify_train = pd.DataFrame(arr)
identify_train

Unnamed: 0,Drug Name,Rating,Will adverse drug reaction occur
0,2980,9.0,0
1,3087,8.0,0
2,1587,5.0,0
3,2480,8.0,0
4,1353,9.0,0
...,...,...,...
161292,65,10.0,0
161293,548,1.0,0
161294,2216,2.0,0
161295,49,10.0,0


In [9]:
for index, row in test_data.iterrows():
    arr.append(create_identify_data_set(row))

identify_test = pd.DataFrame(arr)
identify_test

Unnamed: 0,Drug Name,Rating,Will adverse drug reaction occur
0,2980,9.0,0
1,3087,8.0,0
2,1587,5.0,0
3,2480,8.0,0
4,1353,9.0,0
...,...,...,...
215058,1412,10.0,0
215059,1028,9.0,0
215060,2029,8.0,0
215061,2429,1.0,0


### Resampeling the DataSet (randomly choose question to ask AI)

In [10]:
#merge our dataset into one
identify_all = pd.concat([identify_test, identify_train], axis = 0)
identify_all

Unnamed: 0,Drug Name,Rating,Will adverse drug reaction occur
0,2980,9.0,0
1,3087,8.0,0
2,1587,5.0,0
3,2480,8.0,0
4,1353,9.0,0
...,...,...,...
161292,65,10.0,0
161293,548,1.0,0
161294,2216,2.0,0
161295,49,10.0,0


In [11]:
questions = identify_all.loc[:, identify_all.columns != 'Will adverse drug reaction occur']
questions

Unnamed: 0,Drug Name,Rating
0,2980,9.0
1,3087,8.0
2,1587,5.0
3,2480,8.0
4,1353,9.0
...,...,...
161292,65,10.0
161293,548,1.0
161294,2216,2.0
161295,49,10.0


In [12]:
answers = identify_all.loc[:, identify_all.columns == 'Will adverse drug reaction occur']
answers

Unnamed: 0,Will adverse drug reaction occur
0,0
1,0
2,0
3,0
4,0
...,...
161292,0
161293,0
161294,0
161295,0


In [13]:
training_questions,test,training_answers,answer_sheet = train_test_split(questions, answers, train_size=percent_training)
training_questions,test,training_answers,answer_sheet

(        Drug Name  Rating
 15535        2934    10.0
 55113        1491     8.0
 7200         1491    10.0
 24267        1544     9.0
 105397       1491     8.0
 ...           ...     ...
 47222        1713     1.0
 113016       3490     6.0
 32871        2762    10.0
 53786        2053    10.0
 81954        1855    10.0
 
 [301088 rows x 2 columns],
         Drug Name  Rating
 180584       3045     9.0
 25052        2894     7.0
 1647         2298     7.0
 56869        2836    10.0
 52548        1341     6.0
 ...           ...     ...
 13295        1028    10.0
 172514        186     1.0
 184475       2657     7.0
 157975        802     9.0
 151926        867     5.0
 
 [75272 rows x 2 columns],
         Will adverse drug reaction occur
 15535                                  0
 55113                                  0
 7200                                   0
 24267                                  0
 105397                                 0
 ...                                  ...

### Teach our AI

In [14]:
#the methodology we are using to teach our AI
classifier = RandomForestClassifier()

#apply that methodology to teach our AI with the training questions and answers
ai = classifier.fit(training_questions, training_answers.values.ravel())
ai

RandomForestClassifier()

### Testing our AI

In [15]:
test_results = ai.predict(test)
test_results

array([0, 0, 0, ..., 0, 0, 0])

In [16]:
answer_sheet = [a for a in answer_sheet['Will adverse drug reaction occur']]
answer_sheet

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [17]:
correct = 0
wrong = 0
i = 0
for test_result in test_results:
    if test_result == answer_sheet[i]:
        correct += 1
    else:
        wrong += 1
    i += 1

{"correct": correct, "wrong": wrong}

{'correct': 71574, 'wrong': 3698}

## Learning From

list of drugs
dr. pak: 