<a href="https://colab.research.google.com/github/hardeybisey/neural-network/blob/main/dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# One-time setup, intentionally left commented out: install the two libraries
# imported by this notebook, then download and unzip drugsCom_raw.zip
# (presumably the archive that contains the .tsv files read later - confirm).
# !pip install datasets
# !pip install transformers
# !wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
# !unzip drugsCom_raw.zip

In [69]:
from datasets import load_dataset
from transformers import AutoTokenizer
import html

# Reading data with load_dataset

In [None]:
# Map each split name to its tab-separated source file, then parse both files
# with the generic "csv" loader using a tab delimiter.
data_files = {
    "train": "drugsComTrain_raw.tsv",
    "test": "drugsComTest_raw.tsv",
}
drug_data = load_dataset("csv", delimiter="\t", data_files=data_files)

In [57]:
# Show the loaded DatasetDict: its splits, column names and row counts.
drug_data

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [58]:
# Sanity check: in every split the 'Unnamed: 0' column must hold exactly one
# distinct value per row, so it can be used as a row identifier.
for split, split_rows in drug_data.items():
  distinct_ids = split_rows.unique('Unnamed: 0')
  assert len(distinct_ids) == len(split_rows)

# Applying transformations to the dataset

In [65]:
# Clean both splits step by step; each step rebinds drug_data to the result.

# Rename the identifier column. The guard keeps this cell re-runnable: once
# drug_data has been rebound, 'Unnamed: 0' no longer exists and an
# unconditional rename would raise.
if 'Unnamed: 0' in drug_data['train'].column_names:
    drug_data = drug_data.rename_columns({'Unnamed: 0': 'patient_id'})
drug_data = drug_data.filter(lambda x: x['condition'] is not None)  # drop rows with no condition
drug_data = drug_data.map(lambda x: {"condition": x['condition'].lower()})  # lower-case every condition
drug_data = drug_data.map(lambda x: {"review_length": len(x['review'].split())})  # whitespace-separated word count
drug_data = drug_data.filter(lambda x: x['review_length'] > 30)  # keep only reviews longer than 30 words
drug_data = drug_data.map(lambda x: {"review": [html.unescape(o) for o in x["review"]]}, batched=True)  # decode HTML entities in the reviews

Map:   0%|          | 0/160398 [00:00<?, ? examples/s]

Map:   0%|          | 0/53471 [00:00<?, ? examples/s]

In [96]:
# Draw a reproducible 1000-row sample from the shuffled train split and view
# its first three rows in pandas format.
shuffled_train = drug_data['train'].shuffle(seed=42)
drug_sample = shuffled_train.select(range(1000))
drug_sample.set_format("pandas")
drug_sample[:3]



Unnamed: 0,patient_id,drugName,condition,review,rating,date,usefulCount,review_length
0,191114,Campral,alcohol dependence,"""Sober a year 8-25-11. God, AA and Campral hav...",10.0,"September 3, 2011",33,41
1,142693,Levonorgestrel,birth control,"""I've been on birth control for a while now du...",4.0,"August 9, 2017",3,140
2,71561,Vraylar,bipolar disorde,"""Hi, this is an updated experience. \r\r\n\r\r...",8.0,"August 16, 2016",12,131


In [74]:
# Inspect drug_data again to see the renamed/added columns and the row counts
# left after filtering.
drug_data

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 138514
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})

In [None]:
# Load the pretrained tokenizer for the "bert-base-cased" checkpoint.
# tokenize_split reads this module-level name.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [159]:
def tokenize_split(examples):
    """Tokenize a batch of reviews into chunks of at most 128 tokens.

    Uses the module-level ``tokenizer``. Because overflowing tokens are
    returned as extra rows, one review can yield several output rows; every
    original column is therefore repeated so that each chunk carries the
    values of the review it came from.

    Args:
        examples: A batch mapping column names to lists of values; must
            contain a "review" column.

    Returns:
        The tokenizer output, without "overflow_to_sample_mapping", plus one
        entry per input column aligned row-for-row with the chunks.
    """
    encoded = tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )
    # Position i holds the index of the input row that produced chunk i.
    source_rows = encoded.pop("overflow_to_sample_mapping")
    for column, column_values in examples.items():
        encoded[column] = [column_values[row] for row in source_rows]
    return encoded

In [169]:
# Tokenize every split in batches. tokenize_split asks the tokenizer for
# overflowing tokens, so the result may contain more rows than drug_data.
tokenized_text = drug_data.map(tokenize_split, batched=True)



In [171]:
# Switch drug_data's output format to pandas. This changes drug_data in place
# (no reassignment), so every later access is affected until the format is reset.
drug_data.set_format("pandas")
# Peek at the first three rows of the train split.
drug_data['train'][:3]

Unnamed: 0,patient_id,drugName,condition,review,rating,date,usefulCount,review_length
0,95260,Guanfacine,adhd,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192,141
1,92703,Lybrel,birth control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17,134
2,138000,Ortho Evra,birth control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10,89


In [172]:
# Pull the entire train split out as a single object; with the "pandas" format
# active this is expected to be a DataFrame (the next cell calls value_counts on it).
train_df = drug_data['train'][:]

In [178]:
# Count reviews per condition, most frequent first.
# The index and the counts are named explicitly instead of renaming whatever
# reset_index() produces: value_counts() labels its result differently on
# pandas 1.x and 2.x, and this form gives ['condition', 'frequency'] on both.
freq = (
    train_df["condition"]
    .value_counts()
    .rename_axis("condition")
    .reset_index(name="frequency")
)
freq.head()

Unnamed: 0,condition,frequency
0,birth control,27655
1,depression,8023
2,acne,5209
3,anxiety,4991
4,pain,4744


# Creating a Dataset object from a pandas DataFrame

In [200]:
# Dataset.from_pandas converts the frequency DataFrame into a Dataset.
# load_from_disk is imported here for the save/load cell further down.
# NOTE(review): these imports would be easier to find in the notebook's import cell.
from datasets import Dataset , load_from_disk
freq_dataset = Dataset.from_pandas(freq)
freq_dataset

Dataset({
    features: ['condition', 'frequency'],
    num_rows: 819
})

# Preparing data for a machine learning algorithm

In [188]:
# Undo the earlier set_format("pandas") so drug_data returns its default
# output format again.
drug_data.reset_format()

In [None]:
# Carve a validation set out of the original train split (80% train / 20%
# validation, fixed seed), then attach the untouched test split.
drug_data_clean = drug_data["train"].train_test_split(seed=42, train_size=0.8)
# train_test_split names its held-out part "test"; re-register it as "validation".
held_out = drug_data_clean.pop("test")
drug_data_clean["validation"] = held_out
drug_data_clean["test"] = drug_data["test"]

In [194]:
# Show the final train / validation / test splits and their row counts.
drug_data_clean

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 110811
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 27703
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})

# Saving to and loading from disk

In [210]:
# Arrow format: write the whole DatasetDict to a directory and read it back.
drug_data_clean.save_to_disk("drug-review-arrow")
drug_data_from_disk = load_from_disk("drug-review-arrow")


# Export every split twice, once as JSON Lines and once as CSV.
for split, data in drug_data_clean.items():
    jsonl_path = f"drug-review{split}.jsonl"
    csv_path = f"drug-review{split}.csv"
    data.to_json(jsonl_path)
    data.to_csv(csv_path)

# Reload the CSV exports. The validation file is registered under the key
# "val", so the reloaded DatasetDict uses "val" where drug_data_clean uses
# "validation".
data_files = {
    "train": "drug-reviewtrain.csv",
    "test": "drug-reviewtest.csv",
    "val": "drug-reviewvalidation.csv",
}
drug_data_from_disk2 = load_dataset("csv", data_files=data_files)

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-683b85296e720d87/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-683b85296e720d87/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [211]:
# Show the DatasetDict rebuilt from the CSV exports (splits: train, test, val).
drug_data_from_disk2

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 110811
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
    val: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 27703
    })
})

In [206]:
# Peek at the first lines of the JSON Lines export of the test split.
!head -n 5 drug-reviewtest.jsonl

{"patient_id":163740,"drugName":"Mirtazapine","condition":"depression","review":"\"I've tried a few antidepressants over the years (citalopram, fluoxetine, amitriptyline), but none of those helped with my depression, insomnia & anxiety. My doctor suggested and changed me onto 45mg mirtazapine and this medicine has saved my life. Thankfully I have had no side effects especially the most common - weight gain, I've actually lost alot of weight. I still have suicidal thoughts but mirtazapine has saved me.\"","rating":10.0,"date":"February 28, 2012","usefulCount":22,"review_length":68}
{"patient_id":206473,"drugName":"Mesalamine","condition":"crohn's disease, maintenance","review":"\"My son has Crohn's disease and has done very well on the Asacol.  He has no complaints and shows no side effects.  He has taken as many as nine tablets per day at one time.  I've been very happy with the results, reducing his bouts of diarrhea drastically.\"","rating":8.0,"date":"May 17, 2009","usefulCount":17,

In [207]:
# Peek at the first lines of the CSV export of the test split.
!head -n 5 drug-reviewtest.csv

patient_id,drugName,condition,review,rating,date,usefulCount,review_length
163740,Mirtazapine,depression,"""I've tried a few antidepressants over the years (citalopram, fluoxetine, amitriptyline), but none of those helped with my depression, insomnia & anxiety. My doctor suggested and changed me onto 45mg mirtazapine and this medicine has saved my life. Thankfully I have had no side effects especially the most common - weight gain, I've actually lost alot of weight. I still have suicidal thoughts but mirtazapine has saved me.""",10.0,"February 28, 2012",22,68
206473,Mesalamine,"crohn's disease, maintenance","""My son has Crohn's disease and has done very well on the Asacol.  He has no complaints and shows no side effects.  He has taken as many as nine tablets per day at one time.  I've been very happy with the results, reducing his bouts of diarrhea drastically.""",8.0,"May 17, 2009",17,48
39293,Contrave,weight loss,"""Contrave combines drugs that were used for alcohol, smoking, and op