# Import Libraries

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

import os
from os import path
import pickle

import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import string

# Data Pre-Processing

## Drugs.com Dataset

In [2]:
drugs_com_test = pd.read_csv('./data/drugsComTest_raw.tsv', delimiter='\t', encoding = 'utf-8')
drugs_com_train = pd.read_csv('./data/drugsComTrain_raw.tsv', delimiter='\t', encoding = 'utf-8')

In [3]:
print(drugs_com_test.info())
drugs_com_test.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53766 entries, 0 to 53765
Data columns (total 7 columns):
Unnamed: 0     53766 non-null int64
drugName       53766 non-null object
condition      53471 non-null object
review         53766 non-null object
rating         53766 non-null float64
date           53766 non-null object
usefulCount    53766 non-null int64
dtypes: float64(1), int64(2), object(4)
memory usage: 2.9+ MB
None


Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,163740,Mirtazapine,Depression,"""I&#039;ve tried a few antidepressants over th...",10.0,"February 28, 2012",22
1,206473,Mesalamine,"Crohn's Disease, Maintenance","""My son has Crohn&#039;s disease and has done ...",8.0,"May 17, 2009",17
2,159672,Bactrim,Urinary Tract Infection,"""Quick reduction of symptoms""",9.0,"September 29, 2017",3
3,39293,Contrave,Weight Loss,"""Contrave combines drugs that were used for al...",9.0,"March 5, 2017",35
4,97768,Cyclafem 1 / 35,Birth Control,"""I have been on this birth control for one cyc...",9.0,"October 22, 2015",4


In [4]:
print(drugs_com_train.info())
drugs_com_train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161297 entries, 0 to 161296
Data columns (total 7 columns):
Unnamed: 0     161297 non-null int64
drugName       161297 non-null object
condition      160398 non-null object
review         161297 non-null object
rating         161297 non-null float64
date           161297 non-null object
usefulCount    161297 non-null int64
dtypes: float64(1), int64(2), object(4)
memory usage: 8.6+ MB
None


Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37


For both the testing and training sets of the
drugs.com data, the only columns with NaN values is the condition column. The number of NaNs in both training and test sets are less than 1% of the total data so I simply drop those rows from the corpus.

In [5]:
drugs_com_test = drugs_com_test.dropna()
drugs_com_train = drugs_com_train.dropna()

The downloaded data included html encoding for a couple of characters such as the ', ", >, <, &. Convert these back into the proper form using a string replacement. Another method to try to convert all of the html strings to actual text would be to use the Python html parser function.

In [6]:
drugs_com_test.replace(regex=r'&#039;', value='\'', inplace=True)
drugs_com_test.replace(regex=r'&quot;', value='"', inplace=True)
drugs_com_test.replace(regex=r'&lt;', value='<', inplace=True)
drugs_com_test.replace(regex=r'&gt;', value='>', inplace=True)
drugs_com_test.replace(regex=r'&amp;', value='&', inplace=True)

drugs_com_train.replace(regex=r'&#039;', value='\'', inplace=True)
drugs_com_train.replace(regex=r'&quot;', value='"', inplace=True)
drugs_com_train.replace(regex=r'&lt;', value='<', inplace=True)
drugs_com_train.replace(regex=r'&gt;', value='>', inplace=True)
drugs_com_train.replace(regex=r'&amp;', value='&', inplace=True)

There are a number of conditions that are not correctly identifies and have the text "... users found this comment helpful." These are not complete entries and therefore will not be helpful during this analysis. Again, they do not make up a significant portion of the dataset so I will just remove those rows completely.

In [7]:
df_com_test = drugs_com_test[~drugs_com_test['condition'].str.contains('users found this comment helpful')]
df_com_train = drugs_com_train[~drugs_com_train['condition'].str.contains('users found this comment helpful')]

Convert the date column to a datetime type so it can be aggregated based on month, year, etc.

In [8]:
df_com_train['date'] = pd.to_datetime(df_com_train['date'])
df_com_test['date'] = pd.to_datetime(df_com_test['date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


## Drugslib.com Dataset

In [9]:
drugs_lib_test = pd.read_csv('./data/drugLibTest_raw.tsv', delimiter='\t', encoding = 'utf-8')
drugs_lib_train = pd.read_csv('./data/drugLibTrain_raw.tsv', delimiter='\t', encoding = 'utf-8')

In [10]:
print(drugs_lib_train.info())
drugs_lib_train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3107 entries, 0 to 3106
Data columns (total 9 columns):
Unnamed: 0           3107 non-null int64
urlDrugName          3107 non-null object
rating               3107 non-null int64
effectiveness        3107 non-null object
sideEffects          3107 non-null object
condition            3106 non-null object
benefitsReview       3107 non-null object
sideEffectsReview    3105 non-null object
commentsReview       3099 non-null object
dtypes: int64(2), object(7)
memory usage: 218.5+ KB
None


Unnamed: 0.1,Unnamed: 0,urlDrugName,rating,effectiveness,sideEffects,condition,benefitsReview,sideEffectsReview,commentsReview
0,2202,enalapril,4,Highly Effective,Mild Side Effects,management of congestive heart failure,slowed the progression of left ventricular dys...,"cough, hypotension , proteinuria, impotence , ...","monitor blood pressure , weight and asses for ..."
1,3117,ortho-tri-cyclen,1,Highly Effective,Severe Side Effects,birth prevention,Although this type of birth control has more c...,"Heavy Cycle, Cramps, Hot Flashes, Fatigue, Lon...","I Hate This Birth Control, I Would Not Suggest..."
2,1146,ponstel,10,Highly Effective,No Side Effects,menstrual cramps,I was used to having cramps so badly that they...,Heavier bleeding and clotting than normal.,I took 2 pills at the onset of my menstrual cr...
3,3947,prilosec,3,Marginally Effective,Mild Side Effects,acid reflux,The acid reflux went away for a few months aft...,"Constipation, dry mouth and some mild dizzines...",I was given Prilosec prescription at a dose of...
4,1951,lyrica,2,Marginally Effective,Severe Side Effects,fibromyalgia,I think that the Lyrica was starting to help w...,I felt extremely drugged and dopey. Could not...,See above


In [11]:
print(drugs_lib_test.info())
drugs_lib_test.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1036 entries, 0 to 1035
Data columns (total 9 columns):
Unnamed: 0           1036 non-null int64
urlDrugName          1036 non-null object
rating               1036 non-null int64
effectiveness        1036 non-null object
sideEffects          1036 non-null object
condition            1036 non-null object
benefitsReview       1036 non-null object
sideEffectsReview    1036 non-null object
commentsReview       1036 non-null object
dtypes: int64(2), object(7)
memory usage: 72.9+ KB
None


Unnamed: 0.1,Unnamed: 0,urlDrugName,rating,effectiveness,sideEffects,condition,benefitsReview,sideEffectsReview,commentsReview
0,1366,biaxin,9,Considerably Effective,Mild Side Effects,sinus infection,The antibiotic may have destroyed bacteria cau...,"Some back pain, some nauseau.",Took the antibiotics for 14 days. Sinus infect...
1,3724,lamictal,9,Highly Effective,Mild Side Effects,bipolar disorder,Lamictal stabilized my serious mood swings. On...,"Drowsiness, a bit of mental numbness. If you t...",Severe mood swings between hypomania and depre...
2,3824,depakene,4,Moderately Effective,Severe Side Effects,bipolar disorder,Initial benefits were comparable to the brand ...,"Depakene has a very thin coating, which caused...",Depakote was prescribed to me by a Kaiser psyc...
3,969,sarafem,10,Highly Effective,No Side Effects,bi-polar / anxiety,It controlls my mood swings. It helps me think...,I didnt really notice any side effects.,This drug may not be for everyone but its wond...
4,696,accutane,10,Highly Effective,Mild Side Effects,nodular acne,Within one week of treatment superficial acne ...,Side effects included moderate to severe dry s...,Drug was taken in gelatin tablet at 0.5 mg per...


Only the training set for the Drugs Lib data has NaN values. Since there were only a six rows, I chose to delete those completely from the dataset as minimal informaiton would be lost.

In [12]:
drugs_lib_train = drugs_lib_train.dropna()

## Clean Text

The NLP package used to clean and tokenize the text is [spaCy](https://spacy.io/models/en#en_core_web_lg). spaCy assigns word vectors, creates token vectors, part of speech tags, and extracts entities, among a large list of other textual processing tasks. The large package contains 685k unique vectors. 

In [13]:
nlp = spacy.load('en_core_web_lg')

In [14]:
# create stopwords list
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
custom_stops = ['health', 'healthy', 'year', 'day', 'start', 
                'month', 'week', 'pill', 'drug', 'ago']
spacy_stopwords.update(set(custom_stops))

The spaCy stopwords set contains 312 words such as 'am', 'almost', 'was', 'forty', etc. However, due to the nature of the reviews being drug and health related, I added a few of my own words to the stopwords list to assist in pinpointing the text analysis.

In [15]:
def clean_text(docs):
    
    lower_docs = [d.lower() for d in docs]
    
    table = str.maketrans({key: None for key in string.punctuation + string.digits})
    clean_docs = [d.translate(table) for d in lower_docs]

    nlp_docs = [nlp(d) for d in clean_docs]

    lemmatized_docs = [[w.lemma_ if w.lemma_ != '-PRON-' else w.lower_ for w in d] for d in nlp_docs]
    
    pos_docs = [[w.pos_ for w in d] for d in nlp_docs]
    
    lemmatized_docs = [[lemma for lemma in doc if lemma not in spacy_stopwords] for doc in lemmatized_docs]

    clean_reviews = [' '.join(l) for l in lemmatized_docs]
    
    return clean_reviews, lemmatized_docs, pos_docs

In [16]:
# Used this to test code prior to completing on entire dataset
df_sub = df_com_train.sample(100, random_state=42)
df_sub.reset_index(inplace = True)

In [17]:
%%time 
df_sub['review_clean'], df_sub['review_tokens'], df_sub['pos'] = \
clean_text(df_sub['review'])

Wall time: 2.56 s


## Drugs.com Dataset

In [18]:
%%time 
#training set
df_com_train['review_clean'], df_com_train['review_tokens'], df_com_train['pos'] = clean_text(df_com_train['review'])

#testing set
df_com_test['review_clean'], df_com_test['review_tokens'], df_com_test['pos'] = clean_text(df_com_test['review'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Wall time: 1h 32min 7s


Create a single data set that combines the testing and training sets so that all the data can be explored at once in the EDA portion of this analysis.

In [19]:
df_com_full = df_com_train.append(df_com_test)

## Drugslib.com Dataset

In [20]:
%%time 
# training set
drugs_lib_train['benefits_clean'], drugs_lib_train['benefits_tokens'], drugs_lib_train['b_pos'] = \
clean_text(drugs_lib_train['benefitsReview'])
drugs_lib_train['sideEffects_clean'], drugs_lib_train['sideEffects_tokens'], drugs_lib_train['se_pos'] = \
clean_text(drugs_lib_train['sideEffectsReview'])
drugs_lib_train['comments_clean'], drugs_lib_train['coments_tokens'], drugs_lib_train['c_pos'] = \
clean_text(drugs_lib_train['commentsReview'])

# testing set
drugs_lib_test['benefits_clean'], drugs_lib_test['benefits_tokens'], drugs_lib_test['b_pos'] = \
clean_text(drugs_lib_test['benefitsReview'])
drugs_lib_test['sideEffects_clean'], drugs_lib_test['sideEffects_tokens'], drugs_lib_test['se_pos'] = \
clean_text(drugs_lib_test['sideEffectsReview'])
drugs_lib_test['comments_clean'], drugs_lib_test['coments_tokens'], drugs_lib_test['c_pos'] = \
clean_text(drugs_lib_test['commentsReview'])

Wall time: 3min 23s


Create a single data set that combines the testing and training sets so that all the data can be explored at once in the EDA portion of this analysis.

In [21]:
df_lib_full = drugs_lib_train.append(drugs_lib_test)

## Save Clean Dataframes

Save the dataframes with pre-processed text columns for later usage.

In [22]:
with open('./Drugs_Com_Training_cleaned.pk', 'wb') as f:
    pickle.dump(df_com_train, f)

with open('./Drugs_Com_Testing_cleaned.pk', 'wb') as f:
    pickle.dump(df_com_test, f)
    
with open('./Drugs_Com_Full_cleaned.pk', 'wb') as f:
    pickle.dump(df_com_full, f)
    
with open('./Drugs_Lib_Training_cleaned.pk', 'wb') as f:
    pickle.dump(drugs_lib_train, f)
    
with open('./Drugs_Lib_Testing_cleaned.pk', 'wb') as f:
    pickle.dump(drugs_lib_test, f)
    
with open('./Drugs_Lib_Full_cleaned.pk', 'wb') as f:
    pickle.dump(df_lib_full, f)