In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *

import re
from bs4 import BeautifulSoup

import pickle
import os

import progressbar
from sys import getsizeof

# Drug Reviews Dataset

## Download

In [None]:
# -nc (no-clobber): skip the download when the archive is already present, so
# re-running this cell does not create drugsCom_raw.zip.1, .2, ...
!wget -nc https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip

In [None]:
# -o: overwrite without prompting; a plain `unzip` asks for confirmation when the
# files already exist, which blocks the cell on re-run.
!unzip -o drugsCom_raw.zip

## Load

In [None]:
def read_data(train_path='drugsComTrain_raw.tsv', test_path='drugsComTest_raw.tsv'):
    """Load the train and test splits from tab-separated files.

    Parameters
    ----------
    train_path : str
        Path to the training TSV file (defaults to the file in the working directory).
    test_path : str
        Path to the test TSV file (defaults to the file in the working directory).

    Returns
    -------
    tuple of pandas.DataFrame
        (data_train, data_test)
    """
    data_train = pd.read_csv(train_path, sep='\t')
    data_test = pd.read_csv(test_path, sep='\t')

    return data_train, data_test

# Raw train/test frames; later cells read these as module-level globals.
data_train,data_test=read_data()

# Review to Words

In [None]:
_STOPWORD_CACHE = {}  # language -> frozenset of stopwords, filled on first use


def _english_stopwords():
    """Return the English stopword set, downloading the corpus only if it is missing."""
    if "english" not in _STOPWORD_CACHE:
        try:
            words = stopwords.words("english")
        except LookupError:
            # Corpus not installed yet: fetch it once instead of on every call.
            nltk.download("stopwords", quiet=True)
            words = stopwords.words("english")
        # A frozenset gives O(1) membership tests in the filter below.
        _STOPWORD_CACHE["english"] = frozenset(words)
    return _STOPWORD_CACHE["english"]


def review_to_words(review):
    """Turn one raw review string into a list of stemmed, stopword-free tokens.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    list of str
        Lower-cased alphanumeric tokens with English stopwords removed and the
        Porter stemmer applied.
    """
    stop_set = _english_stopwords()
    stemmer = PorterStemmer()  # one stemmer per call, reused for every token

    # "&#039;" stands in for an apostrophe in this dataset; restore it first so
    # the digits do not survive as a spurious "039" token.
    review = re.sub(r"&#039;", "'", review)
    # Lower-case, then turn every non-alphanumeric character into a space.
    review = re.sub(r"[^a-zA-Z0-9]", " ", review.lower())

    return [stemmer.stem(w) for w in review.split() if w not in stop_set]

# Preprocess data

In [None]:
cache_dir = os.path.join("./cache", "drugreview_analysis")  # where to store cache files
# exist_ok=True keeps this cell re-runnable: without it the second run raises
# FileExistsError because the directory is already there.
os.makedirs(cache_dir, exist_ok=True)

In [None]:
def preprocess_data(data_train, data_test, labels_train, labels_test,
                    cache_dir=cache_dir, cache_file="preprocessed_data.pkl"):
    """Convert each review to a list of words, using an on-disk cache when available.

    Parameters
    ----------
    data_train, data_test : iterable of str
        Raw review texts; every element is passed to `review_to_words`.
    labels_train, labels_test
        Labels stored next to the words in the cache file and returned
        unchanged when the cache is not used.
    cache_dir : str
        Directory that holds the cache file.
    cache_file : str or None
        Name of the cache file; None disables both reading and writing.

    Returns
    -------
    tuple
        (words_train, words_test, labels_train, labels_test)

    Notes
    -----
    When a cache file is read successfully, all four returned values come from
    the cache and the arguments are ignored. Delete the cache file after
    changing the input data or `review_to_words`.
    """
    cache_path = None if cache_file is None else os.path.join(cache_dir, cache_file)

    # Try the cache first. A missing file is the normal first-run case; an
    # unreadable one is reported and then treated as a cache miss.
    cache_data = None
    if cache_path is not None:
        try:
            with open(cache_path, "rb") as f:
                # NOTE: pickle can execute arbitrary code on load; only read
                # cache files produced by this notebook.
                cache_data = pickle.load(f)
            print("Read preprocessed data from cache file:", cache_file)
        except FileNotFoundError:
            pass  # no cache yet, fall through to recompute
        except (OSError, EOFError, pickle.UnpicklingError,
                AttributeError, ImportError, IndexError) as err:
            print("Ignoring unreadable cache file:", cache_file, "-", err)

    if cache_data is not None:
        # Unpack data loaded from cache file
        return (cache_data['words_train'], cache_data['words_test'],
                cache_data['labels_train'], cache_data['labels_test'])

    # Cache miss: do the heavy lifting, with a progress bar per split.
    print('Training data:')
    words_train = [review_to_words(review) for review in progressbar.progressbar(data_train)]

    print('Test data:')
    words_test = [review_to_words(review) for review in progressbar.progressbar(data_test)]

    # Write to cache file for future runs
    if cache_path is not None:
        cache_data = dict(words_train=words_train, words_test=words_test,
                          labels_train=labels_train, labels_test=labels_test)
        os.makedirs(cache_dir, exist_ok=True)  # the directory may not exist yet
        with open(cache_path, "wb") as f:
            pickle.dump(cache_data, f)
        print("Wrote preprocessed data to cache file:", cache_file)

    return words_train, words_test, labels_train, labels_test

In [None]:
# Tokenize the `review` column of both splits; the `condition` column is passed
# through as the labels. Results come from the pickle cache when one exists.
train_X, test_X, train_y, test_y = preprocess_data(data_train['review'], data_test['review'], data_train['condition'], data_test['condition'])

# Dictionary

In [None]:
from collections import Counter

In [None]:
def build_dict(data, vocab_size = 5000):
    """Map the most frequent words in `data` to unique integer ids.

    Parameters
    ----------
    data : iterable of list of str
        Sentences, each given as a list of words.
    vocab_size : int
        Total vocabulary size including two reserved ids, so at most
        `vocab_size - 2` words are kept.

    Returns
    -------
    dict
        word -> id, where the most frequent word gets id 2. Ids 0 and 1 are
        left free for the 'no word' and 'infrequent' labels.
    """
    counts = Counter()
    for sentence in data:
        counts.update(sentence)

    # Clamp at 0: a negative slice bound (vocab_size < 2) would otherwise keep
    # almost the whole vocabulary instead of none of it.
    n_words = max(vocab_size - 2, 0)

    # most_common() orders words from most to least frequent.
    ranked = counts.most_common()[:n_words]
    return {word: rank + 2 for rank, (word, _) in enumerate(ranked)}

In [None]:
# `train_X` is the tokenized training data returned by preprocess_data; the
# previously referenced `train_X_clean` is not defined anywhere in this notebook.
word_dict=build_dict(train_X)

# Train Test dataframes

In [None]:
def df_train_test():
    """Build cleaned (condition, review) frames from the global raw splits.

    Reads the module-level `data_train` and `data_test`. For each one it keeps
    the `condition` and `review` columns, drops rows with missing values,
    replaces "&#039;" with an apostrophe in the reviews, and removes rows whose
    condition contains '</span>'.

    Returns
    -------
    tuple of pandas.DataFrame
        (train_df, test_df)
    """
    def _prepare(raw):
        # Same pipeline for both splits: select, drop NaNs, fix text, filter.
        frame = raw[['condition', 'review']].dropna()
        frame['review'] = frame['review'].apply(
            lambda text: re.sub(r"&#039;", "'", text)
        )
        has_markup = frame['condition'].str.contains('</span>')
        return frame[~has_markup]

    return _prepare(data_train), _prepare(data_test)

In [None]:
# Cleaned frames; reduce_conditions and cv below read these as globals.
train_df,test_df=df_train_test()

# Reduce number of classification items

In [None]:
def reduce_conditions(value):
    """Add a `condcopy` column that buckets rare conditions under 'other'.

    Operates in place on the module-level `train_df` and `test_df`. A condition
    is kept only when it appears in more than `value` rows of `train_df`; every
    other condition, in either frame, is replaced by 'other'.

    Parameters
    ----------
    value : int
        Frequency threshold; a condition must occur strictly more often than
        this in the training frame to be kept.

    Returns
    -------
    None
        Prints the number of distinct `condcopy` values and the share of
        'other' rows for both frames.
    """
    counts = train_df.condition.value_counts()
    frequent = set(counts.index[counts > value])

    # Series.where keeps values where the mask is True and substitutes 'other'
    # elsewhere. Assigning whole columns avoids chained indexing
    # (`df[col][mask] = ...`), which may write to a temporary copy.
    train_df['condcopy'] = train_df['condition'].where(
        train_df['condition'].isin(frequent), 'other')
    test_df['condcopy'] = test_df['condition'].where(
        test_df['condition'].isin(frequent), 'other')

    len_train = train_df.condcopy.nunique()
    len_test = test_df.condcopy.nunique()

    # The mean of a boolean mask is the fraction of 'other' rows; unlike
    # value_counts()['other'] it does not raise when nothing was bucketed.
    other_train = (train_df.condcopy == 'other').mean() * 100
    other_test = (test_df.condcopy == 'other').mean() * 100
    print('Nr conditions Train: ', len_train, '\nNr conditions Test: ', len_test)
    print('Percentage "other", Train: ', other_train, '%')
    print('Percentate "other", Test: ', other_test, '%')

In [None]:
# Conditions with 50 or fewer training reviews are grouped under 'other'.
reduce_conditions(50)

## Use CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Single shared stemmer instance used by the `tknizer` tokenizer below.
porter_stemmer = PorterStemmer()

In [None]:
# Must run before `cv` is defined: its default argument calls
# stopwords.words("english") at definition time.
nltk.download('stopwords')

In [None]:
def tknizer(str_input):
    """Tokenize a string into stemmed, lower-cased tokens.

    Every character other than a letter, digit or hyphen is replaced by a
    space; the text is then lower-cased, split on whitespace, and each token is
    stemmed with the module-level `porter_stemmer`.
    """
    cleaned = re.sub(r"[^A-Za-z0-9\-]", " ", str_input)
    tokens = cleaned.lower().split()
    return [porter_stemmer.stem(token) for token in tokens]

In [None]:
def cv(max_features=5000,stop_words=stopwords.words("english"),tokenizer=tknizer):
    """Return bag-of-words matrices for the train and test reviews.

    Loads `cv_train.pkl` and `cv_test.pkl` from the working directory when both
    can be read. Otherwise fits a CountVectorizer on `train_df.review`,
    transforms `test_df.review`, and writes both pickles.

    Parameters
    ----------
    max_features : int
        Passed to CountVectorizer as `max_features`.
    stop_words : list of str
        Passed to CountVectorizer as `stop_words`.
    tokenizer : callable
        Passed to CountVectorizer as `tokenizer`.

    Returns
    -------
    tuple
        (cv_train, cv_test)

    Notes
    -----
    The pickle files do not record the parameters: if they exist they are
    returned as-is even when this call uses different arguments. Delete them to
    force a rebuild.
    """
    try:
        # NOTE: pickle can execute arbitrary code on load; only read files
        # written by this notebook.
        with open('cv_train.pkl', 'rb') as f:
            cv_train = pickle.load(f)
        print('cv_train loaded.')
        with open('cv_test.pkl', 'rb') as f:
            cv_test = pickle.load(f)
        print('cv_test loaded.')
    except (OSError, EOFError, pickle.UnpicklingError,
            AttributeError, ImportError, IndexError):
        # Missing or unreadable cache: rebuild both matrices from scratch.
        # The vectorizer is not called `cv`, so it no longer shadows this function.
        vectorizer = CountVectorizer(max_features=max_features, stop_words=stop_words,
                                     tokenizer=tokenizer)
        cv_train = vectorizer.fit_transform(train_df.review)
        with open('cv_train.pkl', 'wb') as f:
            pickle.dump(cv_train, f)
        print('cv_train saved')
        cv_test = vectorizer.transform(test_df.review)
        with open('cv_test.pkl', 'wb') as f:
            pickle.dump(cv_test, f)
        print('cv_test saved')

    return cv_train,cv_test

In [None]:
# Request a 10000-term vocabulary. NOTE(review): if cv_train.pkl / cv_test.pkl
# already exist, `cv` returns them unchanged and this argument has no effect.
cv_train,cv_test=cv(10000)

# Models

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

In [None]:
def acc(model, X_train, X_test, y_train, y_test):
    """Print the accuracy of a fitted model on the train and test sets.

    Parameters
    ----------
    model
        Fitted estimator exposing `predict`.
    X_train, X_test
        Feature matrices passed to `model.predict`.
    y_train, y_test
        True labels matching `X_train` and `X_test`.

    Returns
    -------
    None
        The two accuracy scores are printed.
    """
    preds_train = model.predict(X_train)
    # accuracy_score takes (y_true, y_pred).
    acc_train = accuracy_score(y_train, preds_train)
    print('accuracy train done.')

    preds_test = model.predict(X_test)
    acc_test = accuracy_score(y_test, preds_test)
    print('accuracy test done.')

    # These are accuracies (higher is better), so they are labelled as such.
    print('Train accuracy: ', acc_train, '\nTest accuracy: ', acc_test)

### NB model

In [None]:
# Multinomial Naive Bayes classifier; alpha is its additive smoothing parameter.
model=MultinomialNB(alpha=1.0)

In [None]:
# NOTE(review): this fits on the raw `condition` labels, while the random forest
# and XGBoost cells fit on the reduced `condcopy` labels — confirm this is intended.
model.fit(cv_train,train_df.condition)

In [None]:
# Predictions on the training matrix itself (not held-out data).
prediction=model.predict(cv_train)

# Random Forest Classifier

In [None]:
# 80 trees, fixed random_state for repeatable runs, n_jobs=-1 to use all cores.
RFmodel=RandomForestClassifier(n_estimators=80,random_state=100,verbose=1,n_jobs=-1)

In [None]:
# Fit on the reduced label set (`condcopy`, rare conditions grouped as 'other').
RFmodel.fit(cv_train,train_df.condcopy)

In [None]:
# Predictions on the training matrix itself (not held-out data).
RF_predict=RFmodel.predict(cv_train)

## XGBoost

In [None]:
# Keyword arguments forwarded to xgb.XGBClassifier in the next cell.
params={'n_estimators':100,'max_depth':10, 'learning_rate':0.1, 'objective':'multi:softmax' ,'verbosity':1,'n_jobs':-1}

In [None]:
# NOTE(review): this rebinds `model`, so the fitted MultinomialNB from the
# NB section is no longer reachable under that name.
model=xgb.XGBClassifier(**params)

In [None]:
# The trailing semicolon suppresses the cell output, so this cell displays nothing.
model.get_params();

In [None]:
# Fit on the reduced label set (`condcopy`).
# NOTE(review): `condcopy` holds string labels; check whether the installed
# xgboost version needs them integer-encoded first — TODO confirm.
model.fit(cv_train,train_df.condcopy)