In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *

import re
from bs4 import BeautifulSoup

import pickle
import os

import progressbar
from sys import getsizeof

## Drugs Reviews Dataset

In [None]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip

In [None]:
!unzip drugsCom_raw.zip

In [None]:
data_train=pd.read_csv('./drugsComTrain_raw.tsv',sep='\t')

In [None]:
data_test=pd.read_csv('./drugsComTest_raw.tsv',sep='\t')

In [None]:
data_train.head()

In [None]:
data_train.info()

In [None]:
data_train.isna().sum()

In [None]:
data_test.isna().sum()

In [None]:
def loaddata(train_path='./drugsComTrain_raw.tsv', test_path='./drugsComTest_raw.tsv'):
    """Load the train and test splits of the drug-review dataset.

    Parameters
    ----------
    train_path : str, optional
        Path of the tab-separated training file. Defaults to the file
        unpacked into the working directory.
    test_path : str, optional
        Path of the tab-separated test file. Defaults to the file unpacked
        into the working directory.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(data_train, data_test)`` exactly as read by ``pd.read_csv``.
    """
    data_train = pd.read_csv(train_path, sep='\t')
    data_test = pd.read_csv(test_path, sep='\t')

    return data_train, data_test

In [None]:
data_train,data_test=loaddata()

## Condition

In [None]:
data_train['condition'].value_counts()

In [None]:
conditions_train=data_train.condition

In [None]:
conditions_train=set(conditions_train)

In [None]:
conditions_test=data_test.condition

In [None]:
conditions_test=set(conditions_test)

In [None]:
len(conditions_train.intersection(conditions_test))

In [None]:
len(conditions_test)

In [None]:
cond_not_in_train=conditions_test-conditions_train

In [None]:
data_test[data_test['condition'].isin(cond_not_in_train)].condition.value_counts()

In [None]:
data_test.condition.value_counts().nlargest(15)

In [None]:
data_train.condition.value_counts().nlargest(15)

In [None]:
def review_to_words(review):
    """Turn one raw review string into a list of stemmed, stopword-free tokens.

    Steps: restore apostrophes encoded as the HTML entity ``&#039;``,
    lower-case the text, replace every non-alphanumeric character with a
    space, split on whitespace, drop English stopwords and Porter-stem the
    remaining words.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    list of str
        Stemmed tokens in their original order.
    """
    # Build the stopword set and the stemmer once and keep them on the function
    # object. Previously the stopword list was re-read for every word, a new
    # stemmer was created for every word and nltk.download ran for every
    # review, all of which is needlessly slow on a large corpus.
    resources = getattr(review_to_words, "_resources", None)
    if resources is None:
        nltk.download("stopwords", quiet=True)
        resources = (frozenset(stopwords.words("english")), PorterStemmer())
        review_to_words._resources = resources
    stop_words, stemmer = resources

    review = re.sub(r"&#039;", "'", review)  # this entity is used in the data in place of "'"
    review = re.sub(r"[^a-zA-Z0-9]", " ", review.lower())  # lower-case, then blank out punctuation
    words = [w for w in review.split() if w not in stop_words]  # remove stopwords
    return [stemmer.stem(w) for w in words]  # stem

### preprocess data

In [None]:
cache_dir = os.path.join("./cache", "drugreview_analysis")  # where to store cache files
# exist_ok=True makes this cell safe to re-run: without it os.makedirs raises
# FileExistsError as soon as the directory is already there.
os.makedirs(cache_dir, exist_ok=True)  # ensure cache directory exists

In [None]:
def preprocess_data(data_train, data_test, labels_train, labels_test,
                    cache_dir=cache_dir, cache_file="preprocessed_data.pkl"):
    """Convert each review to words; read from cache if available.

    Parameters
    ----------
    data_train, data_test : iterable of str
        Raw review texts; each item is passed to ``review_to_words``.
    labels_train, labels_test
        Labels that are stored next to the words in the cache file and
        returned unchanged when no cache is read.
    cache_dir : str
        Directory that holds the cache file.
    cache_file : str or None
        File name of the pickle cache inside ``cache_dir``. ``None`` disables
        both reading and writing the cache.

    Returns
    -------
    tuple
        ``(words_train, words_test, labels_train, labels_test)``. When a cache
        file is read successfully all four values come from the cache and the
        arguments are ignored, so delete the file after changing the inputs.
    """
    cache_path = None if cache_file is None else os.path.join(cache_dir, cache_file)

    # If cache_file is not None, try to read from it first
    cache_data = None
    if cache_path is not None:
        try:
            # NOTE(security): pickle.load can execute arbitrary code. Only point
            # this at cache files written by this notebook.
            with open(cache_path, "rb") as f:
                cache_data = pickle.load(f)
            print("Read preprocessed data from cache file:", cache_file)
        except FileNotFoundError:
            pass  # no cache yet, fall through and compute it
        except Exception as exc:
            # Still best-effort (a broken cache must not stop the run), but
            # say so, and let KeyboardInterrupt / SystemExit propagate.
            print("Could not read cache file {} ({!r}); recomputing.".format(cache_file, exc))

    if cache_data is not None:
        # Unpack data loaded from cache file
        return (cache_data['words_train'], cache_data['words_test'],
                cache_data['labels_train'], cache_data['labels_test'])

    # Cache is missing: preprocess training and test data to obtain the words
    # of each review, showing a progress bar for each split.
    print('Training data:')
    words_train = [review_to_words(review)
                   for review in progressbar.progressbar(data_train)]

    print('Test data:')
    words_test = [review_to_words(review)
                  for review in progressbar.progressbar(data_test)]

    # Write to cache file for future runs
    if cache_path is not None:
        os.makedirs(cache_dir, exist_ok=True)  # the directory may not exist yet
        cache_data = dict(words_train=words_train, words_test=words_test,
                          labels_train=labels_train, labels_test=labels_test)
        with open(cache_path, "wb") as f:
            pickle.dump(cache_data, f)
        print("Wrote preprocessed data to cache file:", cache_file)

    return words_train, words_test, labels_train, labels_test

In [None]:
ls cache/drugreview_analysis/

In [None]:
train_X, test_X, train_y, test_y = preprocess_data(data_train['review'], data_test['review'], data_train['condition'], data_test['condition'])

In [None]:
ls cache/drugreview_analysis/

In [None]:
train_X[0:2]

### Remove NaN entries

In [None]:
from itertools import compress

In [None]:
train_X_clean=list(compress(train_X,~train_y.isna()))

In [None]:
len(train_X_clean)

In [None]:
(~train_y.isna()).sum()

In [None]:
train_y_clean=train_y[~train_y.isna()]

In [None]:
len(train_y_clean)

### Remove erroneous condition

In [None]:
train_y_clean.str.contains('</span>').sum()

In [None]:
nospan=~train_y_clean.str.contains('</span>')

In [None]:
train_y_clean=train_y_clean[nospan]

In [None]:
len(train_y_clean)

In [None]:
train_X_clean=list(compress(train_X_clean,nospan))

In [None]:
len(train_X_clean)

## Dictionary

In [None]:
from collections import Counter

In [None]:
def build_dict(data, vocab_size = 5000):
    """Map the most frequent words of `data` to unique integer ids.

    `data` is a list of sentences, each sentence being a list of words.
    The most frequent word gets id 2, the next one id 3, and so on; ids 0 and
    1 are left free for the 'no word' and 'infrequent' labels, so at most
    ``vocab_size - 2`` words are kept.
    """
    frequencies = Counter()
    for sentence in data:
        frequencies.update(sentence)

    # most_common() orders words from the most to the least frequent.
    ranked_words = [word for word, _ in frequencies.most_common()]
    kept_words = ranked_words[:vocab_size - 2]

    return {word: word_id for word_id, word in enumerate(kept_words, start=2)}

In [None]:
word_dict=build_dict(train_X_clean)

In [None]:
# `drugs_data` is never defined in this notebook (NameError on a fresh kernel);
# `data_train` is presumably the frame the old name referred to.
data_train['review'][2]

In [None]:
review_to_words("It that")

In [None]:
token='&#039;'

In [None]:
# Collect every review that contains the token defined in the previous cell.
# `drugs_data` is never defined in this notebook (NameError on a fresh kernel);
# `data_train` is presumably the frame the old name referred to.
rgx=re.compile(token)
lst=[review for review in data_train['review'] if rgx.search(review)]
    

In [None]:
# `drugs_data` is never defined in this notebook; use the loaded training frame.
data_train['review'].index

In [None]:
lst;

In [None]:
word_dict;

In [None]:
# `drugs_data` is never defined in this notebook; use the loaded training frame.
data_train.isna().sum()

## Use CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer


#### get rid of nan rows

In [None]:
def cleandata(data_train,data_test):
    """Return cleaned (train, test) frames holding only 'condition' and 'review'.

    For each input frame: keep the two columns, drop rows with a missing
    value, replace the ``&#039;`` entity in the reviews with an apostrophe and
    discard rows whose condition contains '</span>'.
    """
    def _tidy(frame):
        # Same pipeline for both splits.
        subset = frame[['condition','review']].dropna()
        subset.review = subset.review.apply(
            lambda text: re.sub(r"&#039;","'",text))
        has_span = subset.condition.str.contains('</span>')
        return subset[~has_span]

    return _tidy(data_train), _tidy(data_test)

In [None]:
train_df, test_df=cleandata(data_train,data_test)

In [None]:
train_df=data_train[['condition','review']]

In [None]:
test_df=data_test[['condition','review']]

In [None]:
train_df.isna().sum()

In [None]:
train_df=train_df.dropna()

In [None]:
train_df.isna().sum()

In [None]:
test_df=test_df.dropna()

In [None]:
print(data_train['review'][2])

Replace the HTML entity `&#039;` with an apostrophe (`'`)

In [None]:
def resub(review):
    """Return `review` with every ``&#039;`` entity replaced by an apostrophe."""
    apostrophe_entity = re.compile(r"&#039;")
    return apostrophe_entity.sub("'", review)

In [None]:
a=" &#039; sd&#039;sdf"

In [None]:
resub(a)

In [None]:
train_df.review=train_df.review.apply(resub)

In [None]:
print(train_df.review[2])

In [None]:
data_train.review[2]

In [None]:
test_df.review=test_df.review.apply(resub)

In [None]:
test_df.review[2]

Remove erroneous conditions (those containing `</span>`)

In [None]:
train_df.shape

In [None]:
train_df=train_df[~train_df.condition.str.contains('</span>')]

In [None]:
train_df.shape

In [None]:
test_df.shape

In [None]:
test_df.isna().sum()

In [None]:
test_df=test_df[~test_df.condition.str.contains('</span>')]

In [None]:
test_df.shape

Group infrequent conditions (50 or fewer training reviews) under 'Other'

In [None]:
len(set(train_df.condition))

In [None]:
# Boolean Series indexed by condition: True where the condition has more than
# 50 training reviews (the threshold is 50, despite "thousand" in the name).
biggerthousand=train_df.condition.value_counts()>50

In [None]:
def g(condition):
    """Return `condition` if it is frequent enough, otherwise 'Other'.

    Frequency is looked up in the module-level boolean Series
    `biggerthousand`. A condition that does not appear in that Series is
    treated as infrequent instead of raising KeyError.
    """
    # `not` instead of `~`: bitwise inversion is only a logical negation for
    # numpy booleans; on a plain Python bool it yields -1/-2, both truthy.
    if not biggerthousand.get(condition, False):
        return 'Other'
    return condition

In [None]:
g('Cachexia')

In [None]:
train_df['condcopy']=train_df['condition'].apply(g)

In [None]:
len(set(train_df['condcopy']))

In [None]:
train_df.condcopy.value_counts()

Transform Test data

In [None]:
S=set(train_df.condcopy)
def trfTest(test_df):
    """Relabel, in place, test conditions that are not training classes as 'Other'.

    Rows whose 'condition' is not in the module-level set `S` get the
    condition 'Other'. Only the 'condition' column is modified; the function
    returns None.
    """
    unseen = ~test_df['condition'].isin(S)
    # Target the column explicitly: assigning through the boolean mask alone
    # overwrites every column of the matching rows, including the review text.
    test_df.loc[unseen, 'condition'] = 'Other'

In [None]:
trfTest(test_df)

In [None]:
test_df.condition.value_counts()

### CountVectorizer

In [None]:
import nltk

In [None]:
nltk.download('stopwords')

In [None]:
cv=CountVectorizer(stop_words=stopwords.words("english"))

In [None]:
cv_train=cv.fit_transform(train_df.review)

Use Tf-Idf

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
cv=TfidfVectorizer(max_features=5000,stop_words=stopwords.words("english"))

In [None]:
cv_train=cv.fit_transform(train_df.review)

In [None]:
cv.get_params();

In [None]:
len(cv.vocabulary_)

In [None]:
train_array=cv_train.toarray()

In [None]:
from sys import getsizeof

In [None]:
getsizeof(train_array)

### NB model

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
model=MultinomialNB(alpha=1.0)

In [None]:
# MultinomialNB accepts the sparse matrix directly, so there is no need to
# build a second dense copy of the features here.
model.fit(cv_train,train_df.condition)

In [None]:
prediction=model.predict(train_array)

In [None]:
prediction[:3]

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
acc=accuracy_score(train_df.condition,prediction)

In [None]:
acc

In [None]:
len(train_df.condition.unique())

### Test error

In [None]:
cv_test=cv.transform(test_df.review)

In [None]:
test_array=cv_test.toarray()

In [None]:
test_predict=model.predict(test_array)

In [None]:
test_acc=accuracy_score(test_df.condition,test_predict)

In [None]:
test_acc

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
RFmodel=RandomForestClassifier(n_estimators=80,random_state=100,verbose=1,n_jobs=-1)

In [None]:
RFmodel.fit(train_array,train_df.condcopy)

In [None]:
RF_predict=RFmodel.predict(train_array)

In [None]:
getsizeof(RF_predict)

In [None]:
# The forest was fitted on `condcopy` (rare conditions grouped as 'Other'), so
# score it against the same labels; comparing with the raw `condition` column
# counts every 'Other' prediction as an error.
RF_acc=accuracy_score(train_df.condcopy,RF_predict)

In [None]:
RF_acc

In [None]:
RF_predict[:10]

In [None]:
testRF_predict=RFmodel.predict(test_array)

In [None]:
test_acc_RF=accuracy_score(test_df.condition,testRF_predict)

In [None]:
test_acc_RF

In [None]:
getsizeof(testRF_predict)

In [None]:
%xdel test_array

In [None]:
%xdel RF_predict

In [None]:
%xdel train_array

## XGBoost

In [None]:
import xgboost as xgb

In [None]:
params={'n_estimators':10,'max_depth':2, 'eta':1, 'objective':'multi:softmax' ,'verbosity':1}

In [None]:
model=xgb.XGBClassifier(**params)

In [None]:
model.get_params()

In [None]:
# `train_array` was removed with %xdel above, so fit on the sparse TF-IDF
# matrix instead (the same features, without the dense copy).
# NOTE(review): recent xgboost releases may require integer-encoded class
# labels rather than strings - confirm against the installed version.
model.fit(cv_train,train_df.condcopy)