In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.proportion import proportions_chisquare
from scipy.stats import chisquare
import pickle
from bs4 import BeautifulSoup
from collections import defaultdict
import requests
%matplotlib inline

import nltk
from nltk.tokenize import word_tokenize
import string
from nltk.stem.snowball import SnowballStemmer
import re
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [38]:
## only need to remove punctuation and stemize
stemmer = SnowballStemmer('english')

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    text = "".join([ch for ch in text if ch not in string.punctuation])
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

In [20]:
df = pd.read_csv("web_scrap.csv",index_col=0)
df.columns = ['first','Overall rating','Effectiveness','Side effects','Condition','Dosage','Other conditions','Other drugs taken','Benefits','Detailed Side effects','Comments']
tem = df['first'].str.split(" ",expand=True)[[0,4,7]]
tem.columns = ['name','age','gender']
df = pd.concat([tem,df],1).drop('first',1)

In [26]:
df['Side effects'].unique()

array([' Severe Side Effects', ' Moderate Side Effects',
       ' No Side Effects', ' Extremely Severe Side Effects',
       ' Mild Side Effects'], dtype=object)

In [39]:
train, test = train_test_split(df, test_size=0.25)

In [47]:
con_vec = TfidfVectorizer(stop_words='english',tokenizer=tokenize)
X_train = con_vec.fit_transform(train['Comments'])
y_train = train['Side effects']

X_test = con_vec.transform(test['Comments'])
y_test = test['Side effects']

  'stop_words.' % sorted(inconsistent))


In [52]:
lr = LogisticRegression(penalty='l2')
lr_cv_score = cross_val_score(lr,X_train,y_train,scoring='accuracy',cv=3,n_jobs=-1)
lr_cv_score

array([0.50125565, 0.49472097, 0.51609658])

In [55]:
svm_lin = SVC(kernel='linear')
svm_lin_cv_score = cross_val_score(svm_lin,X_train,y_train,scoring='accuracy',cv=3,n_jobs=-1)
svm_lin_cv_score

array([0.53942742, 0.53645048, 0.54778672])

In [58]:
svm_lin = SVC(kernel='rbf')
svm_lin_cv_score = cross_val_score(svm_lin,X_train,y_train,scoring='accuracy',cv=3,n_jobs=-1)
svm_lin_cv_score

array([0.32446007, 0.32478632, 0.32444668])

In [64]:
rfc = RandomForestClassifier(n_estimators=600,n_jobs=-1)
rfc_cv_score = cross_val_score(rfc,X_train,y_train,scoring='accuracy',cv=3,n_jobs=-1)
rfc_cv_score

array([0.64791562, 0.66214178, 0.67102616])

In [67]:
gbc = GradientBoostingClassifier(n_estimators=100)
gbc_cv_score = cross_val_score(gbc,X_train,y_train,scoring='accuracy',cv=3,n_jobs=-1)
gbc_cv_score

array([0.47815168, 0.47561589, 0.48390342])