In [1]:
import pandas as pd
import numpy as np
import scipy
import re
import string

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split as split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, auc, roc_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer 
from nltk.stem import PorterStemmer, LancasterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

import warnings
warnings.filterwarnings('ignore') 

from IPython.display import Image

%matplotlib inline

## Preprocessing

In [2]:
train = pd.read_csv('drugsComTrain_raw_csv.csv',index_col=0)
test = pd.read_csv('drugsComTest_raw_csv.csv',index_col=0)

### clean train data 

In [3]:
# Deal with the all '&#039;' with "'"
train['review'] = train['review'].replace("&#039;","'", regex=True)

In [4]:
# Deal with all date form
train['date'] = pd.to_datetime(train['date'])

In [5]:
# convert to lowercase
train['review'] = train['review'].str.lower()

In [6]:
# remove punctuation
def punctuation_removal(messy_str):
    clean_list = [char for char in messy_str if char not in string.punctuation]
    clean_str = ''.join(clean_list)
    return clean_str

In [7]:
train['review'] = train['review'].apply(punctuation_removal)
train['review'].head()

206461    it has no side effect i take it in combination...
95260     my son is halfway through his fourth week of i...
92703     i used to take another oral contraceptive whic...
138000    this is my first time using any form of birth ...
35696     suboxone has completely turned my life around ...
Name: review, dtype: object

In [8]:
# remove stopwords
stop = stopwords.words('english')
stop.append("i'm")

In [9]:
stop_words = []

for item in stop: 
    new_item = punctuation_removal(item)
    stop_words.append(new_item) 
print(stop_words[::12])

['i', 'youd', 'hers', 'which', 'were', 'a', 'at', 'above', 'again', 'both', 'own', 'dont', 'aren', 'haven', 'shant']


In [10]:
clothes_list =['white', 'black']

In [11]:
def stopwords_removal(messy_str):
    messy_str = word_tokenize(messy_str)
    return [word.lower() for word in messy_str 
            if word.lower() not in stop_words and word.lower() not in clothes_list ]

In [12]:
train['review'] = train['review'].apply(stopwords_removal)
train['review'].head()

206461    [side, effect, take, combination, bystolic, 5,...
95260     [son, halfway, fourth, week, intuniv, became, ...
92703     [used, take, another, oral, contraceptive, 21,...
138000    [first, time, using, form, birth, control, gla...
35696     [suboxone, completely, turned, life, around, f...
Name: review, dtype: object

In [13]:
# drop all numbers
def drop_numbers(list_text):
    list_text_new = []
    for i in list_text:
        if not re.search('\d', i):
            list_text_new.append(i)
    return ' '.join(list_text_new)

In [14]:
train['review'] = train['review'].apply(drop_numbers)
train['review'].head()

206461    side effect take combination bystolic mg fish oil
95260     son halfway fourth week intuniv became concern...
92703     used take another oral contraceptive pill cycl...
138000    first time using form birth control glad went ...
35696     suboxone completely turned life around feel he...
Name: review, dtype: object

In [15]:
# Stemming
porter = PorterStemmer()

In [16]:
train['review'] = train['review'].apply(lambda x: x.split())
train['review'].head()

206461    [side, effect, take, combination, bystolic, mg...
95260     [son, halfway, fourth, week, intuniv, became, ...
92703     [used, take, another, oral, contraceptive, pil...
138000    [first, time, using, form, birth, control, gla...
35696     [suboxone, completely, turned, life, around, f...
Name: review, dtype: object

In [17]:
def stem_update(text_list):
    text_list_new = []
    for word in text_list:
        word = porter.stem(word)
        text_list_new.append(word) 
    return text_list_new

In [18]:
train['review'] = train['review'].apply(stem_update)
train['review'].head()

206461    [side, effect, take, combin, bystol, mg, fish,...
95260     [son, halfway, fourth, week, intuniv, becam, c...
92703     [use, take, anoth, oral, contracept, pill, cyc...
138000    [first, time, use, form, birth, control, glad,...
35696     [suboxon, complet, turn, life, around, feel, h...
Name: review, dtype: object

In [19]:
train['review'] = train['review'].apply(lambda x: ' '.join(x))
train['review'].head()

206461           side effect take combin bystol mg fish oil
95260     son halfway fourth week intuniv becam concern ...
92703     use take anoth oral contracept pill cycl happi...
138000    first time use form birth control glad went pa...
35696     suboxon complet turn life around feel healthie...
Name: review, dtype: object

In [20]:
Review = train['review']

In [21]:
def text_vectorizing_process(sentence_string):
    return [word for word in sentence_string.split()]

In [22]:
bow_transformer = CountVectorizer(text_vectorizing_process)

In [23]:
bow_transformer.fit(Review)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8',
                input=<function text_vectorizing_process at 0x13161d5f0>,
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [24]:
print(Review.iloc[3])

first time use form birth control glad went patch month first decreas libido subsid downsid made period longer day exact use period day max also made cramp intens first two day period never cramp use birth control happi patch


In [25]:
example = bow_transformer.transform([Review.iloc[3]])
print(example)

  (0, 1598)	1
  (0, 5198)	2
  (0, 9450)	2
  (0, 9883)	2
  (0, 10786)	3
  (0, 11152)	1
  (0, 13268)	1
  (0, 15778)	1
  (0, 17146)	3
  (0, 17843)	1
  (0, 18898)	1
  (0, 20043)	1
  (0, 23190)	1
  (0, 25668)	1
  (0, 26324)	1
  (0, 26980)	2
  (0, 27405)	1
  (0, 29382)	1
  (0, 30996)	1
  (0, 34361)	2
  (0, 34766)	3
  (0, 47022)	1
  (0, 49378)	1
  (0, 50900)	1
  (0, 51978)	3
  (0, 53762)	1


In [26]:
Reviews = bow_transformer.transform(Review)
Reviews

<161297x55614 sparse matrix of type '<class 'numpy.int64'>'
	with 5531689 stored elements in Compressed Sparse Row format>

In [27]:
tfidf_transformer = TfidfTransformer().fit(Reviews)
tfidf_example = tfidf_transformer.transform(example)
print (tfidf_example)

  (0, 53762)	0.10497278844639577
  (0, 51978)	0.2676318135249754
  (0, 50900)	0.1060469173751006
  (0, 49378)	0.08029284042662528
  (0, 47022)	0.17703273103954834
  (0, 34766)	0.28953355520592533
  (0, 34361)	0.339777219223567
  (0, 30996)	0.10406925095147522
  (0, 29382)	0.07425438245518536
  (0, 27405)	0.22472646289318932
  (0, 26980)	0.22761472631980223
  (0, 26324)	0.14248856329258303
  (0, 25668)	0.18999433434188243
  (0, 23190)	0.17551141101166107
  (0, 20043)	0.13502716915938182
  (0, 18898)	0.17211761676156068
  (0, 17843)	0.17419625104305278
  (0, 17146)	0.2486088570659288
  (0, 15778)	0.2291414769035613
  (0, 13268)	0.20032396820406848
  (0, 11152)	0.16318996534465663
  (0, 10786)	0.19951417762073076
  (0, 9883)	0.23057606550368587
  (0, 9450)	0.21397897667069118
  (0, 5198)	0.23631918035187263
  (0, 1598)	0.09645507099153036


In [28]:
messages_tfidf = tfidf_transformer.transform(Reviews)
messages_tfidf.shape

(161297, 55614)

In [29]:
messages_tfidf = messages_tfidf.toarray()
messages_tfidf = pd.DataFrame(messages_tfidf)
print(messages_tfidf.shape)
messages_tfidf.head()

(161297, 55614)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55604,55605,55606,55607,55608,55609,55610,55611,55612,55613
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
df_all = pd.merge(train.drop(columns='review'),messages_tfidf, 
                  left_index=True, right_index=True )
df_all.head()

### clean test dataset

In [None]:
# Deal with the all '&#039;' with "'"
test['review'] = test['review'].replace("&#039;","'", regex=True)

In [None]:
# Deal with all date form
test['date'] = pd.to_datetime(test['date'])

In [None]:
# convert to lowercase
test['review'] = test['review'].str.lower()

In [None]:
# remove punctuation
def punctuation_removal(messy_str):
    clean_list = [char for char in messy_str if char not in string.punctuation]
    clean_str = ''.join(clean_list)
    return clean_str

In [None]:
test['review'] = test['review'].apply(punctuation_removal)
test['review'].head()

In [None]:
# remove stop words
stop = stopwords.words('english')
stop.append("i'm")

In [None]:
stop_words = []

for item in stop: 
    new_item = punctuation_removal(item)
    stop_words.append(new_item) 
print(stop_words[::12])

In [None]:
additional_list =['white', 'black']

In [None]:
def stopwords_removal(messy_str):
    messy_str = word_tokenize(messy_str)
    return [word.lower() for word in messy_str 
            if word.lower() not in stop_words and word.lower() not in additional_list ]

In [None]:
test['review'] = test['review'].apply(stopwords_removal)
test['review'].head()

In [None]:
# drop all numbers
def drop_numbers(list_text):
    list_text_new = []
    for i in list_text:
        if not re.search('\d', i):
            list_text_new.append(i)
    return ' '.join(list_text_new)

In [None]:
test['review'] = test['review'].apply(drop_numbers)
test['review'].head()

In [None]:
# Stemming
porter = PorterStemmer()

In [None]:
test['review'] = test['review'].apply(lambda x: x.split())
test['review'].head()

In [None]:
def stem_update(text_list):
    text_list_new = []
    for word in text_list:
        word = porter.stem(word)
        text_list_new.append(word) 
    return text_list_new

In [None]:
test['review'] = test['review'].apply(stem_update)
test['review'].head()

In [None]:
test['review'] = test['review'].apply(lambda x: ' '.join(x))
test['review'].head()

In [None]:
Review_test = test['review']

In [None]:
Review_test

In [None]:
def text_vectorizing_process(sentence_string):
    return [word for word in sentence_string.split()]

In [None]:
bow_transformer = CountVectorizer(text_vectorizing_process)

In [None]:
bow_transformer.fit(Review_test)

In [None]:
print(Review_test.iloc[3])

In [None]:
example = bow_transformer.transform([Review_test.iloc[3]])
print(example)

In [None]:
Reviews_test = bow_transformer.transform(Review_test)
Reviews_test

In [None]:
tfidf_transformer = TfidfTransformer().fit(Reviews_test)
tfidf_example = tfidf_transformer.transform(example)
print (tfidf_example)

In [None]:
messages_tfidf_test = tfidf_transformer.transform(Reviews_test)
messages_tfidf_test.shape

In [None]:
messages_tfidf_test = messages_tfidf_test.toarray()
messages_tfidf_test = pd.DataFrame(messages_tfidf_test)
print(messages_tfidf_test.shape)
messages_tfidf_test.head()

In [None]:
test_all = pd.merge(test.drop(columns='review'),messages_tfidf_test, 
                  left_index=True, right_index=True )
test_all.head()

In [None]:
test_all.to_csv('Output2.csv',index=False)

## Build Model

In [126]:
X_train = df_all.drop('score', axis=1)
y_train = df_all.score

In [129]:
svc_model = SVC(C=1.0, 
             kernel='linear',
             class_weight='balanced', 
             probability=True,
             random_state=111)
svc_model.fit(X, y)

ValueError: setting an array element with a sequence.