In [198]:
!pip install googletrans
!pip install tqdm --upgrade
!pip install twython

Requirement already up-to-date: tqdm in /usr/local/lib/python3.6/dist-packages (4.45.0)


In [0]:
# web scraping
import requests
from bs4 import BeautifulSoup
from time import time, sleep
from random import randint

# Translation
from googletrans import Translator

# Utilities
from tqdm import tqdm
from tqdm import tqdm_notebook
import numpy as np
import pandas as pd

# NLP
from nltk import sent_tokenize, word_tokenize, pos_tag, RegexpParser
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import mark_negation

In [200]:
# Download resources
import nltk

nltk.download("punkt")
nltk.download("vader_lexicon")
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

Web Scrap

โรงพยาบาลกรุงเทพคริสเตียน

In [0]:
def honestdoc_comment(url, hospitalName):
    """
    This function is to scrap data from a webboard (https://www.honestdocs.com).

    INPUT
    url : String
      URL of the target website
    
    OUTPUT
    comment : List
      List of comments
    score : List
      List of rating score
    """
    #create connection
    data = requests.get(url)
    print("requests code : {}".format(data.status_code)) 
    print("note\n2xx: success\n4xx, 5xx: error")
    
    #scrape comment and score
    start_time = time() #start scraping data from page1
    r = requests.get(url, params=dict(query="web scraping",page=1)) 
    soup = BeautifulSoup(r.text,"html.parser")
    n = len(soup.find_all("div",{"class":"comments__content"})) #find n of items in the page
    
    #extract each item
    comment = [soup.find_all("div",
                             {"class":"comments__content"})[i].get_text().strip() for i in range(0,n)]
    score = [soup.find_all("span",
                           {"class":"stars star-rating"})[i].attrs["data-score"] for i in range(0,n)]
    elapsed_time = time() - start_time #finish scraping data from page1
    print("Time used for scraping data from page - 1 : {} s".format(elapsed_time))
    sleep(randint(1,3)) #mimic human behavior
           
    p = 2 #start scraping data from page2
    while n > 0: #until the number of items in a page = 0
        start_time = time() 
        r = requests.get(url, params=dict(query="web scraping",page=p))
        soup = BeautifulSoup(r.text,"html.parser")
        n = len(soup.find_all("div",{"class":"comments__content"}))
        [comment.append(soup.find_all("div",
                                      {"class":"comments__content"})[i].get_text().strip()) for i in range(0,n)]
        [score.append(soup.find_all("span",
                                    {"class":"stars star-rating"})[i].attrs["data-score"]) for i in range(0,n)]
        elapsed_time = time() - start_time
        print("Time used for scraping data from page - {} : {} s".format(p, elapsed_time))
        p +=1
        sleep(randint(1,3))

    df = pd.DataFrame({"comment": comment, 
              "score": score
              })
    
    df["hospital"] = hospitalName
    
    return df

คำสั่งแปลงจากภาษาไทยเป็นภาษาอังกฤษ

In [0]:
def th2en(comment):
  try:
    textTrans = Translator().translate(comment, src="th", dest="en").text
  except:
    textTrans = ""
    print("\n Can not translate " + comment)

  return textTrans




ดึง comment จากโรงพยาบาลพระราม9

In [203]:
df = honestdoc_comment(r"https://www.honestdocs.co/hospitals/bangkok-christian-hospital", "bangkok-christian-hospital")
comments = df

requests code : 200
note
2xx: success
4xx, 5xx: error
Time used for scraping data from page - 1 : 0.39247846603393555 s
Time used for scraping data from page - 2 : 0.45721936225891113 s
Time used for scraping data from page - 3 : 0.42501330375671387 s
Time used for scraping data from page - 4 : 0.45900630950927734 s
Time used for scraping data from page - 5 : 0.39813661575317383 s
Time used for scraping data from page - 6 : 0.4381897449493408 s
Time used for scraping data from page - 7 : 0.37100696563720703 s
Time used for scraping data from page - 8 : 0.43073320388793945 s
Time used for scraping data from page - 9 : 0.37108755111694336 s
Time used for scraping data from page - 10 : 0.37471652030944824 s
Time used for scraping data from page - 11 : 0.5524148941040039 s
Time used for scraping data from page - 12 : 0.3687174320220947 s
Time used for scraping data from page - 13 : 0.3921060562133789 s
Time used for scraping data from page - 14 : 0.39066481590270996 s
Time used for scrapin

นำคอมเม้นท์มาแปลงเป็นภาษาอังกฤษ

In [204]:
tqdm.pandas()
comments["en"] = comments.progress_apply(lambda x: th2en(x["comment"]), axis=1)

comments = df

100%|██████████| 80/80 [00:09<00:00,  8.72it/s]


ได้ทั้งหมด 80 แถว 4 column

In [205]:
comments.shape

(80, 4)

นับจำนวนคอมเม้นท์ได้ทั้งหมด 80 คอมเม้น

In [206]:
comments["hospital"].value_counts()

bangkok-christian-hospital    80
Name: hospital, dtype: int64

ตรวจสอบ 5 แถวแรกของข้อมูล

In [207]:
comments.head(10)

Unnamed: 0,comment,score,hospital,en
0,สิ่งที่ชอบอย่างหนึ่งมากจริงๆคือบุคลากรในโรงพยา...,5,bangkok-christian-hospital,What is really like a lot of people in this ho...
1,ได้เข้าไปใช้บริกาที่นี่ มีความสะอาดและต้อนรับด...,5,bangkok-christian-hospital,Riga has to use here. Clean and good reception...
2,สิ่งที่ชอบอย่างหนึ่งมากจริงๆคือบุคลากรในโรงพยา...,4,bangkok-christian-hospital,What is really like a lot of people in this ho...
3,พยาบาล คุณหมอพูดจา ดูแลดีค่ะ โรงพยาบาลมีที่จอด...,5,bangkok-christian-hospital,Hospital doctors speak good care hospital park...
4,ไปหาคุณหมอด้วยอาการปวดขาคุณหมอถามนิดหน่อยแล้วก...,3,bangkok-christian-hospital,You go to the doctor with symptoms of leg pain...
5,เป็นโรงพยาบาลที่ทุกคนในครอบครัวใช้บริการ เพราะ...,4,bangkok-christian-hospital,The hospital where the family service. Because...
6,ป่วยตกบัยฝนไดค่ะเข้ารักษาห้องฉุกเฉินมีประกันสุ...,4,bangkok-christian-hospital,Buffett's rain falls ill treat me into the eme...
7,เป็นคนไข้ประจำมาตลอด 20 กว่าปี ล่าสุดได้ไปหาหม...,2,bangkok-christian-hospital,Patients are routinely over the last 20 years ...
8,ไปรับการรักษาโรครูมาตอยด์ที่โรงพยาบาลกรุงเทพคร...,5,bangkok-christian-hospital,To treat rheumatoid at the Bangkok Christian H...
9,บริการดี อธิบายละเอียด ใส่ใจคนไข้,5,bangkok-christian-hospital,Good service described patient care


Mount Drive เพื่อAuthorize เข้าถึง Folder ในDrive

In [208]:
from google.colab import drive
drive.mount('/content/gdrive')

import os
os.chdir('/content/gdrive/My Drive/DS532_Text_Analytic/Project_Hospital')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Save ข้อมูลเป็น csv

In [0]:
#backup data 
comments.to_csv("comment_bangkok-christian-hospital.csv", index=False)

In [0]:
from tqdm import tqdm_notebook

**Read data**

In [0]:
rama = pd.read_csv("comment_bangkok-christian-hospital.csv")
rama = rama.reset_index()
rama = rama.rename(columns={"index": "cid"})

Sentence Tokenize

In [212]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [0]:
from nltk import sent_tokenize

sentences = []
cids = []
for cid in rama["cid"]:
    s = sent_tokenize(rama.loc[cid, "en"])
    sentences = sentences + s
    cids = cids + [cid] * len(s)

**Sentence-level sentiment**

In [214]:
!pip install twython



In [0]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [0]:
def predict_sentiment(sentence):
  sa = SentimentIntensityAnalyzer()
  sentiment_distribution = sa.polarity_scores(sentence)
  score = sentiment_distribution.get("compound")
  
  if score >= 0.05:
    return "positive"
  elif score <= -0.05:
    return "negative"
  else:
    return "neutral"

In [217]:
import nltk
nltk.download("vader_lexicon")

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [218]:
sentiments = []
for sentence in tqdm_notebook(sentences, total=len(sentences)):
    sentiments.append(predict_sentiment(sentence))

HBox(children=(FloatProgress(value=0.0, max=315.0), HTML(value='')))




Parsing

In [219]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [0]:
from nltk import word_tokenize, pos_tag, RegexpParser

In [0]:
grammar = """NP: {<NN|NNS>+}
                 {<NN|NNS>+<CC><NN|NNS>+}"""
chunker = RegexpParser(grammar)
parse_trees = []
for sentence in sentences:
    parse_trees.append(chunker.parse(pos_tag(word_tokenize(sentence))))

In [222]:
parse_trees

[Tree('S', [('What', 'WP'), ('is', 'VBZ'), ('really', 'RB'), ('like', 'IN'), ('a', 'DT'), Tree('NP', [('lot', 'NN')]), ('of', 'IN'), Tree('NP', [('people', 'NNS')]), ('in', 'IN'), ('this', 'DT'), Tree('NP', [('hospital', 'NN')]), ('.', '.')]),
 Tree('S', [('Most', 'JJS'), ('smiling', 'JJ'), Tree('NP', [('Greetings', 'NNS'), ('care', 'NN'), ('patients', 'NNS')]), ('and', 'CC'), ('those', 'DT'), ('who', 'WP'), ('use', 'VBP'), ('the', 'DT'), Tree('NP', [('service', 'NN')]), ('were', 'VBD'), ('very', 'RB'), ('good', 'JJ'), ('.', '.')]),
 Tree('S', [Tree('NP', [('Talk', 'NN'), ('relax', 'NN')]), ('at', 'IN'), ('all', 'DT'), ('.', '.')]),
 Tree('S', [('Most', 'JJS'), Tree('NP', [('doctors', 'NNS')]), ('who', 'WP'), ('have', 'VBP'), ('had', 'VBD'), ('the', 'DT'), ('good-hearted', 'JJ'), Tree('NP', [('people', 'NNS')]), ('I', 'PRP'), ('have', 'VBP'), ('ever', 'RB'), ('come', 'VBN'), ('across', 'IN'), Tree('NP', [('someone', 'NN')]), ('who', 'WP'), ("'s", 'VBZ'), ('not', 'RB'), ('very', 'RB'), 

Extract Noun Phrase

In [0]:
def extract_np(parsed_tree):
    nps = []
    for subtree in parsed_tree.subtrees():
        if subtree.label() == 'NP':
            t = subtree
            t = " ".join(word for word, tag in t.leaves())
            nps.append(t)
    return nps

In [0]:
nps = []
for parse_tree in parse_trees:
    nps.append(extract_np(parse_tree))

In [225]:
nps

[['lot', 'people', 'hospital'],
 ['Greetings care patients', 'service'],
 ['Talk relax'],
 ['doctors', 'people', 'someone'],
 ['rooms', 'price'],
 [],
 ['reception', 'service', 'system'],
 ['nurse', 'doctor', 'facilities', 'hospital'],
 ['Doctors', 'hospital', 'thing', 'service'],
 ['service'],
 ['doctor', 'patient feels', 'thank'],
 ['lot', 'people', 'hospital'],
 ['Greetings care patients', 'service'],
 ['Talk relax'],
 ['doctors', 'people', 'someone'],
 ['rooms', 'price', 'midwife', 'nursing care'],
 ['price', 'treatment', 'bit'],
 [],
 [],
 ['Hospital doctors', 'care hospital parking lot', 'buildings', 'prices'],
 ['service', 'return service'],
 ['queues', 'mall adjacent', 'hospital'],
 ['time', 'test results', 'something', 'results'],
 ['eloquence', 'treatment', 'childhood'],
 ['hospital'],
 ['doctor', 'symptoms', 'leg pain', 'doctor', 'medication', 'line'],
 ['hospital', 'family service'],
 ['price', 'home care', 'ease', 'travel'],
 ['rain', 'emergency room', 'health insurance cl

**Put information into DataFrame**

In [0]:
df = pd.DataFrame({"cids": cids, 
                   "sentences": sentences, 
                   "sentiments": sentiments, 
                   "NP": nps})

In [227]:
df.head(10)

Unnamed: 0,cids,sentences,sentiments,NP
0,0,What is really like a lot of people in this ho...,positive,"[lot, people, hospital]"
1,0,Most smiling Greetings care patients and those...,positive,"[Greetings care patients, service]"
2,0,Talk relax at all.,positive,[Talk relax]
3,0,Most doctors who have had the good-hearted peo...,neutral,"[doctors, people, someone]"
4,0,The Department of Medicine Private rooms are O...,positive,"[rooms, price]"
5,1,Riga has to use here.,neutral,[]
6,1,Clean and good reception The service was with ...,positive,"[reception, service, system]"
7,1,A nurse and doctor There are facilities within...,neutral,"[nurse, doctor, facilities, hospital]"
8,1,Doctors are competent The hospital is a good t...,positive,"[Doctors, hospital, thing, service]"
9,1,I would recommend to use the service if they a...,negative,[service]


In [0]:
df['sentences'] = df['sentences'].str.replace("[^a-zA-Z#]", " ")

In [0]:
df['sentiments'] = df['sentiments'].str.replace("[^a-zA-Z#]", " ")

Split dataframe into positive and negative sentiment

In [230]:
pos = df[df["sentiments"] == "positive"].reset_index(drop=True)
pos_words = sum(pos["NP"].tolist(), [])
pos_words = [word.lower() for word in pos_words]
poshist = pd.DataFrame({"words": pos_words})
poshist = poshist.groupby("words").size().reset_index(name="#pos")

TypeError: ignored

In [231]:
neg = df[df["sentiments"] == "negative"].reset_index(drop=True)
neg_words = sum(neg["NP"].tolist(), [])
neg_words = [word.lower() for word in neg_words]
neghist = pd.DataFrame({"words": neg_words})
neghist = neghist.groupby("words").size().reset_index(name="#neg")

TypeError: ignored

**Word Cloud Representation**

In [0]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [0]:
def vizwordcloud(df):
    tuples = [tuple(x) for x in df.values]
    wordcloud = WordCloud().generate_from_frequencies(dict(tuples))
    plt.figure()
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()

In [234]:
vizwordcloud(poshist)

ValueError: ignored

In [0]:
vizwordcloud(neghist)

We saw 2 problems: 1) Singular and plural words were not combined and 2) words that should not be appeared in the wordcloud such as patients because of the co-reference resolution problem.

Let's transfrom selected words to their singular form because at this stage the number of words is not much already then add categories to filter out.

In [0]:
def plural2singular(word):
  if word == "doctors":
    return "doctor"
  elif word == "nurses":
    return "nurse"
  elif word == "clinics":
    return "clinic"
  elif word == "hospitals":
    return "hospital"
  elif word == "services":
    return "service"
  elif word == "staffs":
    return "staff"
  elif word == "treatments":
    return "treatment"
  elif word == "students":
    return "student"
  else:
    return word

In [0]:
pos_words = [plural2singular(word) for word in pos_words]
poshist = pd.DataFrame({"words": pos_words})
poshist = poshist.groupby("words").size().reset_index(name="#pos")

neg_words = [plural2singular(word) for word in neg_words]
neghist = pd.DataFrame({"words": neg_words})
neghist = neghist.groupby("words").size().reset_index(name="#neg")

In [0]:
vizwordcloud(poshist)

In [0]:
vizwordcloud(neghist)

In [0]:
def groups(word):
    if word in ("doctor", "nurse", "student"):
        return "staff"
    elif word in ("hospital", "clinic", "equipment", "place"):
        return "infrastructure"
    elif word in ("service", "care", "treatment", "surgery", "disease", "medical care"):
        return "service"
    elif word in ("queue", "appointment"):
        return "process"
    elif word in ("price", "claim"):
        return "finance"
    else:
        return "others"

In [0]:
poshist["category"] = poshist["words"].apply(lambda x: groups(x))
poscount = poshist.groupby("category").size().reset_index(name="#")
poscount = poscount[poscount["category"] != "others"].reset_index(drop=True)

In [0]:
neghist["category"] = neghist["words"].apply(lambda x: groups(x))
negcount = neghist.groupby("category").size().reset_index(name="#")
negcount = negcount[negcount["category"] != "others"].reset_index(drop=True)

**Plot Radar Chart**

In [0]:
poscount

In [0]:
negcount

In [0]:
import plotly.express as px
from plotly.offline import download_plotlyjs, init_notebook_mode,  plot

In [0]:
fig = px.line_polar(poscount, r="#", theta="category", line_close=True)
plot(fig)

In [0]:
fig_2 = px.line_polar(negcount, r="#", theta="category", line_close=True)
plot(fig_2)

Extracting Features from cleaned comments

In [0]:
from wordcloud import WordCloud,ImageColorGenerator
from PIL import Image
import urllib
import requests

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')

# bag-of-words feature matrix
bow = bow_vectorizer.fit_transform(df['sentences'])

df_bow = pd.DataFrame(bow.todense())

df_bow

In [0]:
#Term Frequency-Inverse Document Frequency (TF-IDF)
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf=TfidfVectorizer(max_df=0.90, min_df=2,max_features=1000,stop_words='english')

tfidf_matrix=tfidf.fit_transform(df['sentences'])

df_tfidf = pd.DataFrame(tfidf_matrix.todense())

df_tfidf

#Splitting our dataset into Training and Validation Set

In [0]:
train_bow = bow[:31962]
train_bow.todense()

In [0]:
#Using features from TF-IDF for training set
train_tfidf_matrix = tfidf_matrix[:31962]

train_tfidf_matrix.todense()

In [0]:
from sklearn.preprocessing import LabelEncoder
number = LabelEncoder ()
df['sentiments'] = number.fit_transform(df['sentiments'].astype('str'))
df

In [0]:
# Neutral label : 0 , Negative label : 1 , Positive  label : 2
import seaborn as sns
sns.countplot(df['sentiments'])
sns.despine()

In [0]:
#Splitting the data into training and validation set
from sklearn.model_selection import train_test_split

In [0]:
x_train_bow, x_valid_bow, y_train_bow, y_valid_bow = train_test_split(train_bow,df['sentiments'],test_size=0.3,random_state=0)

In [0]:
x_train_tfidf, x_valid_tfidf, y_train_tfidf, y_valid_tfidf = train_test_split(train_tfidf_matrix,df['sentiments'],test_size=0.3,random_state=0)

In [0]:
x_train_bow.shape

In [0]:
x_valid_bow.shape

In [0]:
y_train_bow.shape

In [0]:
y_valid_bow.shape

#Applying Machine Learning Models

In [0]:
from sklearn.metrics import f1_score

In [0]:
#Neural Network
from sklearn.neural_network import MLPClassifier
class_NN = MLPClassifier(hidden_layer_sizes=(8,8,8), activation='relu',solver='adam', max_iter=500)

In [0]:
class_NN.fit(x_train_bow,y_train_bow)

In [0]:
prediction_bow = class_NN.predict_proba(x_valid_bow)

prediction_bow

In [0]:
#Calculating the F1 score
# if prediction is equal to 2 else 1 where 2 is for positive comments and 1 for negative sentiment comments
prediction_int = prediction_bow[:,1]>=0.3

# converting the results to integer type
prediction_int = prediction_int.astype(np.int)
prediction_int

# calculating f1 score
log_bow = f1_score(y_valid_bow, prediction_int,average='micro')

log_bow

In [0]:
class_NN.fit(x_train_tfidf,y_train_tfidf)

In [0]:
prediction_tfidf = class_NN.predict_proba(x_valid_tfidf)

prediction_tfidf

In [0]:
prediction_int_tfidf = prediction_tfidf[:,1]>=0.3

prediction_int_tfidf = prediction_int.astype(np.int)
prediction_int_tfidf

# calculating f1 score
log_tfidf = f1_score(y_valid_tfidf, prediction_int_tfidf,average='micro')

log_tfidf

In [0]:
#XGBoost
from xgboost import XGBClassifier
model_bow = XGBClassifier(random_state=0,learning_rate=0.9)
model_bow.fit(x_train_bow, y_train_bow)
xgb = model_bow.predict_proba(x_valid_bow)

xgb

In [0]:
xgb = xgb[:,1] >= 0.3

# converting the results to integer type
xgb_int=xgb.astype(np.int)

# calculating f1 score
xgb_bow=f1_score(y_valid_bow,xgb_int,average='micro')

xgb_bow

In [0]:
model_tfidf = XGBClassifier(random_state=2,learning_rate=0.7)
model_tfidf.fit(x_train_tfidf, y_train_tfidf)
xgb_tfidf=model_tfidf.predict_proba(x_valid_tfidf)

xgb_tfidf

In [0]:
xgb_tfidf=xgb_tfidf[:,1]>=0.3

# converting the results to integer type
xgb_int_tfidf=xgb_tfidf.astype(np.int)

# calculating f1 score
score=f1_score(y_valid_tfidf,xgb_int_tfidf,average='micro')

score

In [0]:
#support vector machine
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import SVC
from sklearn.svm import LinearSVC, SVC
from numpy import *
svm = SVC(gamma='auto')
model_svc = LinearSVC()
svm = CalibratedClassifierCV(model_svc) 
svm.fit(x_train_bow,y_train_bow)

In [0]:
# แก้ปัญฆา predict_proba ไม่ได้ เนื่องจาก probability=False
import pickle
filename = 'linearSVC.sav'
pickle.dump(model, open(filename, 'wb'))

In [0]:
svm = pickle.load(open(filename, 'rb'))
svm_bow = svm.predict_proba(x_valid_bow)

svm_bow

In [0]:
svm_bow=svm_bow[:,1]>=0.3
# converting the results to integer type
svm_int_bow=svm_bow.astype(np.int)
# calculating f1 score
svm_score_bow=f1_score(y_valid_bow,svm_int_bow,average='micro')

svm_score_bow

In [0]:
svm.fit(x_train_tfidf,y_train_tfidf)
svm_tfidf = svm.predict_proba(x_valid_tfidf)

svm_tfidf

In [0]:
svm_tfidf=svm_tfidf[:,1]>=0.3

# converting the results to integer type
svm_int_tfidf=svm_tfidf.astype(np.int)

# calculating f1 score
svm_score_tfidf=f1_score(y_valid_tfidf,svm_int_tfidf,average='micro')

svm_score_tfidf

**Model Comparison**

In [0]:
# Bag-of-Words
Algo_1 = ['NeuralNetwork(Bag-of-Words)','XGBoost(Bag-of-Words)','support vector machine(Bag-of-Words)']

score_1 = [log_bow,xgb_bow,svm_score_bow]

compare_1 = pd.DataFrame({'Model':Algo_1,'F1_Score':score_1},index=[i for i in range(1,4)])

compare_1.T

In [0]:
# TF-IDF
Algo_2 = ['NeuralNetwork(TF-IDF)','XGBoost(TF-IDF)','support vector machine(TF-IDF)']

score_2 = [log_tfidf,score,svm_score_tfidf]

compare_2 = pd.DataFrame({'Model':Algo_2,'F1_Score':score_2},index=[i for i in range(1,4)])

compare_2.T

Evaluating the model

In [0]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [0]:
print (y_valid_bow)

In [0]:
print (y_valid_tfidf)

Comparison Graph

In [0]:
# Bag-of-Words
import seaborn as sns
plt.figure(figsize=(18,5))

sns.pointplot(x='Model',y='F1_Score',data=compare_1)

plt.title('Bag-of-Words')
plt.xlabel('MODEL')
plt.ylabel('SCORE')

plt.show()

In [0]:
# TF-IDF
plt.figure(figsize=(18,5))

sns.pointplot(x='Model',y='F1_Score',data=compare_2)

plt.title('TF-IDF')
plt.xlabel('MODEL')
plt.ylabel('SCORE')

plt.show()

Playing around with auto-sklearn

In [0]:
!apt-get install swig -y
!pip install Cython numpy

# sometimes you have to run the next command twice on colab
# I haven't figured out why
!pip install auto-sklearn
# ignore some annoying warnings for demonstrating auto-sklearn 
# shouldn't be done in real production
import numpy as np
np.warnings.filterwarnings('ignore')

In [0]:
import autosklearn.classification
import sklearn.model_selection
import sklearn.datasets
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [0]:
#bag-of-words
x_train_bow, x_valid_bow, y_train_bow, y_valid_bow = sklearn.model_selection.train_test_split(df_bow,df['sentiments'],test_size=0.3,random_state=0)

In [0]:
#tfidf
x_train_tfidf, x_valid_tfidf, y_train_tfidf, y_valid_tfidf = sklearn.model_selection.train_test_split(df_tfidf,df['sentiments'],test_size=0.3,random_state=0)

In [0]:
import autosklearn.classification
import sklearn.model_selection
import sklearn.datasets
import sklearn.metrics

x_train_bow, x_valid_bow, y_train_bow, y_valid_bow = \
        sklearn.model_selection.train_test_split(df_bow,df['sentiments'], random_state=1)
automl = autosklearn.classification.AutoSklearnClassifier()
automl.fit(x_train_bow, y_train_bow)
y_hat = automl.predict(x_valid_bow)
print("Accuracy score", sklearn.metrics.accuracy_score(y_valid_bow, y_hat))
print("", sklearn.metrics.confusion_matrix(y_valid_bow, y_hat))
print("", sklearn.metrics.classification_report(y_valid_bow, y_hat))