# This Notebook is to scrape anonymous reviews from students.


>Edited from 190111 till 190117

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import re
import time
import sqlite3
import pickle
import matplotlib
import matplotlib.pyplot as plt
import sys
from requests.exceptions import ConnectionError

pd.set_option('display.max_columns', 500)
pd.options.display.max_rows = 100
matplotlib.rcParams['font.family'] = 'AppleGothic'

%matplotlib inline

# Load in URL list into "urls" from "teachers_urls.txt"

In [2]:
f = open('teacher_urls.txt', 'rb')
urls = pickle.load(f)
f.close()

In [3]:
urls[:5]

['http://eikaiwa.dmm.com/teacher/index/398/',
 'http://eikaiwa.dmm.com/teacher/index/1117/',
 'http://eikaiwa.dmm.com/teacher/index/1446/',
 'http://eikaiwa.dmm.com/teacher/index/1677/',
 'http://eikaiwa.dmm.com/teacher/index/1897/']

# Scrape reviews of each teachers from their students and save them in the database file "ratings.db"

### commentごとのratingは、おそらくある時点以降に導入されたものと思われる。そのため過去のコメントではrating = 0が並んでいる。これらは後ほどNaNに置き換える予定。

In [7]:
def get_review(teacher_id, page):

    r =  requests.post("https://eikaiwa.dmm.com/teacher/tab_ajax_ratecomment/",
        data='teacher_id={}&page={}'.format(teacher_id, page),
        headers={
            "accept": "*/*",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "en-US,en;q=0.9",
            "authority": "eikaiwa.dmm.com",
            "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
            "origin": "https://eikaiwa.dmm.com",
            "referer": "https://eikaiwa.dmm.com/teacher/index/{}/".format(teacher_id),
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
            "x-requested-with": "XMLHttpRequest"
        },
        cookies={
            "AMP_TOKEN": "%24NOT_FOUND",
            "PHPSESSID": "d3b63b845f04033e8ba42225de244a88",
            "__utma": "225231877.1937935001.1545231753.1546318067.1547193623.3",
            "__utmb": "225231877.2.10.1547193623",
            "__utmc": "225231877",
            "__utmt": "1",
            "__utmz": "225231877.1547193623.3.2.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided)",
            "_dc_gtm_UA-48257133-1": "1",
            "_dga": "GA1.3.1937935001.1545231753",
            "_dga_gid": "GA1.3.1694366917.1547193623",
            "_fbp": "fb.1.1547193626615.72004840",
            "_gali": "ratecomment",
            "app_uid": "ygb3J1waXYgdkRfaDtkSAg==",
            "cX_G": "cx%3A3kv82phj0hx3mw4ofrjh1f8ur%3A3cguwtwuxac6i",
            "cX_P": "jpvb385rfrdh4crx",
            "cX_S": "jqrr4xovsbqzvxt7",
            "i3_ab": "7460"
        },
    )

    return r.text

In [8]:
# get the score of the comment form a student
# the score is represented by yellow or grey star images.

def get_rating_from_html(bs):
    
    rating = 0
        
    for i in range(5):     
        if bs.find_all("img")[i]["src"][77:-4] == "yellow":
            rating += 1
        elif bs.find_all("img")[i]["src"][77:-4] == "grey":
            pass
        else:
            print("Something is wrong about get_rating_from_html()")
            sys.exit()
    
    return rating

In [9]:
def convert_year_month_day(date):
    digit_date = date = date.replace("年",",").replace("月",",").replace("日","").split(",")
    digit_date = date[0] + date[1].zfill(2) + date[2].zfill(2)
    return digit_date
    

In [102]:
### Comments would be stored in the database file once per url

def scrape_and_store_reviews_to_db(url_list, first_index, last_index):
    
    ids_without_reviews = []
    
    # Dataframe where all comments for this teacher would be stored
    df_reviews = pd.DataFrame(columns=["teacher_id", "name","date","rating","review"])
    
    t = time.time()
    
    for i, each_teacher_url in enumerate(url_list[ first_index:last_index ]):
    
        # print out the progress
        #if i % 30 == 0 : print(i)
            
        try:
            # extract this teacher's id from url
            teacher_id = each_teacher_url[37:-1]

            # get the comment counts portion like 88 in "コメント 全88件"
            # and get how many rating comments have been posted on this teacher
            bs = BeautifulSoup( get_review(teacher_id, 2) )
            num_comments = int(bs.find("p",{"class","total_comments"}).getText()[6:-1])
            
            # skip scraping if this teacher has no review like "コメント 全0件"
            if num_comments < 1:
                #print(i, "reviews not found", each_teacher_url)
                ids_without_reviews.append(teacher_id)
                
            else:
                # calcaulate how many pages I have to scrape like "18 pages"
                all_pages = (num_comments+4) // 5
                #print(all_pages, num_comments)

                review_count = 0

                # loop through each of pages
                for page in range(1, all_pages+1):

                    bs = BeautifulSoup( get_review(teacher_id, page) )

                    for j in range(5):
                        try:
                            # Series where each of comments would be scored.
                            each_review = pd.Series(index=["teacher_id", "name","date","rating","review"], )
                            review_block = bs.find_all("dl",{"class","comments"})[j]

                            # store review information in the Series "each_review"
                            each_review["teacher_id"] = teacher_id
                            each_review["name"] = review_block.find("span",{"class":"c_name"}).getText()
                            each_review["date"] = review_block.find("span",{"class":"c_time"}).getText()
                            each_review["rating"] = get_rating_from_html(review_block)
                            each_review["review"] = review_block.find("dd").getText().replace("\n"," ").replace("\r"," ")

                            # store this Series "each_review" into "df_reviews"
                            df_reviews = pd.concat([df_reviews, pd.DataFrame([each_review], columns=["teacher_id", "name","date","rating","review"])])

                            # break when all the comments were scraped
                            review_count += 1
                            if (review_count == num_comments):
                                break

                        except Exception as e:
                            print(i,e,each_teacher_url)
                            break
                            
                            
                    time.sleep(1)

                assert review_count == num_comments
        
        # Terinate without writing file if the connection fails in the middle
        except Exception as e:
            print("{} {} {} [in line {}]".format(i, e, each_teacher_url, sys.exc_info()[-1].tb_lineno))
            print("\nDownload interrupted in the middle of {}th iteration.".format(i))
            break
        
    print(time.time() - t)
    
    # convert the type of "teacher_id" into int
    df_reviews.teacher_id = df_reviews.teacher_id.astype(int)

    # convert "date" cell (like "2016年7月8日") into datetime(like "2016-07-08")
    df_reviews.date = pd.to_datetime(df_reviews.date.apply(convert_year_month_day))
            
    # store reviews only when this part is reached to
    print("\nThese {} teachers had no reviews. : {}".format(len(ids_without_reviews), ids_without_reviews))
    conn = sqlite3.connect("reviews.db")
    df_reviews.to_sql("reviews", conn, if_exists="append", index=False)
    print("Reviews for teachers from {} ~ {} were written in the database file".format(first_index, first_index + i -1))

In [269]:
scrape_and_store_reviews_to_db(urls, 0, 100)

0
20
40
54 reviews not found http://eikaiwa.dmm.com/teacher/index/16208/
60
77 reviews not found http://eikaiwa.dmm.com/teacher/index/22149/
80
86 reviews not found http://eikaiwa.dmm.com/teacher/index/25921/
87 reviews not found http://eikaiwa.dmm.com/teacher/index/25982/
88 reviews not found http://eikaiwa.dmm.com/teacher/index/26033/
92 reviews not found http://eikaiwa.dmm.com/teacher/index/26289/
96 reviews not found http://eikaiwa.dmm.com/teacher/index/26709/
5213.241311073303

All data was written in the database file


In [270]:
scrape_and_store_reviews_to_db(urls, 100, 200)

0
0 reviews not found http://eikaiwa.dmm.com/teacher/index/26849/
2 reviews not found http://eikaiwa.dmm.com/teacher/index/26878/
8 reviews not found http://eikaiwa.dmm.com/teacher/index/27023/
10 reviews not found http://eikaiwa.dmm.com/teacher/index/27154/
15 reviews not found http://eikaiwa.dmm.com/teacher/index/27248/
16 reviews not found http://eikaiwa.dmm.com/teacher/index/27252/
17 reviews not found http://eikaiwa.dmm.com/teacher/index/27253/
18 reviews not found http://eikaiwa.dmm.com/teacher/index/27258/
19 reviews not found http://eikaiwa.dmm.com/teacher/index/27275/
20
23 reviews not found http://eikaiwa.dmm.com/teacher/index/27297/
24 reviews not found http://eikaiwa.dmm.com/teacher/index/27298/
25 reviews not found http://eikaiwa.dmm.com/teacher/index/27299/
31 reviews not found http://eikaiwa.dmm.com/teacher/index/27336/
32 reviews not found http://eikaiwa.dmm.com/teacher/index/27395/
37 reviews not found http://eikaiwa.dmm.com/teacher/index/27411/
40
40 reviews not found

In [298]:
scrape_and_store_reviews_to_db(urls, 201, 300)

0
10
20
30
40
50
60
70
80
90
4625.848479986191

These 0 teachers had no reviews. : []

All data was written in the database file


In [301]:
scrape_and_store_reviews_to_db(urls, 301, 400)

0
10
20
30
40
50
60
70
80
90
4469.3694660663605

These 0 teachers had no reviews. : []

All data was written in the database file


In [302]:
scrape_and_store_reviews_to_db(urls, 400, 500)

0
10
20
30
40
50
60
70
80
90
5077.633849143982

These 0 teachers had no reviews. : []

All data was written in the database file


In [307]:
scrape_and_store_reviews_to_db(urls, 500, 600)

0
10
20
30
40
50
60
70
80
90
3526.9483132362366

These 0 teachers had no reviews. : []

All data was written in the database file


In [315]:
scrape_and_store_reviews_to_db(urls, 600, 700)

0
30
60
90
4897.000339984894

These 0 teachers had no reviews. : []

All data was written in the database file


In [319]:
scrape_and_store_reviews_to_db(urls, 700, 800)

0
30
60
90
5075.793924808502

These 0 teachers had no reviews. : []

All data was written in the database file


In [322]:
scrape_and_store_reviews_to_db(urls, 800, 900)

3038.800406932831

These 0 teachers had no reviews. : []

All data was written in the database file


In [326]:
scrape_and_store_reviews_to_db(urls, 900, 1000)

5051.813581228256

These 1 teachers had no reviews. : ['27316']

All data was written in the database file


In [337]:
scrape_and_store_reviews_to_db(urls, 1000, 1100)

2850.2720379829407

These 0 teachers had no reviews. : []

All data was written in the database file


In [372]:
scrape_and_store_reviews_to_db(urls, 1100, 1200)

3352.749478816986

These 1 teachers had no reviews. : ['27418']
All data was written in the database file


In [375]:
scrape_and_store_reviews_to_db(urls, 1200, 1300)

5333.891016960144

These 0 teachers had no reviews. : []
All data was written in the database file


In [11]:
scrape_and_store_reviews_to_db(urls, 1300, 1700)

135 HTTPSConnectionPool(host='eikaiwa.dmm.com', port=443): Max retries exceeded with url: /teacher/tab_ajax_ratecomment/ (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x11fda9ba8>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known',)) http://eikaiwa.dmm.com/teacher/index/3447/

Download interrupted after 135 iteration.
4921.938588142395

These 0 teachers had no reviews. : []
Reviews for teachers from 1300 ~ 1435 were written in the database file


In [76]:
scrape_and_store_reviews_to_db(urls, 1435, 1436)

64.2096619606018

These 0 teachers had no reviews. : []
Reviews for teachers from 1435 ~ 1434 were written in the database file


In [15]:
scrape_and_store_reviews_to_db(urls, 1436, 1700)

11615.472882032394

These 2 teachers had no reviews. : ['23270', '27442']
Reviews for teachers from 1436 ~ 1699 were written in the database file


In [16]:
scrape_and_store_reviews_to_db(urls, 1700, 2000)

115 ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response',)) http://eikaiwa.dmm.com/teacher/index/24071/

Download interrupted after 115 iteration.
3922.6277520656586

These 0 teachers had no reviews. : []
Reviews for teachers from 1700 ~ 1815 were written in the database file


In [44]:
scrape_and_store_reviews_to_db(urls, 1815, 1816)

31.038333892822266

These 0 teachers had no reviews. : []
Reviews for teachers from 1815 ~ 1815 were written in the database file


In [23]:
bscrape_and_store_reviews_to_db(urls, 1816, 2000)

110 list index out of range http://eikaiwa.dmm.com/teacher/index/12621/
110 list index out of range http://eikaiwa.dmm.com/teacher/index/12621/
110 list index out of range http://eikaiwa.dmm.com/teacher/index/12621/
110 list index out of range http://eikaiwa.dmm.com/teacher/index/12621/
110 list index out of range http://eikaiwa.dmm.com/teacher/index/12621/
110 list index out of range http://eikaiwa.dmm.com/teacher/index/12621/
110  http://eikaiwa.dmm.com/teacher/index/12621/

Download interrupted after 110 iteration.
4425.486377000809

These 1 teachers had no reviews. : ['27274']
Reviews for teachers from 1816 ~ 1926 were written in the database file


In [57]:
scrape_and_store_reviews_to_db(urls, 1926, 1927)

101.3224949836731

These 0 teachers had no reviews. : []
Reviews for teachers from 1926 ~ 1926 were written in the database file


In [29]:
scrape_and_store_reviews_to_db(urls, 1927, 2000)

19 list index out of range http://eikaiwa.dmm.com/teacher/index/15956/
19 list index out of range http://eikaiwa.dmm.com/teacher/index/15956/
19 list index out of range http://eikaiwa.dmm.com/teacher/index/15956/
19 list index out of range http://eikaiwa.dmm.com/teacher/index/15956/
19 list index out of range http://eikaiwa.dmm.com/teacher/index/15956/
19 ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response',)) http://eikaiwa.dmm.com/teacher/index/15956/

Download interrupted after 19 iteration.
695.4262430667877

These 0 teachers had no reviews. : []
Reviews for teachers from 1927 ~ 1946 were written in the database file


In [74]:
scrape_and_store_reviews_to_db(urls, 1946, 2000)

1741.3793730735779

These 0 teachers had no reviews. : []
Reviews for teachers from 1946 ~ 1999 were written in the database file


In [379]:
scrape_and_store_reviews_to_db(urls, 2000, 2500)

148 HTTPSConnectionPool(host='eikaiwa.dmm.com', port=443): Max retries exceeded with url: /teacher/tab_ajax_ratecomment/ (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x12f0ff588>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known',)) http://eikaiwa.dmm.com/teacher/index/12776/

Download interrupted after {} iteration.
5679.514086961746

These 2 teachers had no reviews. : ['24765', '27495']
Reviews for teachers from 2000 ~ 2148 were written in the database file


In [80]:
scrape_and_store_reviews_to_db(urls, 2148, len(urls))

214 HTTPSConnectionPool(host='eikaiwa.dmm.com', port=443): Max retries exceeded with url: /teacher/tab_ajax_ratecomment/ (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x14704c978>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known',)) http://eikaiwa.dmm.com/teacher/index/16556/

Download interrupted after 214 iteration.
7032.174710988998

These 1 teachers had no reviews. : ['27615']
Reviews for teachers from 2148 ~ 2362 were written in the database file


In [81]:
scrape_and_store_reviews_to_db(urls, 2361, len(urls))

420 HTTPSConnectionPool(host='eikaiwa.dmm.com', port=443): Max retries exceeded with url: /teacher/tab_ajax_ratecomment/ (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x141e2ee10>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known',)) http://eikaiwa.dmm.com/teacher/index/22315/

Download interrupted after 420 iteration.
15746.42124080658

These 1 teachers had no reviews. : ['27429']
Reviews for teachers from 2361 ~ 2781 were written in the database file


In [82]:
scrape_and_store_reviews_to_db(urls, 2780, len(urls))

112 list index out of range http://eikaiwa.dmm.com/teacher/index/10350/
112 list index out of range http://eikaiwa.dmm.com/teacher/index/10350/
112  http://eikaiwa.dmm.com/teacher/index/10350/

Download interrupted after 112 iteration.
2708.8592319488525

These 1 teachers had no reviews. : ['27480']
Reviews for teachers from 2780 ~ 2892 were written in the database file


In [88]:
scrape_and_store_reviews_to_db(urls, 2892, len(urls))

119 HTTPSConnectionPool(host='eikaiwa.dmm.com', port=443): Max retries exceeded with url: /teacher/tab_ajax_ratecomment/ (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x135d60cf8>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known',)) http://eikaiwa.dmm.com/teacher/index/7088/ [in line 41]

Download interrupted in the middle of 119th iteration.
5025.752235174179

These 0 teachers had no reviews. : []
Reviews for teachers from 2892 ~ 3010 were written in the database file


In [107]:
scrape_and_store_reviews_to_db(urls, 3010, len(urls))

174 HTTPSConnectionPool(host='eikaiwa.dmm.com', port=443): Max retries exceeded with url: /teacher/tab_ajax_ratecomment/ (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x151251240>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known',)) http://eikaiwa.dmm.com/teacher/index/17522/ [in line 41]

Download interrupted in the middle of 174th iteration.
7173.257016658783

These 0 teachers had no reviews. : []
Reviews for teachers from 3010 ~ 3183 were written in the database file


In [108]:
scrape_and_store_reviews_to_db(urls, 3183, len(urls))

332 HTTPSConnectionPool(host='eikaiwa.dmm.com', port=443): Max retries exceeded with url: /teacher/tab_ajax_ratecomment/ (Caused by SSLError(SSLError("bad handshake: SysCallError(50, 'ENETDOWN')",),)) http://eikaiwa.dmm.com/teacher/index/4973/ [in line 41]

Download interrupted in the middle of 332th iteration.
9400.190793275833

These 10 teachers had no reviews. : ['22042', '26723', '27291', '27417', '27571', '27063', '27127', '27499', '26771', '27531']
Reviews for teachers from 3183 ~ 3514 were written in the database file


In [109]:
scrape_and_store_reviews_to_db(urls, 3514, len(urls))

315 HTTPSConnectionPool(host='eikaiwa.dmm.com', port=443): Max retries exceeded with url: /teacher/tab_ajax_ratecomment/ (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x15128a978>: Failed to establish a new connection: [Errno 51] Network is unreachable',)) http://eikaiwa.dmm.com/teacher/index/19929/ [in line 41]

Download interrupted in the middle of 315th iteration.
8871.986421823502

These 4 teachers had no reviews. : ['16862', '25708', '27460', '27038']
Reviews for teachers from 3514 ~ 3828 were written in the database file


In [111]:
scrape_and_store_reviews_to_db(urls, 3828, len(urls))

712 ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response',)) http://eikaiwa.dmm.com/teacher/index/988/ [in line 41]

Download interrupted in the middle of 712th iteration.
14655.183600902557

These 31 teachers had no reviews. : ['26502', '27115', '27340', '27659', '26835', '27077', '27399', '27560', '27569', '26605', '27070', '27397', '27536', '16979', '19208', '24796', '24616', '24722', '26354', '27526', '27318', '27681', '17673', '27122', '27263', '27432', '27260', '27596', '27089', '27507', '27665']
Reviews for teachers from 3828 ~ 4539 were written in the database file


In [113]:
scrape_and_store_reviews_to_db(urls, 4539, len(urls))

1171.2078258991241

These 95 teachers had no reviews. : ['26404', '26506', '27177', '27288', '27037', '17428', '27064', '27421', '26651', '27544', '27567', '27679', '27370', '26900', '26882', '27236', '27269', '27557', '27576', '27579', '27586', '27589', '27594', '27598', '27600', '27603', '27605', '27608', '27609', '27616', '27617', '27619', '27620', '27621', '27622', '27635', '27636', '27638', '27641', '27648', '27650', '27651', '27652', '27653', '27656', '27660', '27661', '27664', '27667', '27668', '27675', '27678', '27700', '27701', '27708', '27709', '27712', '27713', '27715', '27716', '27721', '27722', '27723', '27724', '27727', '27730', '27731', '27732', '27734', '27735', '27736', '27738', '27739', '27742', '27745', '27747', '27749', '27752', '27754', '27755', '27757', '27758', '27759', '27760', '27761', '27762', '27765', '27766', '27767', '27777', '27779', '27780', '27783', '27786', '27787']
Reviews for teachers from 4539 ~ 4741 were written in the database file


# Load in row teacher info to DataFrame "teachers" from a database file

In [172]:
# load teachers' data from database file to dataframe

conn = sqlite3.connect("reviews.db")
reviews = pd.read_sql("SELECT * FROM reviews",conn)
print(reviews.shape, reviews.teacher_id.nunique())
teachers.head()

(314505, 5) 4558


Unnamed: 0,teacher_id,name,date,rating,review
0,398,RIN,2019-01-05 00:00:00,5,素晴らしい先生です！！
1,398,MAKO,2018-12-21 00:00:00,5,知的で穏やか、そしてとてもお優しく素敵な先生でした。レッスンもテキパキと進めてくださり充実し...
2,398,sunamin,2018-12-18 00:00:00,5,定期的にJules先生のレッスン取っています。とても穏やかで優しい講師だと思います。
3,398,Coco,2018-12-11 00:00:00,5,話しやすく明るい素敵な先生でした。(^^♪
4,398,Cindy,2018-12-05 00:00:00,5,優しくて、話しやすい。発音も聞き取りやすく、生徒の目的に応じたレッスンを提供しようという思い...


# Drop duplicated rows from reviews dataframe / replace 0 to np.nan in rating
# Then store it in "database2.db"

In [180]:
# remove duplicated rows
reviews2 = reviews.copy().drop_duplicates()
reviews2.rating = reviews2.rating.replace(0,np.nan)

# I have found that if one's name is "Nan", then it was taken as null value.
# So I replace them in the "name" column.

reviews2.name = reviews2.name.copy().replace(np.nan, "noname")

reviews2.shape

# save the cleaned dataframe "reviews2" in "database2.db"
conn = sqlite3.connect("reviews2.db")
reviews2.to_sql("reviews", conn, if_exists="replace", index=False)

In [181]:
# just check dataframe without duplicates is in "database2.db"
conn = sqlite3.connect("reviews2.db")
reviews_check = pd.read_sql("SELECT * FROM reviews",conn)
print(reviews_check.shape)

(313058, 5)
