In [1]:
import os
import sys
from pathlib import Path
import re
import pandas as pd
import numpy as np
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from src.database_handler import DatabaseHandler
from src.utils import timer

import matplotlib.pyplot as plt
import matplotlib

import random
import spacy
# GPU support only works if:
# 1. CUDA is installed: https://developer.nvidia.com/cuda-downloads?target_os=Windows&target_arch=x86_64&target_version=10&target_type=exelocal
# 2. spaCy is installed together with CuPy, either via `pip install -U spacy[cuda111]`
#    (replace `cuda111` with the CUDA version you use)
#    or from the prebuilt wheels at https://www.lfd.uci.edu/~gohlke/pythonlibs/#cupy
#    and https://www.lfd.uci.edu/~gohlke/pythonlibs/#spacy
spacy.prefer_gpu()
spacy.require_gpu()
from spacy.util import minibatch, compounding

In [2]:
# Shell escape: print the version of the installed CUDA compiler (nvcc).
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Tue_Sep_15_19:12:04_Pacific_Daylight_Time_2020
Cuda compilation tools, release 11.1, V11.1.74
Build cuda_11.1.relgpu_drvr455TC455_06.29069683_0


In [3]:
# Display the installed spaCy version (the code below uses the spaCy 2.x training API).
spacy.__version__

'2.3.5'

In [4]:
# CuPy: print the CuPy configuration report (CuPy/NumPy versions, CUDA root,
# CUDA library versions, detected devices) to verify the GPU setup.
import cupy
cupy.show_config()

OS                           : Windows-10-10.0.18362-SP0
CuPy Version                 : 9.0.0b1
NumPy Version                : 1.18.3
SciPy Version                : 1.5.4
Cython Build Version         : 0.29.21
Cython Runtime Version       : None
CUDA Root                    : C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1
CUDA Build Version           : 11010
CUDA Driver Version          : 11020
CUDA Runtime Version         : 11010
cuBLAS Version               : 11201
cuFFT Version                : 10300
cuRAND Version               : 10202
cuSOLVER Version             : (11, 0, 0)
cuSPARSE Version             : 11200
NVRTC Version                : (11, 1)
Thrust Version               : 100910
CUB Build Version            : 100910
Jitify Build Version         : <unknown>
cuDNN Build Version          : None
cuDNN Version                : None
NCCL Build Version           : None
NCCL Runtime Version         : None
cuTENSOR Version             : None
Device 0 Name                

In [5]:
class CommentEvaluator:
    """Fetch review/tip text from the project database and label its sentiment.

    The class wraps a ``DatabaseHandler`` and exposes helpers to read reviews
    and tips as pandas DataFrames, plus a simple keyword-based sentiment
    labeller. Creating an instance also tries to add a ``sentiment`` column to
    the ``reviews`` table.
    """

    def __init__(self):
        # Page size and running offset used by querry_tip_data().
        self.__batch_limit = 100000
        self.__offset = 0
        self.__handler = DatabaseHandler()
        # Short aliases for the handler methods used below.
        self.__query = self.__handler.querry_database
        self.__insert_many = self.__handler.insert_many
        self.__cursor_init = self.__handler.init_fetch_many_cursor
        self.__fetch_batch = self.__handler.fetch_cursor_batch
        self.__create_eval_column()
        print("Ready to go!")

    def __del__(self):
        # The handler may be missing if __init__ failed early or was bypassed;
        # in that case there is nothing to close.
        try:
            handler = self.__handler
        except AttributeError:
            return
        handler.close_database()

    def get_review_data(self, limit: int = 100000):
        """Start a cursor over ``reviews`` and return its first batch.

        Args:
            limit: Value bound to the SQL ``LIMIT`` clause.

        Returns:
            DataFrame with columns ``rid``, ``star`` and ``text``.
        """
        sql = """Select rid, star, text from reviews limit %(limit)s"""
        opt = {"limit": limit}
        self.__cursor_init(sql, opt)
        data = self.__fetch_batch()
        df_data = pd.DataFrame(data, columns=["rid", "star", "text"])
        return df_data

    def querry_review_data(self):
        """Return the next cursor batch as a ``rid``/``star``/``text`` DataFrame.

        NOTE(review): presumably requires get_review_data() to have been called
        first so that a cursor exists - confirm against DatabaseHandler.
        """
        return pd.DataFrame(self.__fetch_batch(), columns=["rid", "star", "text"])

    def querry_tip_data(self):
        """Return the next page of tips and advance the internal offset.

        Pages hold up to ``self.__batch_limit`` rows ordered by ``tid::int``.
        When a page comes back empty the offset is reset to 0, so the next
        call starts again from the first page.

        Returns:
            DataFrame with columns ``tid`` and ``text`` (empty when exhausted).
        """
        sql = """Select tid, text from tips order by tid::int limit %(limit)s offset %(offset)s"""
        opt = {"limit": self.__batch_limit, "offset": self.__offset}
        data = self.__query(sql, opt)
        df_data = pd.DataFrame(data, columns=["tid", "text"])
        self.__offset += self.__batch_limit
        if len(df_data) == 0:
            self.__offset = 0
        return df_data

    def add_sentiment_data(self, df):
        """Add a keyword-based ``sentiment`` column to ``df`` in place.

        Every row starts as ``"Neutral"``. Rows whose ``text`` contains one of
        the positive keywords become ``"Good"``; rows containing one of the
        negative keywords become ``"Bad"``. Negative keywords win when both
        match. Matching is case-insensitive substring matching. Rows with
        missing text stay ``"Neutral"``.

        Args:
            df: DataFrame with a string column ``text``; modified in place.
        """
        good_words = "love|good|awesome|best|pleasant|nice|good|delicious|favorite|wonderful|amazing|great|nice|helpful"
        bad_words = "horrified|never again|horrible|terrible|very poor|bad|worst|angry|dirty|crappy|overpriced|rip off|scammer|scammers"
        # na=False keeps the masks strictly boolean when text is missing.
        good_mask = df["text"].str.contains(good_words, case=False, regex=True, na=False)
        bad_mask = df["text"].str.contains(bad_words, case=False, regex=True, na=False)
        df["sentiment"] = "Neutral"
        # Use .loc instead of chained indexing so the assignment always hits df.
        df.loc[good_mask, "sentiment"] = "Good"
        df.loc[bad_mask, "sentiment"] = "Bad"

    def __create_eval_column(self):
        """Try to add the ``sentiment`` column to ``reviews``; print any error."""
        sql = """ALTER TABLE reviews ADD COLUMN sentiment varchar(20)"""
        try:
            self.__query(sql)
            print('Added column "sentiment" to "reviews"')
        # will throw Exception if col already exists
        except Exception as exep:
            print(exep)

In [6]:
# Create the evaluator. Its constructor builds a DatabaseHandler and tries to
# add the "sentiment" column to "reviews" (printing the error if that fails).
ce = CommentEvaluator()

column "sentiment" of relation "reviews" already exists

Ready to go!


In [None]:
# Load review rows (rid, star, text) with the default limit and preview them.
rdf = ce.get_review_data()
rdf.head()

In [7]:
# Derive training labels from the star rating: 3 or more stars counts as
# positive, fewer than 3 as negative; "sentiment" is the positive flag as 0/1.
rdf["pos"] = rdf["star"].ge(3)
rdf["neg"] = rdf["star"].lt(3)
rdf["sentiment"] = rdf["pos"].astype(int)

rdf.head()

Unnamed: 0,rid,star,text,pos,neg,sentiment
0,Xl-QLVwmxTp-LSUoeL2h0A,3,"This was my first time at this location, but I...",True,False,1
1,cUOVNgHnd2POaXQAUiHtAg,5,The entire experience at Smile Reef Dental is ...,True,False,1
2,N7QyxScUMA4J0cGwxXr7gQ,4,Safest lunch ayce in the hood. Best chicken wi...,True,False,1
3,YkoEIFKUOsprypvTQiZpEA,5,I think Prime deserves a 5 for their yummy int...,True,False,1
4,Q7O8GhmOYr5MX2jcQZAbMQ,4,"Growing up in socal, inn-n-out is the standard...",True,False,1


In [5]:
# text = rdf.iloc[0]["text"]
# nlp = spacy.load("en_core_web_sm")
# doc = nlp(text)
# token_list = [token for token in doc]
# filtered_tokens = [token for token in doc if not token.is_stop]
# filtered_tokens

In [6]:
# lemmas = [
#    f"Token: {token}, lemma: {token.lemma_}"
#     for token in filtered_tokens
# ]
# lemmas

In [7]:
def define_training_data(
    df: pd.DataFrame,
    split: float = 0.8,
    limit: int = 0,
    seed: int = None
) -> tuple:
    """Build shuffled train/test lists of ``(text, label)`` pairs.

    Each label has the form ``{"cats": {"pos": ..., "neg": ...}}``, taken from
    the row's ``pos`` and ``neg`` values. ``"<br />"`` in the text is replaced
    by a blank line. Rows whose text is not a string (e.g. None/NaN) or is
    only whitespace are skipped.

    Args:
        df: DataFrame with columns ``text``, ``pos`` and ``neg``.
        split: Fraction of the examples that goes into the training list.
        limit: If non-zero, keep only this many examples after shuffling.
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` state is used.

    Returns:
        Tuple ``(training_examples, test_examples)``.
    """
    reviews = []
    # Iterate over the columns directly; much cheaper than DataFrame.iterrows().
    for text, pos, neg in zip(df["text"], df["pos"], df["neg"]):
        if not isinstance(text, str):
            # Missing text cannot be cleaned or used for training.
            continue
        text = text.replace("<br />", "\n\n")
        if text.strip():
            reviews.append((text, {"cats": {"pos": pos, "neg": neg}}))

    if seed is None:
        random.shuffle(reviews)
    else:
        # A private generator keeps the global random state untouched.
        random.Random(seed).shuffle(reviews)

    if limit:
        reviews = reviews[:limit]
    split_index = int(len(reviews) * split)
    return reviews[:split_index], reviews[split_index:]

In [8]:
def evaluate_model(tokenizer, textcat, test_data: list) -> dict:
    """Compute precision, recall and F-score of ``textcat`` for the "pos" label.

    Args:
        tokenizer: Callable that turns a text into the object ``textcat.pipe``
            consumes.
        textcat: Object whose ``pipe`` method yields one result per input, each
            with a ``cats`` mapping from label to score.
        test_data: List of ``(text, {"cats": {"pos": ..., "neg": ...}})`` pairs.

    Returns:
        Dict with keys ``"precision"``, ``"recall"`` and ``"f-score"``. A
        metric whose denominator is zero (including empty ``test_data``) is
        reported as 0.0.
    """
    if not test_data:
        # zip(*[]) cannot be unpacked; there is nothing to score.
        return {"precision": 0.0, "recall": 0.0, "f-score": 0.0}

    texts, labels = zip(*test_data)
    docs = (tokenizer(text) for text in texts)
    true_positives = 0
    false_positives = 0
    true_negatives = 0  # counted for completeness; not part of the metrics
    false_negatives = 0
    for doc, label in zip(textcat.pipe(docs), labels):
        gold = label["cats"]
        # The "pos" score alone carries all the information that is needed.
        score = doc.cats.get("pos")
        if score is None:
            continue
        if score >= 0.5:
            if gold["pos"]:
                true_positives += 1
            elif gold["neg"]:
                false_positives += 1
        else:
            if gold["neg"]:
                true_negatives += 1
            elif gold["pos"]:
                false_negatives += 1

    predicted_positive = true_positives + false_positives
    actual_positive = true_positives + false_negatives
    precision = true_positives / predicted_positive if predicted_positive else 0.0
    recall = true_positives / actual_positive if actual_positive else 0.0

    if precision + recall == 0:
        f_score = 0.0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"precision": precision, "recall": recall, "f-score": f_score}

In [9]:
def train_model(
    training_data: list,
    test_data: list,
    iterations: int = 20,
    output_dir: str = "model_artifacts"
) -> None:
    """Train a "pos"/"neg" text categorizer on top of ``en_core_web_sm`` and save it.

    Uses the spaCy 2.x training API (``create_pipe`` / ``begin_training`` /
    ``nlp.update``). Only the ``textcat`` component is trained; all other
    pipeline components are disabled during training. After every iteration
    the loss and the metrics from ``evaluate_model`` are printed.

    Args:
        training_data: List of ``(text, {"cats": {...}})`` pairs. The list is
            copied before shuffling, so the caller's list is left unchanged.
        test_data: List of pairs in the same format, used for evaluation.
        iterations: Number of passes over the training data.
        output_dir: Directory the trained pipeline is written to.
    """
    # Build pipeline
    nlp = spacy.load("en_core_web_sm")
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe("textcat", config={"architecture": "simple_cnn"})
        nlp.add_pipe(textcat, last=True)
    else:
        textcat = nlp.get_pipe("textcat")

    textcat.add_label("pos")
    textcat.add_label("neg")

    # Train only textcat
    training_excluded_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]

    # Shuffle a private copy so the caller's list is not reordered.
    training_examples = list(training_data)

    with nlp.disable_pipes(training_excluded_pipes):
        optimizer = nlp.begin_training()
        # Training loop
        print("Beginning training")
        print("Loss\tPrecision\tRecall\tF-score")
        batch_sizes = compounding(4.0, 32.0, 1.001)  # A generator that yields infinite series of input numbers
        for _ in range(iterations):
            loss = {}
            random.shuffle(training_examples)
            batches = minibatch(training_examples, size=batch_sizes)
            for batch in batches:
                text, labels = zip(*batch)
                nlp.update(text, labels, drop=0.2, sgd=optimizer, losses=loss)
            # Evaluate with the optimizer's averaged parameters.
            with textcat.model.use_params(optimizer.averages):
                evaluation_results = evaluate_model(
                    tokenizer=nlp.tokenizer,
                    textcat=textcat,
                    test_data=test_data
                )
                # loss has no "textcat" entry when no batch was processed.
                print(
                    f"{loss.get('textcat', 0.0):.4f}\t{evaluation_results['precision']:.5f}"
                    f"\t\t{evaluation_results['recall']:.5f}"
                    f"\t{evaluation_results['f-score']:.5f}"
                )
    # Save model
    with nlp.use_params(optimizer.averages):
        nlp.to_disk(output_dir)

In [10]:
def test_model(loaded_model, input_data: str, printdata=False):
    # Generate prediction
    parsed_text = loaded_model(input_data)
    # Determine prediction to return
    if parsed_text.cats["pos"] > parsed_text.cats["neg"]:
        prediction = "Positive"
        score = parsed_text.cats["pos"]
    else:
        prediction = "Negative"
        score = parsed_text.cats["neg"]
    if printdata:
        print(
            f"Review text: {input_data}\nPredicted sentiment: {prediction}"
            f"\tScore: {score}"
        )
    return [prediction, score]

In [11]:
# testtext = 'Great place!  We went at night and the place was bouncing with people... Mostly a young crowd.. Not a family place at night.. More like a night club.  We went for food,  ordered the carne asada street tacos,  and the Jefe Burger.  Both just perfect.  The tacos were at point,  juicy seasoned meat on small corn tortillas.  The meat on the Burger was tender and pretty juicy. Friendly staff. I would return!'

# train, test = define_training_data(rdf, limit=20000)
# train_model(train, test)
# print("Testing model")
# loaded_model = spacy.load("model_artifacts")
# test_model(loaded_model, testtext, True)

Beginning training
Loss	Precision	Recall	F-score
10.4908	0.91425		0.94591	0.92981
0.1099	0.92931		0.94525	0.93721
0.0494	0.93922		0.94294	0.94108
0.0457	0.93924		0.94327	0.94125
0.0381	0.94266		0.93799	0.94032
0.0343	0.93910		0.94096	0.94003
0.0303	0.94003		0.94096	0.94050
0.0279	0.93962		0.93931	0.93947
0.0257	0.93787		0.94096	0.93941
0.0238	0.93649		0.93865	0.93757
0.0210	0.93799		0.93799	0.93799
0.0203	0.93645		0.93799	0.93722
0.0168	0.93637		0.93668	0.93652
0.0177	0.93627		0.93997	0.93812
0.0133	0.93775		0.93898	0.93837
0.0149	0.93771		0.93832	0.93802
0.0142	0.93748		0.93964	0.93856
0.0136	0.93828		0.93766	0.93797
0.0138	0.93655		0.93964	0.93810
0.0122	0.93533		0.93964	0.93748
Testing model
Review text: Great place!  We went at night and the place was bouncing with people... Mostly a young crowd.. Not a family place at night.. More like a night club.  We went for food,  ordered the carne asada street tacos,  and the Jefe Burger.  Both just perfect.  The tacos were at point,  juicy 

['Positive', 0.9996449947357178]

In [11]:
# Score every tip with the trained model and write the predicted sentiment
# and its score back to the "tips" table, one page of tips at a time.
tip_df = ce.querry_tip_data()

loaded_model = spacy.load("model_artifacts")
dh = DatabaseHandler()
sql = """ALTER TABLE tips ADD COLUMN sentiment varchar(20), ADD COLUMN score DECIMAL(10,9)"""
try:
    dh.querry_database(sql)
    print('Added column "sentiment" and "score" to "tips"')
# Best effort: the statement fails when the columns already exist.
except Exception as exep:
    print(exep)

sql_insert =  """UPDATE tips 
set sentiment = data.sentiment,
score = data.score
from (values %s) as data (tid, sentiment, score)
where tips.tid = data.tid::CHAR(22)
"""

fetched_batch = 1

while len(tip_df) != 0:
    print(f"--- Estimating for batch no. {fetched_batch} ---")

    total_data = len(tip_df)
    # Clamp to 1: a batch with fewer than 100 (or 10) rows would otherwise
    # make the modulo below divide by zero.
    steps = max(1, total_data // 100)
    tenths = max(1, total_data // 10)

    calculated_data = []

    for number, (tid, text) in enumerate(zip(tip_df["tid"], tip_df["text"])):
        # Progress output: a percentage every tenth, a dot every hundredth.
        if number % tenths == 0:
            print(f"{number/total_data*100:.0f}", end="")
        elif number % steps == 0:
            print(".", end="")
        prediction, score = test_model(loaded_model, text)
        calculated_data.append((tid, prediction, score,))

    print("\n")
    dh.insert_many(sql_insert, calculated_data)

    tip_df = ce.querry_tip_data()
    fetched_batch += 1

print("done!")

Elapsed time for 'querry_database': 0.5897 seconds
column "sentiment" of relation "tips" already exists

--- Estimating for batch no. 1 ---
0.........10.........20.........30.........40.........50.........60.........70.........80.........90.........

Elapsed time for 'insert_many': 8.0517 seconds
Elapsed time for 'querry_database': 0.5303 seconds
--- Estimating for batch no. 2 ---
0.........10.........20.........30.........40.........50.........60.........70.........80.........90.........

Elapsed time for 'insert_many': 6.3962 seconds
Elapsed time for 'querry_database': 0.5426 seconds
--- Estimating for batch no. 3 ---
0.........

KeyboardInterrupt: 