# Imports and Configuration

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import streamlit as st
import pathlib as pl
import re
import random
import subprocess
from icecream import ic

import spacy
from spacy.util import minibatch, compounding

plt.style.use('ggplot')
ic.configureOutput(includeContext=True)

# Getting Sampledata

In [2]:
file_path = pl.Path(r"C:\Users\milit\Documents\python\Data_Analytics\ProjectWoche\Data\trip_advisor_reviews.zip")
data = pd.read_csv(file_path, compression="zip")
data.head()

Unnamed: 0,ratings,title,text,author,date_stayed,offering_id,num_helpful_votes,date,id,via_mobile
0,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...","“Truly is ""Jewel of the Upper Wets Side""”",Stayed in a king suite for 11 nights and yes i...,"{'username': 'Papa_Panda', 'num_cities': 22, '...",December 2012,93338,0,2012-12-17,147643103,False
1,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...",“My home away from home!”,"On every visit to NYC, the Hotel Beacon is the...","{'username': 'Maureen V', 'num_reviews': 2, 'n...",December 2012,93338,0,2012-12-17,147639004,False
2,"{'service': 4.0, 'cleanliness': 5.0, 'overall'...",“Great Stay”,This is a great property in Midtown. We two di...,"{'username': 'vuguru', 'num_cities': 12, 'num_...",December 2012,1762573,0,2012-12-18,147697954,False
3,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...",“Modern Convenience”,The Andaz is a nice hotel in a central locatio...,"{'username': 'Hotel-Designer', 'num_cities': 5...",August 2012,1762573,0,2012-12-17,147625723,False
4,"{'service': 4.0, 'cleanliness': 5.0, 'overall'...",“Its the best of the Andaz Brand in the US....”,I have stayed at each of the US Andaz properti...,"{'username': 'JamesE339', 'num_cities': 34, 'n...",December 2012,1762573,0,2012-12-17,147612823,False


# Exploring for data important for training
- ratings: extracted overall
- text

- explore missing values of text

In [3]:
train_df = data[["ratings", "text"]]
missing_values_text = train_df[train_df["text"].isna()]
missing_values_ratings = train_df[train_df["ratings"].isna()]
print("exploring missing values for cols 'ratings' and 'text'")
print(f"missing values ratings: {missing_values_ratings.shape[0] / train_df['text'].shape[0]}%")
print(f"missing values text: {missing_values_text.shape[0] / train_df['text'].shape[0]}%")


exploring missing values for cols 'ratings' and 'text'
missing values ratings: 0.0%
missing values text: 0.0%


In [4]:
def extract_rating(row: str, which: str) -> int:
    pattern = re.compile(r"'{}':\s?(\d+\.\d+)".format(which))
    match_object = re.search(pattern, row)
    if match_object:
        rating = float(match_object.group(1))
        rating = int(rating)
        return rating
    else:
        return -1

train_df["overall_rating"] = train_df["ratings"].apply(extract_rating, args=("overall",))
train_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df["overall_rating"] = train_df["ratings"].apply(extract_rating, args=("overall",))


Unnamed: 0,ratings,text,overall_rating
0,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...",Stayed in a king suite for 11 nights and yes i...,5
1,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...","On every visit to NYC, the Hotel Beacon is the...",5
2,"{'service': 4.0, 'cleanliness': 5.0, 'overall'...",This is a great property in Midtown. We two di...,4
3,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...",The Andaz is a nice hotel in a central locatio...,4
4,"{'service': 4.0, 'cleanliness': 5.0, 'overall'...",I have stayed at each of the US Andaz properti...,4


# Building Spacy-pipeline

### Functions

In [31]:
def evaluate_model(tokenizer, textcat, test_data: list) -> dict:
    pass

def train_test_data(data: list, labels: list, split: float = 0.8) -> list:

    # shuffle data
    random.shuffle(data)
    texts, true_labels = zip(*data)

    # preparing labels for training_data
    cats = list()
    for true_label in true_labels:
        cat_values = {label: False for label in labels}
        cat_values[true_label] = True
        cats.append(cat_values)

    cats = [{"cats": cat} for cat in cats]
    
    # bring data in final training_data format
    training_data = list(zip(texts, cats))
    out = training_data

    # split the data
    split = int(len(data) * split)
    train_split = training_data[:split]
    test_split = training_data[split:]

    return train_split, test_split
    

def train_model(training_data: list, test_data: list, iteration: int = 20) -> None:
    pass



### spacy setup

In [6]:
# use only a small subset of 'data' for developement purpose
sample_data = train_df.sample(n=100, axis=0)
sample_data["overall_rating"] = sample_data["overall_rating"].astype(dtype=str)
labels = ["1", "2", "3", "4", "5"]

nlp = spacy.load("en_core_web_sm")
nlp.pipe_names

# add a textcategoration to the pipeline 
if "textcat" not in nlp.pipe_names:
    textcat = nlp.add_pipe("textcat")
else:
    textcat = nlp.get_pipe("textcat")
nlp.pipe_names

for label in labels:
    textcat.add_label(label)

# train only the textcat component
training_exclude_pipes = [
    pipe for pipe in nlp.pipe_names if pipe != "textcat"
]


### prepare data for training

In [7]:
sample_data["tuples"] = sample_data.apply(lambda row: (row["text"], row["overall_rating"]), axis=1)
train_data = sample_data["tuples"].to_list()
train_split, test_split = train_test_data(data=train_data, labels=labels)
print(f"length train_data: {len(train_data)}")
print(f"length train_split: {len(train_split)}")
print(f"length test_split: {len(test_split)}")

length train_data: 100
length train_split: 80
length test_split: 20


### train the text-categorizer (textcat)

In [None]:
n_iter = 100
batchsize = 5
batchcount = float(len(train_split) / batchsize)
with nlp.disable_pipes(*training_exclude_pipes): # train only the textcat component of the pipe
    # optimizer = nlp.begin_training()
    optimizer = nlp.create_optimizer()

    print("Train model...")
    print(f"Loss")

    # perform training
    for i in range(n_iter):
        losses = dict()
        batches = minibatch(train_split, size=compounding(batchsize, batchcount, 1.))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(
                texts, 
                annotations, 
                sgd=optimizer, 
                drop=0.2, losses=losses
            )
        print(f"{losses['textcat']}")

# Train spacy from CLI

### functions

In [32]:
def cmd(args: list, verbose: bool = False) -> subprocess.CompletedProcess:
    sub = subprocess.run(args, shell=True, text=True, stdout=subprocess.PIPE)
    if verbose:
        print(sub.stdout)
    return sub.stdout

def extract_rating(row: str, which: str) -> int:
    pattern = re.compile(r"'{}':\s?(\d+\.\d+)".format(which))
    match_object = re.search(pattern, row)
    if match_object:
        rating = float(match_object.group(1))
        rating = int(rating)
        rating = str(rating)
        return rating
    else:
        return -1


### create base_config.cfg

In [24]:
config_path = pl.Path(r"C:\Users\milit\Desktop\Alfatraining\Data_Scientist\02_Data_Analytics\04_Projekt_Woche\spacy_config")
base_config_file = pl.Path("base_config.cfg")

args = [
    "spacy", "init", "config",
    config_path / base_config_file,
    "--pipeline", "textcat",
    "--optimize", "efficiency"
]
cmd(args, verbose=True)

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: textcat
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
C:\Users\milit\Desktop\Alfatraining\Data_Scientist\02_Data_Analytics\04_Projekt_Woche\spacy_config\base_config.cfg
You can now add your data and train your pipeline:
python -m spacy train base_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy



'\x1b[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),\ninstall the spacy-transformers package and re-run this command. The config\ngenerated now does not use transformers.\x1b[0m\n\x1b[38;5;4mℹ Generated config template specific for your use case\x1b[0m\n- Language: en\n- Pipeline: textcat\n- Optimize for: efficiency\n- Hardware: CPU\n- Transformer: None\n\x1b[38;5;2m✔ Auto-filled config with all values\x1b[0m\n\x1b[38;5;2m✔ Saved config\x1b[0m\nC:\\Users\\milit\\Desktop\\Alfatraining\\Data_Scientist\\02_Data_Analytics\\04_Projekt_Woche\\spacy_config\\base_config.cfg\nYou can now add your data and train your pipeline:\npython -m spacy train base_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy\n'

### create config.cfg from base_config.cfg and fill with default values

In [29]:
config_file = pl.Path("config.cfg")
args = [
    "spacy", "init", "fill-config",
    config_path / base_config_file,
    config_path / config_file,
]
cmd(args, verbose=True)

[38;5;3m⚠ Nothing to auto-fill: base config is already complete[0m
[38;5;2m✔ Saved config[0m
C:\Users\milit\Desktop\Alfatraining\Data_Scientist\02_Data_Analytics\04_Projekt_Woche\spacy_config\config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy



'\x1b[38;5;3m⚠ Nothing to auto-fill: base config is already complete\x1b[0m\n\x1b[38;5;2m✔ Saved config\x1b[0m\nC:\\Users\\milit\\Desktop\\Alfatraining\\Data_Scientist\\02_Data_Analytics\\04_Projekt_Woche\\spacy_config\\config.cfg\nYou can now add your data and train your pipeline:\npython -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy\n'

### prepare training data

In [33]:
train_df["overall_rating"] = train_df["ratings"].apply(extract_rating, args=("overall",))
sample_data = train_df.sample(n=100, axis=0)
sample_data["tuples"] = sample_data.apply(lambda row: (row["text"], row["overall_rating"]), axis=1)
train_data = sample_data["tuples"].to_list()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df["overall_rating"] = train_df["ratings"].apply(extract_rating, args=("overall",))
