# Imports and Configuration

In [111]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import streamlit as st
import pathlib as pl
import re
import random
import subprocess
from icecream import ic

import spacy
# from spacy.util import minibatch, compounding
import spacy.tokens

plt.style.use('ggplot')
ic.configureOutput(includeContext=True)

# functions

In [121]:
def convert_data(nlp: spacy.Language, labels: list[str], outfile: pl.Path, data:list) -> None:
    db = spacy.tokens.DocBin()
    docs = list()
    
    # convert data do DocBin
    for doc, true_label in nlp.pipe(data, as_tuples=True):
        cat_values = {label: False for label in labels}
        cat_values[true_label] = True
        doc.cats = cat_values
        db.add(doc)

    # save DocBin to disk
    db.to_disk(outfile)
    print(f"worte '{outfile}'")

def extract_rating(row: str, which: str) -> int:
    pattern = re.compile(r"'{}':\s?(\d+\.\d+)".format(which))
    match_object = re.search(pattern, row)
    if match_object:
        rating = float(match_object.group(1))
        rating = int(rating)
        rating = str(rating)
        return rating
    else:
        return -1

def make_predictions(text: str, nlp: spacy.Language) -> int:
    doc = nlp(text)
    cat_values = list(doc.cats.values())
    cat_labels = list(doc.cats.keys())
    rating = int(cat_labels[cat_values.index(max(cat_values))])
    return rating

# Getting dataset for trainingdata

In [2]:
file_path = pl.Path(r"C:\Users\milit\Documents\python\Data_Analytics\ProjectWoche\Data\trip_advisor_reviews.zip")
data = pd.read_csv(file_path, compression="zip")
data.head()

Unnamed: 0,ratings,title,text,author,date_stayed,offering_id,num_helpful_votes,date,id,via_mobile
0,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...","“Truly is ""Jewel of the Upper Wets Side""”",Stayed in a king suite for 11 nights and yes i...,"{'username': 'Papa_Panda', 'num_cities': 22, '...",December 2012,93338,0,2012-12-17,147643103,False
1,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...",“My home away from home!”,"On every visit to NYC, the Hotel Beacon is the...","{'username': 'Maureen V', 'num_reviews': 2, 'n...",December 2012,93338,0,2012-12-17,147639004,False
2,"{'service': 4.0, 'cleanliness': 5.0, 'overall'...",“Great Stay”,This is a great property in Midtown. We two di...,"{'username': 'vuguru', 'num_cities': 12, 'num_...",December 2012,1762573,0,2012-12-18,147697954,False
3,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...",“Modern Convenience”,The Andaz is a nice hotel in a central locatio...,"{'username': 'Hotel-Designer', 'num_cities': 5...",August 2012,1762573,0,2012-12-17,147625723,False
4,"{'service': 4.0, 'cleanliness': 5.0, 'overall'...",“Its the best of the Andaz Brand in the US....”,I have stayed at each of the US Andaz properti...,"{'username': 'JamesE339', 'num_cities': 34, 'n...",December 2012,1762573,0,2012-12-17,147612823,False


## Exploring for data important for training
- ratings: extracted overall
- text

- explore missing values of text

In [3]:
train_df = data[["ratings", "text"]]

# extract the overall rating from ratings
train_df["overall_rating"] = train_df["ratings"].apply(extract_rating, args=("overall",))

# data exploration
missing_values_text = train_df[train_df["text"].isna()]
missing_values_ratings = train_df[train_df["ratings"].isna()]
print("exploring missing values for cols 'ratings' and 'text'")
print(f"missing values ratings: {missing_values_ratings.shape[0] / train_df['text'].shape[0]}%")
print(f"missing values text: {missing_values_text.shape[0] / train_df['text'].shape[0]}%")


exploring missing values for cols 'ratings' and 'text'
missing values ratings: 0.0%
missing values text: 0.0%


# Getting dataset for prediction

In [96]:
file_path = pl.Path(r"C:\Users\milit\Desktop\Alfatraining\Data_Scientist\02_Data_Analytics\04_Projekt_Woche\data\cities\reviews_antwerp.csv.gz")
prediction_data_raw = pd.read_csv(file_path, compression="gzip")
prediction_data = prediction_data_raw[["date", "reviewer_name", "comments"]]
prediction_data["pedicted_rating"] = prediction_data.apply(lambda row: -1)
prediction_data.head()

Unnamed: 0,date,reviewer_name,comments,pedicted_rating
0,2015-05-06,Jihae,Karin’s “Aplace” is absolutely beautiful and c...,
1,2021-10-10,Emilie,"Karin is a wonderful host, she was really help...",
2,2022-05-15,Marie-Lou,The location is super super nice! Karin was al...,
3,2012-02-20,Hiske & Erik,"Perfect location for exploring the city, close...",
4,2012-03-05,Paolo,"Muriel was such a fantastic host, extremely he...",


## Explore predictiondata

In [97]:
prediction_data.shape

(102902, 4)

# Train spacy from CLI

## setup spacy pipeline

In [113]:
nlp = spacy.blank("en")

## create base_config.cfg

In [115]:
config_path = pl.Path(r"C:\Users\milit\Desktop\Alfatraining\Data_Scientist\02_Data_Analytics\04_Projekt_Woche\spacy_config")
base_config_file = pl.Path("base_config.cfg")
!python -m spacy init config {config_path/base_config_file} --pipeline textcat --optimize efficiency --force

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: textcat
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
C:\Users\milit\Desktop\Alfatraining\Data_Scientist\02_Data_Analytics\04_Projekt_Woche\spacy_config\base_config.cfg
You can now add your data and train your pipeline:
python -m spacy train base_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


## create config.cfg from base_config.cfg and fill with default values

In [116]:
config_file = pl.Path("config.cfg")
!python -m spacy init fill-config {config_path/base_config_file} {config_path/config_file}

[38;5;3m⚠ Nothing to auto-fill: base config is already complete[0m
[38;5;2m✔ Saved config[0m
C:\Users\milit\Desktop\Alfatraining\Data_Scientist\02_Data_Analytics\04_Projekt_Woche\spacy_config\config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


## prepare training data

In [117]:
train_path = pl.Path(r"C:\Users\milit\Desktop\Alfatraining\Data_Scientist\02_Data_Analytics\04_Projekt_Woche\spacy_train")
train_file = pl.Path("train.spacy")
dev_file = pl.Path("dev.spacy")
test_file = pl.Path("test.spacy")
labels = ["1", "2", "3", "4", "5"]
train_split = 0.75
dev_split = 0.9

# get a random sample from original data for testing purpose
sample_data = train_df.sample(n=10, axis=0)

# convert data to list and shuffle the data
sample_data = list(
    sample_data[["text", "overall_rating"]].sample(frac=1).itertuples(index=False, name=None)
)

# split the data into training-, evaluation-, and testdata

train_split = int(train_split * len(sample_data))
dev_split = int(dev_split * len(sample_data))
train_data = sample_data[:train_split]
dev_data = sample_data[train_split:dev_split]
test_data = sample_data[dev_split:]
print(f"len train data: {len(train_data)}")
print(f"len dev data: {len(dev_data)}")
print(f"len test data: {len(test_data)}")


# convert training data
convert_data(
    nlp=nlp,
    labels=labels,
    outfile=train_path / train_file, 
    data=train_data, 
)

# convert dev data
convert_data(
    nlp=nlp,
    labels=labels,
    outfile=train_path / dev_file, 
    data=dev_data, 
)

# convert test data
convert_data(
    nlp=nlp,
    labels=labels,
    outfile=train_path / test_file, 
    data=test_data, 
)

len train data: 7
len dev data: 2
len test data: 1
worte 'C:\Users\milit\Desktop\Alfatraining\Data_Scientist\02_Data_Analytics\04_Projekt_Woche\spacy_train\train.spacy'
worte 'C:\Users\milit\Desktop\Alfatraining\Data_Scientist\02_Data_Analytics\04_Projekt_Woche\spacy_train\dev.spacy'
worte 'C:\Users\milit\Desktop\Alfatraining\Data_Scientist\02_Data_Analytics\04_Projekt_Woche\spacy_train\test.spacy'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df["overall_rating"] = train_df["ratings"].apply(extract_rating, args=("overall",))


## Train model

In [118]:
output_path = pl.Path(r"C:\Users\milit\Desktop\Alfatraining\Data_Scientist\02_Data_Analytics\04_Projekt_Woche\spacy_model")
!python -m spacy train {config_path/config_file} --paths.train {train_path/train_file} --paths.dev {train_path/dev_file} --output {output_path} --verbose

[38;5;4mℹ Saving to output directory:
C:\Users\milit\Desktop\Alfatraining\Data_Scientist\02_Data_Analytics\04_Projekt_Woche\spacy_model[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  ------------  ----------  ------
  0       0          0.16       13.33    0.13
 28     200          2.23        0.00    0.00
 60     400          0.01        0.00    0.00
 93     600          0.00        0.00    0.00
126     800          0.00        0.00    0.00
157    1000          0.00        0.00    0.00
188    1200          0.00        0.00    0.00
218    1400          0.00        0.00    0.00
251    1600          0.00        0.00    0.00
[38;5;2m✔ Saved pipeline to output directory[0m
C:\Users\milit\Desktop\Alfatraining\Data_Scientist\02_Data_Analytics\04_Projekt_Woche\spacy_model\model-last


[2024-06-18 00:46:20,613] [DEBUG] Config overrides from CLI: ['paths.train', 'paths.dev']
[2024-06-18 00:46:20,855] [INFO] Set up nlp object from config
[2024-06-18 00:46:20,869] [DEBUG] Loading corpus from path: C:\Users\milit\Desktop\Alfatraining\Data_Scientist\02_Data_Analytics\04_Projekt_Woche\spacy_train\dev.spacy
[2024-06-18 00:46:20,870] [DEBUG] Loading corpus from path: C:\Users\milit\Desktop\Alfatraining\Data_Scientist\02_Data_Analytics\04_Projekt_Woche\spacy_train\train.spacy
[2024-06-18 00:46:20,870] [INFO] Pipeline: ['textcat']
[2024-06-18 00:46:20,874] [INFO] Created vocabulary
[2024-06-18 00:46:20,874] [INFO] Finished initializing nlp object
[2024-06-18 00:46:20,943] [INFO] Initialized pipeline components: ['textcat']
[2024-06-18 00:46:20,956] [DEBUG] Loading corpus from path: C:\Users\milit\Desktop\Alfatraining\Data_Scientist\02_Data_Analytics\04_Projekt_Woche\spacy_train\dev.spacy
[2024-06-18 00:46:20,957] [DEBUG] Loading corpus from path: C:\Users\milit\Desktop\Alfatra

## Evaluate the model

In [119]:
model_path = pl.Path(r"C:\Users\milit\Desktop\Alfatraining\Data_Scientist\02_Data_Analytics\04_Projekt_Woche\spacy_model")
model_file = pl.Path("model-best")
!python -m spacy benchmark accuracy {model_path/model_file} {train_path/test_file}

[38;5;4mℹ Using CPU[0m
[1m

TOK                 100.00
TEXTCAT (macro F)   0.00  
SPEED               24125 

[1m

       P      R      F
1   0.00   0.00   0.00
2   0.00   0.00   0.00
3   0.00   0.00   0.00
4   0.00   0.00   0.00
5   0.00   0.00   0.00

[1m

    ROC AUC
1      None
2      None
3      None
4      None
5      None



## Make predictions

In [120]:
# load the model with highest accuracy
nlp = spacy.load(r"C:\Users\milit\Desktop\Alfatraining\Data_Scientist\02_Data_Analytics\04_Projekt_Woche\spacy_model\model-last")

# test a simple expression before predicting AirBnB-ratings
texts_good = ["this is a very beautiful place for a stay in this lovely city"]
texts_bad = ["A really, really terrible place "]

print("pedict rating for good texts")
for text in texts_good:
    doc = nlp(text)
    print(f"rating: {doc.cats} -- text: {text}")

print("pedict rating for bad texts")
for text in texts_bad:
    doc = nlp(text)
    print(f"rating: {doc.cats} -- text: {text}")


# finally predict the AirBnB ratings


prediction_sample = prediction_data.sample(n=100, axis=0)
prediction_sample["pedicted_rating"] = prediction_sample["comments"].apply(make_predictions, args=(nlp,))

pedict rating for good texts
rating: {'1': 0.1240428164601326, '2': 0.20191678404808044, '3': 0.240362286567688, '4': 0.212214395403862, '5': 0.22146371006965637} -- text: this is a very beautiful place for a stay in this lovely city
pedict rating for bad texts
rating: {'1': 0.18104076385498047, '2': 0.19354777038097382, '3': 0.21970169246196747, '4': 0.21357141435146332, '5': 0.19213831424713135} -- text: A really, really terrible place 
