# General Part

## Imports and Configuration

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import streamlit as st
import pathlib as pl
import re
import sys
from icecream import ic
from typing import IO, Tuple, Callable, Dict, Any, Optional

import spacy
import spacy.tokens

plt.style.use('ggplot')
ic.configureOutput(includeContext=True)

## Functions

In [4]:
def convert_data(nlp: spacy.Language, labels: list[str], outfile: pl.Path, data:list) -> None:
    db = spacy.tokens.DocBin()
    
    # convert data do DocBin
    for doc, true_label in nlp.pipe(data, as_tuples=True):
        cat_values = {label: False for label in labels}
        cat_values[true_label] = True
        doc.cats = cat_values
        db.add(doc)

    # save DocBin to disk
    db.to_disk(outfile)
    print(f"wrote '{outfile}'")

def convert_to_rating(text: str, nlp: spacy.Language) -> int:
    doc = nlp(text)
    cat_values = list(doc.cats.values())
    cat_labels = list(doc.cats.keys())
    rating = int(cat_labels[cat_values.index(max(cat_values))])
    return rating

def extract_rating(row: str, which: str) -> int:
    pattern = re.compile(r"'{}':\s?(\d+\.\d+)".format(which))
    match_object = re.search(pattern, row)
    if match_object:
        rating = float(match_object.group(1))
        rating = int(rating)
        rating = str(rating)
        return rating
    else:
        return -1

# Data Gathering

## Loading trainingdata

In [5]:
# file_path = pl.Path(r"C:\Users\milit\Documents\python\Data_Analytics\ProjectWoche\Data\trip_advisor_reviews.zip")
file_path = pl.Path(r"C:\Users\milit\Desktop\Alfatraining\Data_Scientist\02_Data_Analytics\04_Projektwoche\data\machine_learning\kaggle_data_reduced.csv.gz")
training_data_raw = pd.read_csv(file_path, compression="gzip")
print(f"shape of training_data_raw: {training_data_raw.shape}")
print(f"data types: {training_data_raw.dtypes}")
training_data_raw.head()

shape of training_data_raw: (41981, 3)
data types: Review    object
Rating     int64
Origin    object
dtype: object


Unnamed: 0,Review,Rating,Origin
0,nice hotel expensive parking got good deal sta...,4,thedevastator
1,ok nothing special charge diamond member hilto...,2,thedevastator
2,nice rooms not 4* experience hotel monaco seat...,3,thedevastator
3,unique \tgreat stay \twonderful time hotel mon...,5,thedevastator
4,great stay great stay \twent seahawk game awes...,5,thedevastator


## Prepare trainingdata

In [6]:
training_data = training_data_raw[["Review", "Rating"]]
training_data["Rating"] = training_data["Rating"].apply(lambda x: str(x))

# make a training_sample
training_sample = training_data.sample(frac=1, axis=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_data["Rating"] = training_data["Rating"].apply(lambda x: str(x))


## Loading predictiondata

In [7]:
file_path = pl.Path(r"C:\Users\milit\Desktop\Alfatraining\Data_Scientist\02_Data_Analytics\04_Projektwoche\data\cities\reviews_Antwerp_en.csv.gz")
prediction_data_raw = pd.read_csv(file_path, compression="gzip")
prediction_data_raw.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,50904,31511792,2015-05-06,19482395,Jihae,Karin’s “Aplace” is absolutely beautiful and c...
1,50904,470101024356869935,2021-10-10,333559,Emilie,"Karin is a wonderful host, she was really help..."
2,50904,627287279025726941,2022-05-15,32701854,Marie-Lou,The location is super super nice! Karin was al...
3,224682,933043,2012-02-20,1422043,Hiske & Erik,"Perfect location for exploring the city, close..."
4,224682,970457,2012-03-05,1493171,Paolo,"Muriel was such a fantastic host, extremely he..."


## Exploring predictiondata

In [8]:
prediction_data = prediction_data_raw[["reviewer_name", "comments"]]
prediction_data["pedicted_rating"] = prediction_data.apply(lambda row: -1)
prediction_data.head()

print("Shape of the predictiondata: {prediction_data.shape}")

missing_values_comments = prediction_data[prediction_data["comments"].isna()]
print("exploring missing values for col 'comments'")
print(f"missing values ratings: {missing_values_comments.shape[0] / prediction_data['comments'].shape[0]}%")

Shape of the predictiondata: {prediction_data.shape}
exploring missing values for col 'comments'
missing values ratings: 0.0%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction_data["pedicted_rating"] = prediction_data.apply(lambda row: -1)


## Prepare predictiondata

In [9]:

# get a random sample from original data for testing purpose
prediction_sample = prediction_data.sample(frac=1, axis=0)

# Training Setup

## setup spacy pipeline

In [15]:
nlp = spacy.blank("en")
# nlp = spacy.load("en_core_web_lg")

## create base_config.cfg

In [17]:
config_path = pl.Path(r"C:\Users\milit\Desktop\Alfatraining\Data_Scientist\02_Data_Analytics\04_Projekt_Woche\spacy_config")
base_config_file = pl.Path("base_config.cfg")
!python -m spacy init config {config_path/base_config_file} --pipeline textcat --optimize efficiency --force

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: textcat
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
C:\Users\milit\Desktop\Alfatraining\Data_Scientist\02_Data_Analytics\04_Projekt_Woche\spacy_config\base_config.cfg
You can now add your data and train your pipeline:
python -m spacy train base_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


## create config.cfg from base_config.cfg and fill with default values

In [18]:
config_file = pl.Path("config.cfg")
!python -m spacy init fill-config {config_path/base_config_file} {config_path/config_file}

[38;5;3m⚠ Nothing to auto-fill: base config is already complete[0m
[38;5;2m✔ Saved config[0m
C:\Users\milit\Desktop\Alfatraining\Data_Scientist\02_Data_Analytics\04_Projekt_Woche\spacy_config\config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


## prepare training data

In [30]:
train_path = pl.Path(r"C:\Users\milit\Desktop\Alfatraining\Data_Scientist\02_Data_Analytics\04_Projektwoche\spacy_train")
train_file = pl.Path("train.spacy")
dev_file = pl.Path("dev.spacy")
test_file = pl.Path("test.spacy")
labels = ["1", "2", "3", "4", "5"]
train_split = 0.75
dev_split = 0.9

# convert data to list and shuffle the data
training_sample_list = list(
    training_sample[["Review", "Rating"]].sample(frac=1).itertuples(index=False, name=None)
)

# split the data into training-, evaluation-, and testdata
train_split = int(train_split * len(training_sample_list))
dev_split = int(dev_split * len(training_sample_list))
train_data = training_sample_list[:train_split]
dev_data = training_sample_list[train_split:dev_split]
test_data = training_sample_list[dev_split:]
print(f"len train data: {len(train_data)}")
print(f"len dev data: {len(dev_data)}")
print(f"len test data: {len(test_data)}")

len train data: 31485
len dev data: 6297
len test data: 4199


In [31]:
# convert training data
convert_data(
    nlp=nlp,
    labels=labels,
    outfile=train_path / train_file, 
    data=train_data, 
)

# convert dev data
convert_data(
    nlp=nlp,
    labels=labels,
    outfile=train_path / dev_file, 
    data=dev_data, 
)

# convert test data
convert_data(
    nlp=nlp,
    labels=labels,
    outfile=train_path / test_file, 
    data=test_data, 
)

worte 'C:\Users\milit\Desktop\Alfatraining\Data_Scientist\02_Data_Analytics\04_Projektwoche\spacy_train\train.spacy'
worte 'C:\Users\milit\Desktop\Alfatraining\Data_Scientist\02_Data_Analytics\04_Projektwoche\spacy_train\dev.spacy'
worte 'C:\Users\milit\Desktop\Alfatraining\Data_Scientist\02_Data_Analytics\04_Projektwoche\spacy_train\test.spacy'


## Train model

In [None]:
# output_path = pl.Path(r"C:\Users\milit\Desktop\Alfatraining\Data_Scientist\02_Data_Analytics\04_Projekt_Woche\spacy_model")
# functions_path = pl.Path(r"C:\Users\milit\Desktop\Alfatraining\Data_Scientist\02_Data_Analytics\04_Projekt_Woche\spacy_functions.py")
# !python -m spacy train {config_path/config_file} --paths.train {train_path/train_file} --paths.dev {train_path/dev_file} --output {output_path} --verbose --code {functions_path}

## Evaluate the model

In [4]:
model_path = pl.Path(r"C:\Users\milit\Desktop\Alfatraining\Data_Scientist\02_Data_Analytics\04_Projektwoche\spacy_model")
train_path = pl.Path(r"C:\Users\milit\Desktop\Alfatraining\Data_Scientist\02_Data_Analytics\04_Projektwoche\spacy_train")
test_file = "test.spacy"
model_file = "model-best"
!python -m spacy benchmark accuracy {model_path/model_file} {train_path/test_file}

[38;5;4mℹ Using CPU[0m




[1m

TOK                 100.00
TEXTCAT (macro F)   80.34 
SPEED               14042 

[1m

        P       R       F
1   89.49   86.84   88.15
2   80.06   77.71   78.87
3   78.97   66.81   72.39
4   76.06   73.40   74.71
5   84.82   90.59   87.61

[1m

    ROC AUC
1      0.99
2      0.96
3      0.92
4      0.90
5      0.94



## Make predictions

In [10]:
# load the model with highest accuracy
nlp = spacy.load(r"C:\Users\milit\Desktop\Alfatraining\Data_Scientist\02_Data_Analytics\04_Projektwoche\spacy_model\model-best")

# test a simple expression before predicting AirBnB-ratings
texts_good = ["this is a very beautiful place for a stay in this lovely city"]
texts_bad = ["A really, really terrible place "]

print("pedict rating for good texts")
for text in texts_good:
    doc = nlp(text)
    print(f"rating: {doc.cats} -- text: {text}")

print("pedict rating for bad texts")
for text in texts_bad:
    doc = nlp(text)
    print(f"rating: {doc.cats} -- text: {text}")


# finally predict the AirBnB ratings
prediction_sample["pedicted_rating"] = prediction_sample["comments"].apply(convert_to_rating, args=(nlp,))



pedict rating for good texts
rating: {'1': 0.16812527179718018, '2': 0.09771882742643356, '3': 0.05122455209493637, '4': 0.2534002959728241, '5': 0.429531067609787} -- text: this is a very beautiful place for a stay in this lovely city
pedict rating for bad texts
rating: {'1': 0.4252203702926636, '2': 0.2597227096557617, '3': 0.1788870096206665, '4': 0.08237838745117188, '5': 0.05379154160618782} -- text: A really, really terrible place 
