### Apply the train-test split to the dataset

Split the dataset into train and test partitions for training the ML models.

In [0]:
%run ../utils/run_target_helper

In [0]:
# Look up the settings for the target selected in the "TARGET" widget.
settings = get_settings(dbutils.widgets.get("TARGET"))

# All three table names share the suffix stored in the settings.
table_suffix = settings["table_suffix"]
bronze_table = "verkkokauppa_reviews_bronze" + table_suffix
upstream_table = "verkkokauppa_reviews_silver" + table_suffix
downstream_table = "verkkokauppa_reviews_gold" + table_suffix

Import libraries and dataset

In [0]:
import numpy as np
from sklearn.model_selection import train_test_split
from pyspark.sql.functions import col, when, max

# Source tables: the upstream (silver) table is the one that gets split and
# labelled; the bronze table is read alongside it.
df_bronze = spark.table(bronze_table)
df = spark.table(upstream_table)

# Only the id and rating columns are collected into pandas for the split.
data = df.select(col("id"), col("rating")).toPandas()

Apply the train-test split. We only split the *id* column, not the data itself, and later use the labels to retrieve the *train* or *test* partition when needed.
Since the dataset has a strong imbalance in the rating values, we must use the **stratify** option to make sure the *train* and *test* partitions maintain similar
distributions.

In [0]:
# Fixed seed for the shuffle. Without it every run of the notebook assigns the
# ids to train/test differently, so a model trained on an earlier run could be
# evaluated on rows it has already seen once the labels are regenerated.
SPLIT_SEED = 42

# Split the ids only (25 % test), stratified on rating so both partitions keep
# the same rating distribution.
id_train, id_test = train_test_split(
    data['id'].values,
    test_size=0.25,
    stratify=data['rating'].values,
    random_state=SPLIT_SEED,
)

Add the *train-test* labels to the original dataset.

Add a new column *positive_review*. All reviews with a rating of 1, 2 or 3 are labeled negative, and those with 4 or 5 are labeled positive. This lets us use binary classifiers,
which makes the ML-model wrangling a little easier than with multiclass classifiers.

In [0]:
# Add both label columns in a single chain:
#   train_test      - 'train' when the row's id is among the train ids, else 'test'
#   positive_review - 1 for ratings of 4 and above, else 0
# NOTE(review): isin() receives every train id as a Python list; if the dataset
# grows considerably, a join against a DataFrame of ids may be preferable.
df = (
    df
    .withColumn(
        'train_test',
        when(col('id').isin(id_train.tolist()), 'train').otherwise('test'),
    )
    .withColumn(
        "positive_review",
        when(col('rating') >= 4, 1).otherwise(0),
    )
)

In [0]:
# Store the labelled dataset as a Delta table, replacing both the data and the
# schema of any existing table with the same name.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "True")
    .saveAsTable(downstream_table)
)

# Write a second copy as Parquet files under /tmp, named after the table.
df.write.parquet(f"/tmp/{downstream_table}", mode="overwrite")

In [0]:
# Row count of the most frequent language in the bronze table: count rows per
# language, sort by that count in descending order and take the top value.
q = (
    df_bronze
    .groupBy("language")
    .count()
    .orderBy(col("count").desc())
    .select("count")
    .collect()[0][0]
)
print(q)

In [0]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_theme()

# Collect each Spark DataFrame to pandas exactly once. The bronze frame backs
# two of the three panels, so converting it per plot would pull the whole table
# to the driver twice.
bronze_pdf = df_bronze.toPandas()
labelled_pdf = df.toPandas()

# Three side-by-side panels: language counts, rating by train/test label, and
# rating by category.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(12, 5))

sns.histplot(data=bronze_pdf, x="language", ax=ax1)
sns.histplot(data=labelled_pdf, x="rating", hue="train_test", multiple="stack", ax=ax2)
sns.histplot(data=bronze_pdf, x="rating", hue="category", multiple="dodge", ax=ax3, bins=10)

# Give all panels the same y-range, capped at the size of the largest language
# group in the bronze table.
# NOTE(review): a rating bar taller than the largest language group would be
# clipped by this limit - confirm that cannot happen for this dataset.
ylim_max = df_bronze.groupBy("language").count().orderBy(col("count"), ascending=False).select("count").collect()[0][0]
for ax in (ax1, ax2, ax3):
    ax.set_ylim(0, ylim_max)

# Only the left-most panel keeps its y-axis label and tick labels; the panels
# share the same range, so the others would just repeat them.
for ax in (ax2, ax3):
    ax.set_ylabel("")
    ax.set_yticklabels([])

fig.subplots_adjust(wspace=0.05)
fig.show()