## Importing Packages

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import os
import sys
import yaml
import zipfile
from kaggle.api.kaggle_api_extended import KaggleApi


project_root = os.path.abspath("..")  # Go up one level to reach the project root
sys.path.append(project_root)
from src.cleaning.pre_processing_class import PreProcessing
from src.analysis.visualisation_class import Visualisation

In [None]:
# Load the project settings from the YAML config file.
# The encoding is given explicitly so that decoding does not depend on the
# platform's default locale encoding.
with open("../config/config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

In [None]:
# Authenticate with the Kaggle API and download Reviews.csv from the
# "snap/amazon-fine-food-reviews" dataset into the configured raw-data folder.
api = KaggleApi()
api.authenticate()

api.dataset_download_file(
    "snap/amazon-fine-food-reviews",
    file_name="Reviews.csv",
    path=config["data_raw_folder"],
)


# Unpack the downloaded zip archive next to it. The path is built with
# os.path.join so it resolves correctly whether or not the configured folder
# ends with a path separator.
stem_path = os.path.join(config["data_raw_folder"], "Reviews.csv.zip")
with zipfile.ZipFile(stem_path, "r") as zipref:
    zipref.extractall(config["data_raw_folder"])

In [None]:
# Read Reviews.csv from the raw-data folder into a dataframe.
# os.path.join keeps the path valid whether or not the configured folder ends
# with a path separator.
stem_path = os.path.join(config["data_raw_folder"], "Reviews.csv")
df = pd.read_csv(filepath_or_buffer=stem_path, encoding="latin")

## PreProcessing Class

In [None]:
# Wrap the raw reviews dataframe in the project's PreProcessing helper.
p = PreProcessing(df)

In [None]:
# Run PreProcessing.get_dataframe_report on the loaded data.
# NOTE: slow on this dataset - takes around 2 minutes.
p.get_dataframe_report()

In [None]:
# Normalise the column names: strips leading/trailing whitespace and lowercases them.
p.clean_column_names()

# Removes any duplicates; per the original note it also returns the number of
# duplicates dropped (the return value is not used here).
p.remove_duplicates()

In [None]:
# Lowercase and strip leading/trailing whitespace in the listed text columns.
p.lowercase_strip_rows(
    columns_to_clean=[
        "text",
        "summary",
    ]
)

# This method can either fill in missing values or remove them altogether,
# by row or by column. Here "profilename" and "summary" are given the value
# "missing" - presumably the fill value; verify against the method.
p.fill_or_remove_missing_values(
    replacement_dict={"profilename": "missing", "summary": "missing"}
)

# This method can convert an entire column's datatype to string, boolean,
# integer, category or float. An empty mapping is passed, so no conversion
# is requested here.
p.convert_datatype(column_types={})  # e.g. {"some_column": "integer"}

# Convert the "time" column to datetime. "s" is presumably the epoch unit
# (seconds) - TODO confirm against convert_to_datetime.
p.convert_to_datetime(replacement_dict={"time": "s"})

# Takes a datetime column, extracts specific date information and creates new
# columns from it; the new columns won't be in datetime format. The result is
# kept as final_df.
# NOTE(review): later cells group by "time_day_name", presumably produced by
# the "day_name" option - confirm the generated column names.
final_df = p.extract_date_info(
    date_column=["time"],
    replacement_dict={
        "date": True,
        "strftime": True,
        "day_name": True,
        "custom": "%Y-%m",
    },
)

In [None]:
# Bucket the numeric score into a "Good" / "Mid" / "Bad" (or "Missing") rating label
def bin_rating(x):
    """Map a numeric score to a coarse rating label.

    Returns "Good" for scores above 3, "Mid" for a score of exactly 3 and
    "Bad" for scores below 3. A value for which none of these comparisons
    holds (for example NaN) is labelled "Missing".
    """
    if x > 3:
        return "Good"
    if x == 3:
        return "Mid"
    if x < 3:
        return "Bad"
    # None of the comparisons matched, which is the case for NaN.
    return "Missing"


# Label every score with bin_rating. The column keeps the name "binary_rating"
# even though bin_rating can return more than two labels.
final_df["binary_rating"] = final_df["score"].apply(bin_rating)

In [None]:
# Display the dataframe to inspect the newly added column.
final_df

In [None]:
# Helpfulness numerator as a percentage of the helpfulness denominator.
# Zero denominators are replaced with NaN before dividing, so those rows get
# NaN instead of inf. The column name is kept exactly as it was, since other
# code may reference it.
final_df["helpfullness_%"] = (
    final_df["helpfulnessnumerator"]
    / final_df["helpfulnessdenominator"].replace(0, np.nan)
) * 100

## Visualisation Class

In [None]:
# Wrap the processed dataframe in the project's Visualisation helper.
v = Visualisation(final_df)

In [None]:
# v.describe_columns()  # NOTE: disabled - this causes issues, possibly because the dataset is so large

In [None]:
# Plot count and proportion for the score and the two helpfulness columns.
# dropna=False is passed through to the helper.
v.plot_count_and_proportion(
    columns=["score", "helpfulnessnumerator", "helpfulnessdenominator"], dropna=False
)

In [None]:
# Count the (non-null) scores for each distinct "time" value, then turn the
# grouped index back into a regular column for plotting.
grouped_df = final_df.groupby(["time"])["score"].count()
grouped_df = grouped_df.reset_index()

# Line chart of the score count against time on a date axis.
v.custom_graph(
    grouped_df,
    x_column="time",
    y_column_and_type={"score": "line"},
    xaxis_type="date",
    barmode=None,
    z_column=None,
    graph_title="count of scores over time",
    x_axes_title="time (days)",
    y_axes_title="count of scores",
    yaxis_range=None,
    xaxis_range=None,
)

In [None]:
# Count rows per ("time", "score") pair; the count is stored in a column
# named "count of score".
grouped_df = (
    final_df.groupby(["time", "score"])
    .agg({"score": "count"})
    .rename(columns={"score": "count of score"})
    .reset_index()
)

# Line chart of that count over time, restricted to 2008-2009, with
# z_column="score" (presumably one line per score value - verify against
# custom_graph). The y-axis label names a count because the plotted column is
# a count, not an average.
v.custom_graph(
    df=grouped_df,
    x_column="time",
    y_column_and_type={"count of score": "line"},
    xaxis_type="date",
    barmode=None,
    z_column="score",
    graph_title="count of scores over time",
    x_axes_title="time (days)",
    y_axes_title="count of scores",
    yaxis_range=None,
    xaxis_range=["2008-01-01", "2010-01-01"],
)

In [None]:
# For each "time" value compute the total number of scores and how many of
# them equal 5.
grouped = (
    final_df.groupby("time")["score"]
    .agg(["count", lambda x: (x == 5).sum()])
    .reset_index()
)
# Columns are renamed positionally: time, count, and the lambda's result.
grouped.columns = ["date", "total_count", "count_5"]
grouped["percentage_5"] = (grouped["count_5"] / grouped["total_count"]) * 100


# Bars for the total count and a line for the percentage of 5 scores,
# restricted to 2008. The title describes the chart instead of the former
# placeholder text.
v.custom_graph(
    df=grouped,
    x_column="date",
    y_column_and_type={"total_count": "bar", "percentage_5": "line"},
    xaxis_type="date",
    barmode=None,
    z_column=None,
    graph_title="total count of scores and percentage of 5 scores over time",
    x_axes_title="time (days)",
    y_axes_title=None,
    yaxis_range=None,
    xaxis_range=["2008-01-01", "2009-01-01"],
)

In [None]:
# Count rows per ("time", "binary_rating") pair.
grouped = (
    final_df.groupby(["time", "binary_rating"])
    .agg({"binary_rating": "count"})
    .rename(columns={"binary_rating": "binary_rating_count"})
    .reset_index()
)

# Line chart of the rating counts over time, restricted to 2008, with
# z_column="binary_rating" (presumably one line per rating label - verify
# against custom_graph). The title describes the chart instead of the former
# placeholder text.
v.custom_graph(
    df=grouped,
    x_column="time",
    y_column_and_type={"binary_rating_count": "line"},
    xaxis_type="date",
    barmode=None,
    z_column="binary_rating",
    graph_title="count of ratings by rating label over time",
    x_axes_title="time (days)",
    y_axes_title=None,
    yaxis_range=None,
    xaxis_range=["2008-01-01", "2009-01-01"],
)

In [None]:
# Count rows per ("time_day_name", "binary_rating") pair.
grouped = (
    final_df.groupby(["time_day_name", "binary_rating"])
    .agg({"binary_rating": "count"})
    .rename(columns={"binary_rating": "binary_rating_count"})
    .reset_index()
)

# Grouped bar chart of the rating counts per day name. The title and x-axis
# label describe what is plotted: the x column is the day name, not a date.
v.custom_graph(
    df=grouped,
    x_column="time_day_name",
    y_column_and_type={"binary_rating_count": "bar"},
    xaxis_type="category",
    barmode="group",
    z_column="binary_rating",
    graph_title="count of ratings by rating label per day of week",
    x_axes_title="day of week",
    y_axes_title=None,
    yaxis_range=None,
    xaxis_range=None,
)

In [None]:
# Ask the PreProcessing helper to handle missing "summary" values with "remove_row".
# NOTE(review): the return value is discarded and the following cells keep using
# final_df - confirm this call actually affects final_df. An earlier call already
# passed "missing" for "summary", so there may be nothing left to remove.
p.fill_or_remove_missing_values(replacement_dict={"summary": "remove_row"})

In [None]:
# One Visualisation per rating label, each built from the rows of final_df
# carrying that label (constructed in the order Good, Bad, Mid).
v_good, v_bad, v_mid = (
    Visualisation(df=final_df.loc[final_df["binary_rating"] == rating_label])
    for rating_label in ("Good", "Bad", "Mid")
)

In [None]:
# Word cloud built from the "summary" column of the "Bad" rating subset.
v_bad.create_wordcloud(text="summary", remove_words=[""])

In [None]:
# Word cloud built from the "summary" column of the "Mid" rating subset.
v_mid.create_wordcloud(text="summary", remove_words=[""])

In [None]:
# Word cloud built from the "summary" column of the "Good" rating subset.
v_good.create_wordcloud(text="summary", remove_words=[""])