In [1]:

!pip install -q --upgrade wandb

In [2]:
import pandas as pd
import matplotlib.pyplot as plt

import wandb
import numpy as np
import matplotlib
matplotlib.use('Agg')

In [3]:
def log_dataset_artifact(train_path, test_path):
    try:
        train_data = pd.read_csv(train_path)
        test_data = pd.read_csv(test_path)
    except FileNotFoundError as e:
        print(f"Error: {e.strerror}. Please check paths and permissions.")
        return None, None

    


    raw_data_at = wandb.Artifact(
        params['RAW_DATA_AT'], 
        type="dataset",
        description="text dataset, split into train/test",
        metadata={"source": "kaggle",
                  "shapes": [train_data.shape, test_data.shape]}
    )
    raw_data_at.add_file(TRAIN_PATH, name="train.csv")
    raw_data_at.add_file(TEST_PATH, name="test.csv")
    
    train_table = wandb.Table(dataframe=train_data)
    test_table = wandb.Table(dataframe=test_data)
    raw_data_at.add(train_table, "train_eda_table")
    raw_data_at.add(test_table, "test_eda_table")
    run.log_artifact(raw_data_at)
    return train_data, test_data


In [4]:

def handle_missing_values(train_data, test_data):
    for data in [train_data, test_data]:
        data['started_at'].fillna('', inplace=True)
        data['read_at'].fillna('', inplace=True)

        data['started_at'] = pd.to_datetime(data['started_at'], format='%a %b %d %H:%M:%S %z %Y', errors='coerce')
        data['read_at'] = pd.to_datetime(data['read_at'], format='%a %b %d %H:%M:%S %z %Y', errors='coerce')

        out_of_bounds_rows = data[data['started_at'].isna() | data['read_at'].isna()]
        data.dropna(subset=['started_at', 'read_at'], inplace=True)
        data.reset_index(drop=True, inplace=True)

        data['time_taken_to_read'] = data['read_at'] - data['started_at']
        data['time_taken_to_read'] = data['time_taken_to_read'].astype(str)

        data['days_taken'] = data['time_taken_to_read'].apply(lambda x: int(x.split()[0]) if 'days' in x else 0)
                #number of reviews changes over time.
        data['date_added'] = pd.to_datetime(data['date_added'], format='%a %b %d %H:%M:%S %z %Y')
        data['date_updated'] = pd.to_datetime(data['date_updated'],  format='%a %b %d %H:%M:%S %z %Y')
        data['date_updated'] = data['date_updated'].astype(str)
        data['date_added'] = data['date_added'].astype(str)
        data['year_added'] = data['date_added'].str[:4]
        data['month_added'] = data['date_added'].str[5:7]

        data['year_updated'] = data['date_updated'].str[:4]
        data['month_updated'] = data['date_updated'].str[5:7]


        # Convert 'month_added' and 'month_updated' to integers
        data['year_updated'] = data['year_updated'].astype(int)
        data['month_updated'] = data['month_updated'].astype(int)


        # Convert 'month_added' and 'month_updated' to integers
        data['year_added'] = data['year_added'].astype(int)
        data['month_added'] = data['month_added'].astype(int)

    return train_data, test_data

In [5]:


def log_data(data, prefix):
    reviews_added_per_month = data.groupby(['year_added', 'month_added']).size().reset_index(name='reviews_added')

    # Group by 'year_updated' and 'month_updated', count the occurrences, and reset index
    reviews_updated_per_month = data.groupby(['year_updated', 'month_updated']).size().reset_index(name='reviews_updated')

    # Create a new column 'month-year'
    reviews_added_per_month['month-year'] = reviews_added_per_month.apply(lambda x: f"{x['month_added']}-{x['year_added']}", axis=1)
    reviews_updated_per_month['month-year'] = reviews_updated_per_month.apply(lambda x: f"{x['month_updated']}-{x['year_updated']}", axis=1)

    # Plotting bar plot
    plt.figure(figsize=(14, 6))
    plt.bar(reviews_added_per_month['month-year'], reviews_added_per_month['reviews_added'], color='blue', alpha=0.7, label=f'Reviews Added ({prefix.capitalize()} Data)')
    plt.bar(reviews_updated_per_month['month-year'], reviews_updated_per_month['reviews_updated'], color='orange', alpha=0.7, label=f'Reviews Updated ({prefix.capitalize()} Data)')
    plt.xlabel('Month-Year')
    plt.ylabel('Number of Reviews')
    plt.title(f'Comparison of Reviews Added and Updated per Month-Year ({prefix.capitalize()} Data)')
    plt.xticks(rotation=45)
    plt.gca().set_xticks(plt.gca().get_xticks()[::2])
    plt.legend()
    plt.tight_layout()

    # Save plot as image
    plot_filename = f"{prefix}_Comparison of Reviews Added and Updated per Month-Year_plot.png"
    plt.savefig(plot_filename)

    # Log the image to Weights & Biases
    wandb.log({f"number_of_reviews_changes_over_time_on_{prefix}_data": wandb.Image(plot_filename)})

    # Close the plot
    plt.close()
    plt.figure(figsize=(8, 6))
    plt.boxplot(data['days_taken'], vert=False)  # vert=False to plot horizontally
    plt.xlabel('Days Taken')
    plt.title('Box Plot of Days Taken before removing outlier' )

    # Save and log plot
    plot_filename = f"{prefix}_Days_Taken.png"
    plt.savefig(plot_filename)
    wandb.log({f"Box Plot of Days Taken{prefix}_data": wandb.Image(plot_filename)})

    # Show plot
    plt.show()
    plt.close()
    Q1 = np.percentile(data['days_taken'], 25)
    Q3 = np.percentile(data['days_taken'], 75)

    # Calculate IQR
    IQR = Q3 - Q1

    # Define lower and upper bounds for outliers
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Filter out outliers
    filtered_data = [x for x in data['days_taken'] if x >= lower_bound and x <= upper_bound]

    # Plot histogram of filtered data
    plt.figure(figsize=(8, 6))
    plt.hist(filtered_data, bins=5, edgecolor='black')  # Adjust the number of bins as needed
    plt.xlabel('Days Taken')
    plt.ylabel('Frequency')
    plt.title('Histogram of Days Taken (without outliers)')
    plt.grid(True)
    

    # Save and log plot
    plot_filename = f"{prefix} Histogram of Days Taken (without outliers).png"
    plt.savefig(plot_filename)
    wandb.log({f"histogram Plot of Days Taken{prefix}_data": wandb.Image(plot_filename)})
    plt.show()



In [6]:
import matplotlib.pyplot as plt
import wandb

def log_histogram(data, column_name, title, prefix, log_to_wandb=True, log_to_matplotlib=True):

    column_values = data[column_name].values.tolist()
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.hist(column_values)
    ax.set_title(title)
    ax.set_xlabel(column_name)
    ax.set_ylabel('Frequency')
    ax.grid(True)
    
    if log_to_matplotlib:
        plt.show()
    filename = prefix + "_" + title + ".png"
    fig.savefig(filename)
    
    if log_to_wandb:
        wandb.log({f"{prefix}_{column_name}": wandb.Image(filename)})
    
    plt.close(fig)

def log_bar(data, x_column, y_column, title, prefix, log_to_wandb=True, log_to_matplotlib=True):
    x_values = data[x_column].values.tolist()
    y_values = data[y_column].values.tolist()
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.bar(x_values, y_values)
    ax.set_title(title)
    ax.set_xlabel(x_column)
    ax.set_ylabel(y_column)
    ax.grid(True)
    
    if log_to_matplotlib:
        plt.show()
    filename = prefix + "_" + title + ".png"
    fig.savefig(filename)
    
    if log_to_wandb:
        wandb.log({f"{prefix}_{x_column}_vs_{y_column}": wandb.Image(filename)})
    
    plt.close(fig)



In [7]:
def count_labels(data, column_name ):
    label_counts = data[column_name].value_counts().reset_index()
    label_counts.columns = ['label', 'count']
    return label_counts

def log_label_counts(data, column_name, prefix, title,log_to_wandb=True, log_to_matplotlib=False, save_path=None):
    label_counts = count_labels(data, column_name)
    plt.bar(label_counts['label'], label_counts['count'])
    plt.title("Distribution of Labels")
    plt.xlabel("Labels")
    plt.ylabel("Count")
    if log_to_matplotlib:
        plt.show()
    plt.savefig(title+".png")
    if log_to_wandb:
        wandb.log({f"{prefix}_label_counts": wandb.Image(title+".png")})
    plt.close()


In [8]:



# Example usage:
TRAIN_PATH = '/kaggle/input/goodreads-books-reviews-290312/goodreads_train.csv'
TEST_PATH = '/kaggle/input/goodreads-books-reviews-290312/goodreads_test.csv'
params = {'WANDB_PROJECT': 'review_classifier',
          'ENTITY': 'lilouuch',
          'CLASSES': {i: c for i, c in enumerate(range(0, 6))},
          'RAW_DATA_AT': 'Goodreads_Books_Review_Rating',
          'PROCESSED_DATA_AT': 'Goodreads_Books_Review_Rating_load'}



In [9]:

run =  wandb.init(project=params['WANDB_PROJECT'], entity=params['ENTITY'], job_type="upload")

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [10]:
train_data, test_data=log_dataset_artifact(TRAIN_PATH, TEST_PATH)

In [11]:
train_data, test_data=handle_missing_values(train_data, test_data)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['started_at'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['read_at'].fillna('', inplace=True)
  data['started_at'] = pd.to_datetime(data['started_at'], format='%a %b %d %H:%M:%S %z %Y', errors='coerce')
  data['read_at'] = pd.to_datetime(dat

In [12]:
log_data(train_data,'train')

In [13]:
train_data['text_length_train'] = train_data['review_text'].apply(lambda x: len(x))
test_data['text_length_test'] = test_data['review_text'].apply(lambda x: len(x))

In [14]:
log_histogram(train_data, 'text_length_train', 'Distribution of Review Text Lengths (Train Data)', 'train')
log_histogram(test_data, 'text_length_test', 'Distribution of Review Text Lengths (Test Data)', 'test')

log_bar(train_data, 'rating', 'n_comments', 'Rating vs Number of Comments (Train Data)', 'train')
log_bar(train_data, 'rating', 'n_votes', 'Rating vs Number of Votes (Train Data)', 'train')


In [15]:
train_label_counts = count_labels(train_data, 'rating')

log_label_counts(train_data, 'rating', 'train', 'distribution of labels')
