# Importing libraries and loading data

In [None]:
!sudo apt install python3-p

[sudo] password for jaskaran: 

In [2]:
# Import necessary libraries
import numpy as np  # Import NumPy for handling numerical operations
import pandas as pd  # Import Pandas for data manipulation and analysis
import warnings  # Import Warnings to suppress unnecessary warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings from the
# libraries used below, so API removals will only show up as hard errors.
warnings.filterwarnings("ignore")

# Import SHAP for interpreting model predictions
import shap

# Import matplotlib for data visualization
import matplotlib.pyplot as plt

# Import CatBoostRegressor for building a regression model
from catboost import Pool, CatBoostRegressor

# Import mean_squared_error for evaluating model performance
from sklearn.metrics import mean_squared_error

# Import train_test_split for splitting the data into training and testing sets
from sklearn.model_selection import train_test_split

# Import RareLabelEncoder from feature_engine.encoding for encoding categorical features
from feature_engine.encoding import RareLabelEncoder

# Import CountVectorizer from sklearn.feature_extraction.text for text feature extraction
from sklearn.feature_extraction.text import CountVectorizer

# Import ast and re for working with text and regular expressions
import ast
import re

# Set Pandas options to display a maximum of 1000 rows
pd.set_option('display.max_rows', 1000)

ModuleNotFoundError: No module named 'pandas'

In [None]:
%%time
df = pd.read_csv('/kaggle/input/huggingface-co-model-catalogue/Models.csv') # Reads the dataset from a CSV file into a Pandas DataFrame
item0 = df.shape[0]  # Stores the initial number of rows in the DataFrame
df = df.drop_duplicates()  # Removes duplicate rows from the DataFrame
item1 = df.shape[0]  # Stores the number of rows after removing duplicates
print(f"There are {item0-item1} duplicates found in the dataset")  # Prints the number of duplicates that were removed

In [None]:
# check for my own results there
author_name = "dima806"
author_df = df[df['author'] == author_name]
print(author_df.shape)
# Cap the sample size at the number of matching rows: DataFrame.sample raises a
# ValueError when asked for more rows than exist (sampling without replacement).
author_df.sample(min(5, len(author_df))).T

In [None]:
# Summary statistics of the author's rows, transposed so each column is shown as a row.
author_df.describe().T

In [None]:
# log10-transform downloads; the "+ 1" keeps zero-download rows finite.
# Vectorised over the whole column instead of a per-element apply.
df['log10_downloads'] = np.log10(1 + df['downloads'])
# Keep only rows with a defined target. .copy() makes the result an independent
# frame, so the column assignment below is not written onto a filtered slice.
df = df[~df['log10_downloads'].isnull()].copy()

# Year of the last modification, used as a feature.
df['lastModified_year'] = pd.to_datetime(df['lastModified']).dt.year


# Select only specific columns of interest
selected_cols = ['log10_downloads', 'author', 'gated',
       'authorData.type', 'authorData.isPro', 'authorData.isHf',
       'pipeline_tag', 'lastModified_year']
# .copy() again: later cells assign into these columns in place.
df = df[selected_cols].copy()

print(df.shape)  # Prints the dimensions (rows and columns) of the filtered DataFrame
df.sample(10).T  # Displays a random sample of 10 rows, transposed for better visibility

In [None]:
# Display the column labels currently present in df.
df.columns

In [None]:
# Number of distinct values in each column of df.
df.nunique()

In [None]:
# Summary statistics of df, transposed so each column is shown as a row.
df.describe().T

# Data transformation

In [None]:
# Display the column labels of 'df'.

df.columns

In [None]:
# Peek at five random rows of df, transposed so each sampled row is shown as a column.
df.sample(5).T

In [None]:
%%time

# Select the main label.
main_label = 'log10_downloads'

# Set up a rare label encoder for selected columns.
for col in df.columns:
    if col != main_label:
        df[col] = df[col].fillna('None').astype(str)
        encoder = RareLabelEncoder(n_categories=1, max_n_categories=500, replace_with='Other', tol=20.0 / df.shape[0])
        df[col] = encoder.fit_transform(df[[col]])

print(df.shape)  # Print the shape of the resulting DataFrame.
df.sample(10).T  # Display a sample of 10 rows, transposed for easier readability.

In [None]:
# Print the column dtypes and non-null counts of df.
df.info()

# Machine learning

In [None]:
%%time
# Initialize data
# Extract the values of the 'main_label' column and reshape it into a 1D array as 'y'
y = df[main_label].values.reshape(-1,)
# Create the feature matrix 'X' by dropping the 'main_label' column from the DataFrame 'df'
X = df.drop([main_label], axis=1)

# Identify categorical columns in the DataFrame 'df'
# These columns contain non-numeric data
cat_cols = df.select_dtypes(include=['object']).columns

# Create a list of indices for categorical columns in the feature matrix 'X'
cat_cols_idx = [list(X.columns).index(c) for c in cat_cols]

# Split the data into training and testing sets
# - 'X_train' and 'y_train' will contain the training features and labels, respectively
# - 'X_test' and 'y_test' will contain the testing features and labels, respectively
# The split is done with a 50% test size, a random seed of 0, and stratification based on the selected column
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0, stratify=df[['author']])

# Print the dimensions of the training and testing sets
# This provides insight into the sizes of the datasets
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
%%time

# Initialize the training and testing data pools using CatBoost's Pool class
train_pool = Pool(X_train, 
                  y_train, 
                  cat_features=cat_cols_idx)  # Create a training data pool with categorical features
test_pool = Pool(X_test,
                 y_test,
                 cat_features=cat_cols_idx)  # Create a testing data pool with categorical features

# Specify the training parameters for the CatBoostRegressor model
model = CatBoostRegressor(iterations=1500,    # Number of boosting iterations
                          depth=5,           # Maximum depth of trees in the ensemble
                          verbose=0,         # Set verbosity level to 0 (no output during training)
                          learning_rate=0.08,  # Learning rate for gradient boosting
                          early_stopping_rounds=100, # Early stopping rounds
                          loss_function='RMSE')  # Loss function to optimize (Root Mean Squared Error)

# Train the CatBoostRegressor model on the training data
model.fit(train_pool, eval_set=test_pool)

# Make predictions using the trained model on both the training and testing data
y_train_pred = model.predict(train_pool)  # Predictions on the training data
y_test_pred = model.predict(test_pool)    # Predictions on the testing data

# Calculate and print the Root Mean Squared Error (RMSE) scores for training and testing data
rmse_train = mean_squared_error(y_train, y_train_pred, squared=False)  # RMSE for training data
rmse_test = mean_squared_error(y_test, y_test_pred, squared=False)     # RMSE for testing data

# Print the rounded RMSE scores
print(f"RMSE score for train {round(rmse_train, 4)} dex, and for test {round(rmse_test, 4)} dex")

In [None]:
# Baseline RMSE scores: a constant predictor that always outputs the mean of the
# TRAINING target, evaluated on both the training and the test targets.
baseline_pred = np.mean(y_train)

# RMSE is computed as the square root of the MSE. The `squared=False` keyword of
# mean_squared_error is deprecated and rejected by recent scikit-learn releases.
rmse_bs_train = mean_squared_error(y_train, [baseline_pred] * len(y_train)) ** 0.5
rmse_bs_test = mean_squared_error(y_test, [baseline_pred] * len(y_test)) ** 0.5

# Print the rounded baseline RMSE scores for both the training and test datasets.
print(f"RMSE baseline score for train {round(rmse_bs_train, 4)} dex, and for test {round(rmse_bs_test, 4)} dex")

# Explanations with SHAP values

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
%%time
# Load SHAP's JavaScript support for notebook visualisations
shap.initjs()

# Create a TreeExplainer for 'model' (the CatBoostRegressor fitted earlier in this notebook)
ex = shap.TreeExplainer(model)

# Calculate SHAP values for the 'X_test' data using the TreeExplainer
shap_values = ex.shap_values(X_test)

# Generate a summary plot to visualize the impact of features on model predictions
shap.summary_plot(shap_values, X_test)

In [None]:
# Base value of the SHAP explainer: the expected model output, expressed in the
# units of the target, i.e. log10(1 + downloads).
expected_values = ex.expected_value

# The target was built as log10(1 + downloads), so the inverse transform is
# 10**y - 1 (the "- 1" undoes the offset added before taking the logarithm).
# Note that back-transforming a mean of logs gives a geometric-style average,
# not the arithmetic mean of the raw download counts.
print(f"Average predicted downloads is {round(10**expected_values - 1):,}")

# Same back-transform applied to the mean of the actual test targets.
print(f"Average actual downloads is {round(10**np.mean(y_test) - 1):,}")

In [None]:
def show_shap(col, shap_values=shap_values, label=main_label, X_test=X_test, ylabel='dex'):
    """Plot and print the per-category SHAP contribution of a single feature.

    For every distinct value of ``col`` in ``X_test`` the mean, standard deviation
    and count of that feature's SHAP values are computed, plotted as points with
    error bars, and printed as a table sorted by mean SHAP value (descending).

    Parameters:
      - col: name of the feature (a column of ``X_test``) to summarise.
      - shap_values: 2-D array of SHAP values, indexed as [row, feature position],
        with feature positions following the column order of ``X_test``.
      - label: target name shown in the plot title.
      - X_test: DataFrame the SHAP values were computed for.
      - ylabel: label for the y-axis of the plot (default is 'dex').

    Returns:
      None. The function only plots and prints.

    Note: the defaults for ``shap_values``, ``label`` and ``X_test`` are captured
    when the function is defined; pass them explicitly if those globals are
    rebuilt afterwards.
    """
    # Attach this feature's SHAP values to a copy of the test data.
    df_infl = X_test.copy()
    df_infl['shap_'] = shap_values[:, df_infl.columns.tolist().index(col)]

    # Group once and derive all three statistics from the same grouping, so the
    # table columns are aligned by category label rather than by list position.
    grouped = df_infl.groupby(col)['shap_']
    df_res = pd.DataFrame({
        'gain': grouped.mean().round(4),
        'gain_std': grouped.std().round(4),
        'count': grouped.count(),
    }).sort_values('gain', ascending=False)
    df_res.index.name = 'col'

    # Replace "$" by "*" to use in matplotlib; str() keeps this safe for
    # non-string category labels, and the index name is preserved.
    if any('$' in str(idx) for idx in df_res.index):
        print('Replacing "$" by "*" to use in matplotlib')
        df_res.index = pd.Index(
            [str(c).replace('$', '*') for c in df_res.index],
            name=df_res.index.name,
        )

    # Mean SHAP value per category, with the standard deviation as error bar.
    plt.figure(figsize=(30, 20))
    plt.errorbar(df_res.index, df_res['gain'].values, yerr=df_res['gain_std'].values, fmt="o", color="r")

    # Set plot title and axis labels.
    plt.title(f'SHAP values for column {col}, label {label}')
    plt.ylabel(ylabel)
    plt.tick_params(axis="x", rotation=90)

    # Display the plot and the table with results.
    plt.show()
    print(df_res)

# Produce the SHAP plot and table for every feature of the test data.
for col in X_test.columns:
    # Column name framed by one blank line above and below.
    print(f"\n{col}\n")

    # Plot and print the per-category SHAP summary for this feature.
    show_shap(col, shap_values=shap_values, label=main_label, X_test=X_test)