In [None]:
import os

import dask.dataframe as dd  # Import Dask DataFrame
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Purpose: load one Amazon review category as a Dask DataFrame, report the
# sparsity of its user-item interaction matrix, and plot the per-user /
# per-item interaction counts plus the rating distribution (when present).

# --- Configuration ---
AMAZON_DATA_DIR = './data/amazon/'
CATEGORY_TO_ANALYZE = 'Electronics'  # You can change this to any category you've downloaded
REVIEW_FILE = os.path.join(AMAZON_DATA_DIR, f'{CATEGORY_TO_ANALYZE}_reviews.parquet')

print(f"Analyzing review data for category: {CATEGORY_TO_ANALYZE}")
print(f"Loading data from: {REVIEW_FILE}")

# Load the review data using Dask
try:
    # Check the path ourselves so a missing file always takes the friendly
    # branch below.
    # NOTE(review): the exception dd.read_parquet raises for a missing path
    # may vary with the Dask / parquet-engine version -- confirm if relied on.
    if not os.path.exists(REVIEW_FILE):
        raise FileNotFoundError(REVIEW_FILE)

    # Dask reads Parquet efficiently. It creates a Dask DataFrame, not loading
    # the data into memory yet.
    reviews_ddf = dd.read_parquet(REVIEW_FILE)
    print(f"Successfully created Dask DataFrame for {CATEGORY_TO_ANALYZE} reviews.")
    print("Dask DataFrame head (first few rows, computed):")
    print(reviews_ddf.head())
    print("Dask DataFrame info:")
    reviews_ddf.info()
except FileNotFoundError:
    print(f"Error: Review file not found at {REVIEW_FILE}. Please ensure you have downloaded and processed the data.")
    reviews_ddf = None

if reviews_ddf is not None:
    # --- Sparsity Calculation with Dask ---
    # Assuming 'reviewerID' is user ID and 'asin' is item ID
    user_col = 'reviewerID'
    item_col = 'asin'

    # Fail early with a readable message instead of a bare KeyError from Dask.
    missing_cols = [col for col in (user_col, item_col) if col not in reviews_ddf.columns]
    if missing_cols:
        raise KeyError(
            f"Expected column(s) {missing_cols} not found in review data; "
            f"available columns: {list(reviews_ddf.columns)}"
        )

    # Dask operations are lazy; .compute() triggers the actual calculation.
    # One value_counts() per column feeds both the unique counts and the
    # histograms below, so each column is scanned once instead of twice.
    # value_counts() drops missing values by default, as nunique() does.
    user_counts = reviews_ddf[user_col].value_counts().compute()
    item_counts = reviews_ddf[item_col].value_counts().compute()
    unique_users = len(user_counts)
    unique_items = len(item_counts)

    # len() on a Dask DataFrame is NOT lazy: it triggers computation of the
    # row count.
    actual_interactions = len(reviews_ddf)

    total_possible_interactions = unique_users * unique_items
    if total_possible_interactions > 0:
        sparsity = 1 - (actual_interactions / total_possible_interactions)
    else:
        # Empty dataset: sparsity is undefined, avoid dividing by zero.
        sparsity = float('nan')

    print(f"\n--- Sparsity Analysis for {CATEGORY_TO_ANALYZE} ---")
    print(f"Number of unique users: {unique_users}")
    print(f"Number of unique items: {unique_items}")
    print(f"Number of actual interactions: {actual_interactions}")
    print(f"Total possible interactions (users * items): {total_possible_interactions}")
    print(f"Sparsity: {sparsity:.4f} ({sparsity:.2%})\n")

    # --- Basic Visualizations ---
    # Only the aggregated per-user / per-item counts (pandas Series computed
    # above) are plotted; the raw review rows are never loaded into pandas.
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    panels = (
        (axes[0], user_counts, 'User'),
        (axes[1], item_counts, 'Item'),
    )
    for ax, counts, entity in panels:
        ax.set_title(f'Interactions per {entity}')
        ax.set_xlabel('Number of Interactions')
        ax.set_ylabel(f'Number of {entity}s')
        if counts.empty:
            continue  # nothing to draw; leave the labelled axes blank
        counts.hist(bins=50, ax=ax)
        ax.set_yscale('log')  # Log scale for better visibility of long tail

    fig.tight_layout()
    plt.show()

    # --- Check for 'overall' rating column ---
    if 'overall' in reviews_ddf.columns:
        print("\nDistribution of 'overall' ratings:")
        # Aggregate with Dask once; the small result is reused for both the
        # printed proportions and the bar chart, so the full rating column is
        # never materialised in pandas.
        rating_counts = reviews_ddf['overall'].value_counts().compute().sort_index()
        print(rating_counts / rating_counts.sum())

        fig, ax = plt.subplots(figsize=(6, 4))
        if not rating_counts.empty:
            sns.barplot(x=rating_counts.index, y=rating_counts.values, ax=ax)
        ax.set_title('Distribution of Overall Ratings')
        ax.set_xlabel('Rating')
        ax.set_ylabel('Count')
        plt.show()
    else:
        print("\n'overall' rating column not found. Assuming implicit feedback.")


outputs": [],
    58    "source": [
    59     "if reviews_ddf is not None:\n",
    60     "    # --- Sparsity Calculation with Dask ---\n",
    61     "    # Assuming 'reviewerID' is user ID and 'asin' is item ID\n",
    62     "    user_col = 'reviewerID'\n",
    63     "    item_col = 'asin'\n",
    64     "\n",
    65     "    # Dask operations are lazy. .compute() triggers the actual 
       calculation.\n",
    66     "    unique_users = reviews_ddf[user_col].nunique().compute()\n",
    67     "    unique_items = reviews_ddf[item_col].nunique().compute()\n",
    68     "    actual_interactions = len(reviews_ddf) # len() on Dask DataFrame is 
       also lazy, but fast for row count\n",
    69     "\n",
    70     "    total_possible_interactions = unique_users * unique_items\n",
    71     "    sparsity = 1 - (actual_interactions / total_possible_interactions)\n"
       ,
    72     "\n",
    73     "    print(f\"\\n--- Sparsity Analysis for {CATEGORY_TO_ANALYZE} ---\")\n"
       ,
    74     "    print(f\"Number of unique users: {unique_users}\")\n",
    75     "    print(f\"Number of unique items: {unique_items}\")\n",
    76     "    print(f\"Number of actual interactions: {actual_interactions}\")\n",
    77     "    print(f\"Total possible interactions (users * items): 
       {total_possible_interactions}\")\n",
    78     "    print(f\"Sparsity: {sparsity:.4f} ({sparsity:.2%})\\n\")\n",
    79     "\n",
    80     "    # --- Basic Visualizations (requires loading data into pandas for 
       plotting) ---\n",
    81     "    # For very large datasets, you might sample or compute histograms 
       with Dask directly\n",
    82     "    # For now, we'll compute value counts with Dask and then plot with 
       pandas/matplotlib\n",
    83     "\n",
    84     "    plt.figure(figsize=(12, 5))\n",
    85     "\n",
    86     "    plt.subplot(1, 2, 1)\n",
    87     "    reviews_ddf[user_col].value_counts().compute().hist(bins=50)\n",
    88     "    plt.title('Interactions per User')\n",
    89     "    plt.xlabel('Number of Interactions')\n",
    90     "    plt.ylabel('Number of Users')\n",
    91     "    plt.yscale('log') # Log scale for better visibility of long tail\n",
    92     "\n",
    93     "    plt.subplot(1, 2, 2)\n",
    94     "    reviews_ddf[item_col].value_counts().compute().hist(bins=50)\n",
    95     "    plt.title('Interactions per Item')\n",
    96     "    plt.xlabel('Number of Interactions')\n",
    97     "    plt.ylabel('Number of Items')\n",
    98     "    plt.yscale('log') # Log scale for better visibility of long tail\n",
    99     "\n",
   100     "    plt.tight_layout()\n",
   101     "    plt.show()\n",
   102     "\n",
   103     "    # --- Check for 'overall' rating column ---\n",
   104     "    if 'overall' in reviews_ddf.columns:\n",
   105     "        print(\"\\nDistribution of 'overall' ratings:\")\n",
   106     "        # Compute value counts with Dask, then convert to pandas Series 
       for printing\n",
   107     "        
       print(reviews_ddf['overall'].value_counts(normalize=True).compute().sort_index
       ())\n",
   108     "        plt.figure(figsize=(6, 4))\n",
   109     "        # For plotting, we might need to compute a smaller representation
       or sample\n",
   110     "        sns.countplot(x='overall', 
       data=reviews_ddf[['overall']].compute())\n",
   111     "        plt.title('Distribution of Overall Ratings')\n",
   112     "        plt.xlabel('Rating')\n",
   113     "        plt.ylabel('Count')\n",
   114     "        plt.show()\n",
   115     "    else:\n",
   116     "        print(\"\\n'overall' rating column not found. Assuming implicit 
       feedback.\")"
   117    ]
   118   }
   119  ],
   120  "metadata": {
   121   "kernelspec": {
   122    "display_name": "Python 3",
   123    "language": "python",
   124    "name": "python3"
   125   },
   126   "language_info": {
   127    "codemirror_mode": {
   128     "name": "ipython",
   129     "version": 3
   130    },
   131    "file_extension": ".py",
   132    "mimetype": "text/x-python",
   133    "name": "python",
   134    "nbconvert_exporter": "python",
   135    "pygments_lexer": "ipython3",
   136    "version": "3.9.18"
   137   }
   138  },
   139  "nbformat": 4,
   140  "nbformat_minor": 4
   141 }
