In [33]:
# Import necessary modules
import pandas as pd
import numpy as np
import json
from pathlib import Path

In [34]:
# Point at the RFdataset folder and list each entry with its size in MB
dataset_path = Path("RFdataset")
print("Files in dataset:")
for entry in dataset_path.iterdir():
    # st_size is in bytes; convert to mebibytes for display
    size_mb = entry.stat().st_size / (1024*1024)
    print(f"  - {entry.name} ({size_mb:.2f} MB)")


Files in dataset:
  - reddit_data_counts.json (0.00 MB)
  - reddit_dataset.json (11.27 MB)


In [35]:
# Explore JSON structure from reddit_dataset.json
json_file = dataset_path / "reddit_dataset.json"

if json_file.exists():
    print(f"Loading: {json_file.name}")
    print(f"File size: {json_file.stat().st_size / (1024*1024):.2f} MB")

    # Parse the file, then release the handle before doing any inspection
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # The JSON root may be an object or an array (the loading cell accepts
    # both), so only ask for keys when the root really is an object.
    root_is_object = isinstance(data, dict)
    if root_is_object:
        print("\nTop-level keys:", list(data.keys()))
    else:
        print(f"\nTop-level value is a {type(data).__name__} (no keys to list)")

    # Check if it has a "posts" key
    if root_is_object and 'posts' in data:
        print(f"\nFound 'posts' array with {len(data['posts'])} items")
        if len(data['posts']) > 0:
            sample_post = data['posts'][0]
            print("\nSample post structure:")
            print(json.dumps(sample_post, indent=2)[:1000])  # First 1000 chars
            # Guard: a non-object element has no keys to report
            if isinstance(sample_post, dict):
                print("\nKeys in post:", list(sample_post.keys()))
    else:
        print("\nFull data structure (first 1000 chars):")
        print(json.dumps(data, indent=2)[:1000])
else:
    print(f"File not found: {json_file}")
    print("Available files:")
    for file in dataset_path.iterdir():
        print(f"  - {file.name}")


Loading: reddit_dataset.json
File size: 11.27 MB

Top-level keys: ['posts']

Found 'posts' array with 6187 items

Sample post structure:
{
  "title": "Which country in the world suffers most from wage inequality and why?",
  "body": "Shall we discuss this topic in the comments? I'm curious to hear your opinions. I have written my own thoughts below.\r  \n\r  \nMany sources and studies highlight countries like Brazil, South Africa, India, and the United States as standing out in terms of income inequality. Inequality factors in these countries can include high income inequality, challenging working conditions faced by low-wage workers, racial or ethnic discrimination, gender inequality, and social class disparities.\r  \n\r  \nThe causes of income inequality in these countries can be complex and multifaceted. For example, high income inequality can sometimes reflect a wide economic gap between the rich and the poor. Challenging working conditions experienced by low-wage workers can aris

In [36]:
# Load JSON data from reddit_dataset.json
json_file = dataset_path / "reddit_dataset.json"

print(f"Loading {json_file.name}...")
# Read and parse in one step; the file is closed as soon as the text is read
data = json.loads(json_file.read_text(encoding='utf-8'))

# Normalise the payload into a list of records, whatever the root looks like
if 'posts' in data:
    # Extract posts array
    raw_data = data['posts']
    print(f"Loaded {len(raw_data)} posts from 'posts' array")
elif isinstance(data, list):
    raw_data = data
    print(f"Loaded {len(raw_data)} items from JSON array")
else:
    # A single object becomes a one-element list
    raw_data = [data]
    print(f"Loaded 1 item from JSON object")

print(f"\nTotal records loaded: {len(raw_data)}")


Loading reddit_dataset.json...
Loaded 6187 posts from 'posts' array

Total records loaded: 6187


In [37]:
# Build the working DataFrame from the loaded records and summarise it
df = pd.DataFrame(raw_data)
frame_shape, column_names = df.shape, list(df.columns)
print(f"DataFrame shape: {frame_shape}")
print(f"\nColumns: {column_names}")
print(f"\nFirst few rows:")
# Last expression of the cell: rendered as a rich table
df.head()


DataFrame shape: (6187, 6)

Columns: ['title', 'body', 'url', 'post_score', 'comment', 'comment_score']

First few rows:


Unnamed: 0,title,body,url,post_score,comment,comment_score
0,Which country in the world suffers most from w...,Shall we discuss this topic in the comments? I...,https://www.reddit.com/r/business/comments/14e...,3,"Close your eyes, and you can choose one of the...",5
1,Passion,Does your work drive you? Or is it something y...,https://www.reddit.com/r/business/comments/14e...,1,"Wow, you and I are the same person. Haha, exce...",1
2,Biz Savings Interest Rates,I‚Äôm assuming the answer is obviously that the ...,https://www.reddit.com/r/business/comments/14e...,2,"I think your assumption is correct, businesses...",1
3,How much is international ocean freight?,,https://www.reddit.com/r/business/comments/14e...,1,Way too vague to be answered.\nFrom where to w...,1
4,Hello everyone I want to start a low budget bu...,,https://www.reddit.com/r/business/comments/14e...,1,Thanks üôè,2


In [41]:
# Show per-column dtypes, then summary statistics for the numeric columns
dtype_summary = df.dtypes
print("Data types:")
print(dtype_summary)
print("\nBasic statistics:")
# Last expression of the cell: rendered as a rich table
df.describe()


Data types:
title            object
body             object
url              object
post_score        int64
comment          object
comment_score     int64
dtype: object

Basic statistics:


Unnamed: 0,post_score,comment_score
count,6187.0,6187.0
mean,517.082916,191.261516
std,1970.486668,1102.608781
min,0.0,-1547.0
25%,1.0,4.0
50%,13.0,16.0
75%,315.0,106.0
max,42256.0,26998.0


In [43]:
# Report IQR-based outliers (outside Q1 - 1.5*IQR .. Q3 + 1.5*IQR) per score column
numeric_cols = ['post_score', 'comment_score']
for col in numeric_cols:
    # Skip columns that are not present in this dataset
    if col not in df.columns:
        continue

    values = df[col]
    Q1, Q3 = values.quantile(0.25), values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Rows strictly below the lower fence or strictly above the upper fence
    is_outlier = (values < lower_bound) | (values > upper_bound)
    outliers = df[is_outlier]

    print(f"\n{col}:")
    print(f"  Q1: {Q1:.2f}, Q3: {Q3:.2f}, IQR: {IQR:.2f}")
    print(f"  Outlier bounds: [{lower_bound:.2f}, {upper_bound:.2f}]")
    print(f"  Number of outliers: {len(outliers)} ({len(outliers)/len(df)*100:.2f}%)")
    print(f"  Min: {values.min()}, Max: {values.max()}")


post_score:
  Q1: 1.00, Q3: 315.00, IQR: 314.00
  Outlier bounds: [-470.00, 786.00]
  Number of outliers: 702 (11.35%)
  Min: 0, Max: 42256

comment_score:
  Q1: 4.00, Q3: 106.00, IQR: 102.00
  Outlier bounds: [-149.00, 259.00]
  Number of outliers: 692 (11.18%)
  Min: -1547, Max: 26998


In [53]:
# Create features from raw data for Random Forest
def _clean_text(series):
    """Return ``series`` as strings, mapping missing values to '' (not 'None'/'nan')."""
    return series.fillna('').astype(str)


def extract_features(df, counts_file=None):
    """Derive numeric features from a frame of Reddit posts.

    Every step is optional and only runs when the columns it needs are
    present. The input frame is never modified; a copy with extra columns is
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw records. Recognised columns: ``url``, ``title``, ``body``,
        ``post_score`` (or ``score``), ``comment_score``, ``comment`` and an
        optional pre-existing ``subreddit``.
    counts_file : str or pathlib.Path, optional
        JSON file used to look up ``subreddit_frequency``. Defaults to
        ``dataset_path / "reddit_data_counts.json"``, where ``dataset_path``
        is the notebook-level dataset folder. When the file does not exist,
        frequencies are counted from ``df`` itself.
        NOTE(review): the JSON is assumed to map subreddit name -> count;
        its schema is not visible here, so confirm against the file.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the engineered columns appended.

    Notes
    -----
    Missing text is treated as an empty string, so length/flag features are 0
    for missing values instead of reflecting the strings "None"/"nan".
    NOTE(review): ``score``, ``comment_to_score_ratio`` and
    ``total_engagement`` are built from the score columns; if one of those is
    the prediction target these features leak it. Confirm before training.
    """
    features_df = df.copy()

    # 1. Extract subreddit from URL (NaN when the URL has no /r/<name>/ part)
    if 'url' in features_df.columns:
        features_df['subreddit'] = _clean_text(features_df['url']).str.extract(
            r'/r/([^/]+)/', expand=False
        )

    # 2. Text-based features (combine title and body)
    has_title = 'title' in features_df.columns
    has_body = 'body' in features_df.columns
    title_text = _clean_text(features_df['title']) if has_title else None
    body_text = _clean_text(features_df['body']) if has_body else None

    if has_title and has_body:
        features_df['combined_text'] = title_text + ' ' + body_text
    elif has_title:
        features_df['combined_text'] = title_text
    elif has_body:
        features_df['combined_text'] = body_text

    if has_title or has_body:
        combined = features_df['combined_text']

        # Character and word counts
        features_df['text_length'] = combined.str.len()
        features_df['word_count'] = combined.str.split().str.len()
        # Strip first: the title/body separator alone must not count as text
        features_df['has_text'] = (combined.str.strip().str.len() > 0).astype(int)

        # Title-specific features
        if has_title:
            features_df['title_length'] = title_text.str.len()
            features_df['title_word_count'] = title_text.str.split().str.len()

        # Body-specific features
        if has_body:
            features_df['body_length'] = body_text.str.len()
            features_df['body_word_count'] = body_text.str.split().str.len()

        # Text patterns (use regex=False to treat ? and ! as literal characters)
        features_df['has_question_mark'] = combined.str.contains('?', regex=False, na=False).astype(int)
        features_df['has_exclamation'] = combined.str.contains('!', regex=False, na=False).astype(int)
        features_df['uppercase_ratio'] = combined.apply(
            lambda text: sum(map(str.isupper, text)) / len(text) if text else 0
        )

    # 3. Engagement metrics
    if 'post_score' in features_df.columns:
        features_df['score'] = pd.to_numeric(features_df['post_score'], errors='coerce').fillna(0)
    elif 'score' in features_df.columns:
        features_df['score'] = pd.to_numeric(features_df['score'], errors='coerce').fillna(0)

    if 'comment_score' in features_df.columns:
        features_df['comment_score'] = pd.to_numeric(features_df['comment_score'], errors='coerce').fillna(0)
        # The combined metrics need a post score; skip them when there is none
        if 'score' in features_df.columns:
            ratio = features_df['comment_score'] / (features_df['score'] + 1)
            # score == -1 makes the denominator 0; keep the feature finite
            features_df['comment_to_score_ratio'] = ratio.replace([np.inf, -np.inf], np.nan).fillna(0)
            features_df['total_engagement'] = features_df['score'] + features_df['comment_score']

    if 'comment' in features_df.columns:
        comment_text = _clean_text(features_df['comment'])
        features_df['has_comment'] = (comment_text.str.len() > 0).astype(int)
        features_df['comment_length'] = comment_text.str.len()

    # 4. Subreddit features (encode categorical)
    if 'subreddit' in features_df.columns:
        # Resolve the default lazily so the notebook global is only needed
        # when the caller did not supply a path.
        if counts_file is None:
            counts_file = dataset_path / "reddit_data_counts.json"
        counts_file = Path(counts_file)

        if counts_file.exists():
            # Load subreddit frequency from the counts JSON
            with open(counts_file, 'r', encoding='utf-8') as f:
                subreddit_freq_dict = json.load(f)
            features_df['subreddit_frequency'] = features_df['subreddit'].map(subreddit_freq_dict).fillna(0)
            print(f"Loaded subreddit frequencies from {counts_file.name}")
        else:
            # Fallback: calculate from dataset if JSON file not found.
            # Unknown subreddits get 0, matching the JSON branch.
            subreddit_counts = features_df['subreddit'].value_counts()
            features_df['subreddit_frequency'] = features_df['subreddit'].map(subreddit_counts).fillna(0)
            print(f"Warning: {counts_file.name} not found. Using calculated frequencies.")

        # Numeric encoding for Random Forest: codes follow first appearance
        # in the dataset, and -1 marks a missing subreddit.
        unique_subreddits = features_df['subreddit'].dropna().unique()
        subreddit_encoding = {name: code for code, name in enumerate(unique_subreddits)}
        features_df['subreddit_encoded'] = features_df['subreddit'].map(subreddit_encoding).fillna(-1)

    return features_df

# Run the feature engineering and report which columns it added
df_features = extract_features(df)
n_original, n_engineered = len(df.columns), len(df_features.columns)
# Columns present after engineering that the raw frame did not have
new_cols = [name for name in df_features.columns if name not in df.columns]

print(f"Original columns: {n_original}")
print(f"Features after engineering: {n_engineered}")
print(f"\nNew features created:")
print(new_cols)


Loaded subreddit frequencies from reddit_data_counts.json
Original columns: 6
Features after engineering: 25

New features created:
['subreddit', 'combined_text', 'text_length', 'word_count', 'has_text', 'title_length', 'title_word_count', 'body_length', 'body_word_count', 'has_question_mark', 'has_exclamation', 'uppercase_ratio', 'score', 'comment_to_score_ratio', 'total_engagement', 'has_comment', 'comment_length', 'subreddit_frequency', 'subreddit_encoded']


In [54]:
# Handle missing values
missing_before = df_features.isnull().sum()
print("Missing values before handling:")
print(missing_before[missing_before > 0])

# Fill missing values with a placeholder that matches each column's type.
# A blanket fillna(0) would put the integer 0 into string columns such as
# 'subreddit', leaving a mixed str/int column that breaks sorting, .str
# accessors and label encoders downstream.
MISSING_TEXT = "unknown"
fill_values = {
    col: 0 if pd.api.types.is_numeric_dtype(dtype) else MISSING_TEXT
    for col, dtype in df_features.dtypes.items()
}
df_features = df_features.fillna(value=fill_values)

missing_after = df_features.isnull().sum()
print("\nMissing values after handling:")
print(missing_after[missing_after > 0])


Missing values before handling:
subreddit    1258
dtype: int64

Missing values after handling:
Series([], dtype: int64)
