In [1]:
import pandas as pd

# Load the dataset
file_path = "dataset_reddit-scraper-task_2025-03-28_19-30-24-130.csv"
df = pd.read_csv(file_path)

# Display basic info and first few rows
df.info(), df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Columns: 156 entries, body to username
dtypes: float64(20), object(136)
memory usage: 1.2+ MB


(None,
                                                 body categories/0  \
 0                                                NaN          new   
 1  I'm sure many people here would have seen the ...          NaN   
 2  I actually value Apple's approach to making ev...          NaN   
 3  There will never be decent AI on any of these ...          NaN   
 4  There are better and better , tinier and tinie...          NaN   
 
           categories/1 category communityName                 createdAt  \
 0  ?include_over_18=on      NaN           NaN  2008-01-25T03:43:06.000Z   
 1                  NaN      NaN       r/apple  2025-03-27T18:54:22.000Z   
 2                  NaN    apple       r/apple  2025-03-28T12:33:58.000Z   
 3                  NaN    apple       r/apple  2025-03-28T13:48:10.000Z   
 4                  NaN    apple       r/apple  2025-03-28T15:25:45.000Z   
 
     dataType                                        description displayName  \
 0  community  An unofficial comm

In [None]:
# Selecting relevant columns
columns_to_keep = [
    "body", "title", "communityName", "createdAt", "dataType", "upVotes", 
    "upVoteRatio", "url", "username"
]
df_cleaned = df[columns_to_keep].copy()

# Convert createdAt to datetime
df_cleaned["createdAt"] = pd.to_datetime(df_cleaned["createdAt"], errors="coerce")

df_cleaned["upVotes"] = pd.to_numeric(df_cleaned["upVotes"], errors="coerce").fillna(0).astype(int)
df_cleaned["upVoteRatio"] = pd.to_numeric(df_cleaned["upVoteRatio"], errors="coerce").fillna(0)

# Filling missing values in text columns with empty strings
df_cleaned["body"] = df_cleaned["body"].fillna("")
df_cleaned["title"] = df_cleaned["title"].fillna("")
df_cleaned["communityName"] = df_cleaned["communityName"].fillna("")
df_cleaned["username"] = df_cleaned["username"].fillna("Unknown")

# Display the cleaned dataset info and first few rows
df_cleaned.info(), df_cleaned.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype              
---  ------         --------------  -----              
 0   body           1000 non-null   object             
 1   title          1000 non-null   object             
 2   communityName  1000 non-null   object             
 3   createdAt      1000 non-null   datetime64[ns, UTC]
 4   dataType       1000 non-null   object             
 5   upVotes        1000 non-null   int64              
 6   upVoteRatio    1000 non-null   float64            
 7   url            1000 non-null   object             
 8   username       1000 non-null   object             
dtypes: datetime64[ns, UTC](1), float64(1), int64(1), object(6)
memory usage: 70.4+ KB


(None,
                                                 body  \
 0                                                      
 1  I'm sure many people here would have seen the ...   
 2  I actually value Apple's approach to making ev...   
 3  There will never be decent AI on any of these ...   
 4  There are better and better , tinier and tinie...   
 
                                                title communityName  \
 0                r/Apple: Unofficial Apple Community                 
 1  OpenAI's new image generation model is what Ge...       r/apple   
 2                                                          r/apple   
 3                                                          r/apple   
 4                                                          r/apple   
 
                   createdAt   dataType  upVotes  upVoteRatio  \
 0 2008-01-25 03:43:06+00:00  community        0         0.00   
 1 2025-03-27 18:54:22+00:00       post        0         0.34   
 2 2025-03-28 12:33:58+00:

In [5]:
import pandas as pd
import re

# Load dataset
file_path = "dataset_reddit-scraper-task_2025-03-28_19-30-24-130.csv"
df = pd.read_csv(file_path)

# Keep only relevant columns
columns_to_keep = ["body", "title", "communityName", "createdAt", "dataType", "upVotes", "upVoteRatio", "username"]
df = df[columns_to_keep]

# Convert createdAt to datetime format
df["createdAt"] = pd.to_datetime(df["createdAt"], errors="coerce")

# Convert numerical columns
df["upVotes"] = pd.to_numeric(df["upVotes"], errors="coerce").fillna(0).astype(int)
df["upVoteRatio"] = pd.to_numeric(df["upVoteRatio"], errors="coerce").fillna(0.0).astype(float)

# Fill missing text values with empty strings
text_columns = ["body", "title", "communityName", "username"]
df[text_columns] = df[text_columns].fillna("")

# Remove URLs from 'body' and 'title'
url_pattern = r"http[s]?://\S+"
df["body"] = df["body"].apply(lambda x: re.sub(url_pattern, "", x))
df["title"] = df["title"].apply(lambda x: re.sub(url_pattern, "", x))

# Save cleaned dataset
cleaned_file_path = "cleaned_reddit_dataset.csv"
df.to_csv(cleaned_file_path, index=False)

# Display sample rows
df.head()


Unnamed: 0,body,title,communityName,createdAt,dataType,upVotes,upVoteRatio,username
0,,r/Apple: Unofficial Apple Community,,2008-01-25 03:43:06+00:00,community,0,0.0,
1,I'm sure many people here would have seen the ...,OpenAI's new image generation model is what Ge...,r/apple,2025-03-27 18:54:22+00:00,post,0,0.34,krikrija
2,I actually value Apple's approach to making ev...,,r/apple,2025-03-28 12:33:58+00:00,comment,12,0.0,precipiceblades
3,There will never be decent AI on any of these ...,,r/apple,2025-03-28 13:48:10+00:00,comment,2,0.0,sherbert-stock
4,"There are better and better , tinier and tinie...",,r/apple,2025-03-28 15:25:45+00:00,comment,1,0.0,MrBread134


In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingRegressor
from textblob import TextBlob

# Load data
df = pd.read_csv("dataset_reddit-scraper-task_2025-03-28_19-30-24-130.csv")

# Drop rows with NaN in target variable
df = df.dropna(subset=["upVotes"])

# Handle missing values
df['body'] = df['body'].fillna('')
df['imageUrls/0'] = df['imageUrls/0'].fillna('')
df['createdAt'] = pd.to_datetime(df['createdAt'])

# Extract datetime features
df['hour_of_day'] = df['createdAt'].dt.hour
df['day_of_week'] = df['createdAt'].dt.dayofweek

# Derived features
df["has_image"] = df["imageUrls/0"].apply(lambda x: 1 if x != '' else 0)
df['text_length'] = df['body'].apply(len)

# Cyclical encoding for time features
df['hour_sin'] = np.sin(2 * np.pi * df['hour_of_day'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour_of_day'] / 24)
df['day_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
df['day_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)

# Sentiment analysis with error handling
def get_sentiment(text):
    try:
        return TextBlob(str(text)).sentiment.polarity
    except:
        return 0  # Neutral sentiment for errors

df['sentiment'] = df['body'].apply(get_sentiment)

# Outlier removal (removing top 1% of upvotes)
upper_bound = df["upVotes"].quantile(0.99)
df = df[df["upVotes"] <= upper_bound]

# Log-transform the target variable to reduce variance
df["log_upVotes"] = np.log1p(df["upVotes"])  # log(1 + upVotes) avoids log(0)
y = df["log_upVotes"]

# Mean encoding for subreddit (communityName)
subreddit_avg_upvotes = df.groupby("communityName")["upVotes"].mean()
df["community_mean_upvotes"] = df["communityName"].map(subreddit_avg_upvotes)

# Define features
X = df[["body", "hour_sin", "hour_cos", "day_sin", "day_cos", 
        "has_image", "text_length", "sentiment", "community_mean_upvotes"]]

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ("text", TfidfVectorizer(max_features=500, stop_words='english'), "body"),  # Reduced TF-IDF features
        ("num", StandardScaler(), ["hour_sin", "hour_cos", "day_sin", "day_cos", 
                                   "text_length", "sentiment", "community_mean_upvotes"]),
    ],
    remainder='drop'
)

# Gradient Boosting Model Pipeline
model = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42))
])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'regressor__n_estimators': [200, 500],
    'regressor__learning_rate': [0.05, 0.1],
    'regressor__max_depth': [3, 5, 7]
}

grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Get best model
best_model = grid_search.best_estimator_

# Predict on test set
predictions = best_model.predict(X_test)

# Convert log predictions back to original scale
predictions = np.expm1(predictions)  # Reverse log(1 + upVotes)

# Convert true values back to original scale
y_test_original = np.expm1(y_test)

# Calculate evaluation metrics
mse = mean_squared_error(y_test_original, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_original, predictions)

print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared Score: {r2}")

# Feature importance
feature_names = best_model.named_steps['preprocessor'].get_feature_names_out()
importances = best_model.named_steps['regressor'].feature_importances_

print("\nTop 10 Features:")
for name, importance in sorted(zip(feature_names, importances), key=lambda x: x[1], reverse=True)[:10]:
    print(f"{name}: {importance:.4f}")


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


Fitting 3 folds for each of 12 candidates, totalling 36 fits


ValueError: 
All the 36 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
36 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Hack-Nocturne-2025\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Hack-Nocturne-2025\venv\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "d:\Hack-Nocturne-2025\venv\lib\site-packages\sklearn\pipeline.py", line 662, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "d:\Hack-Nocturne-2025\venv\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "d:\Hack-Nocturne-2025\venv\lib\site-packages\sklearn\ensemble\_gb.py", line 658, in fit
    X, y = validate_data(
  File "d:\Hack-Nocturne-2025\venv\lib\site-packages\sklearn\utils\validation.py", line 2961, in validate_data
    X, y = check_X_y(X, y, **check_params)
  File "d:\Hack-Nocturne-2025\venv\lib\site-packages\sklearn\utils\validation.py", line 1387, in check_X_y
    y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
  File "d:\Hack-Nocturne-2025\venv\lib\site-packages\sklearn\utils\validation.py", line 1397, in _check_y
    y = check_array(
  File "d:\Hack-Nocturne-2025\venv\lib\site-packages\sklearn\utils\validation.py", line 1107, in check_array
    _assert_all_finite(
  File "d:\Hack-Nocturne-2025\venv\lib\site-packages\sklearn\utils\validation.py", line 120, in _assert_all_finite
    _assert_all_finite_element_wise(
  File "d:\Hack-Nocturne-2025\venv\lib\site-packages\sklearn\utils\validation.py", line 169, in _assert_all_finite_element_wise
    raise ValueError(msg_err)
ValueError: Input y contains NaN.
