In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import os
os.chdir('/content/drive/My Drive/Projects/reddit-vote-predictor')
import multiprocessing as mp
import sys
import shutil as sh
import pandas as pd
import numpy as np
import statsmodels.api as sm
import datetime as dt
import matplotlib.pyplot as plt
from scipy import stats as s
from skimage import io
from mpl_toolkits.axes_grid1 import ImageGrid
import datetime as dt
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from torchvision import models
# For object detection model:
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from PIL import Image, ImageDraw
# For loading images from URL:
import requests
from io import BytesIO
from tqdm import tqdm

In [3]:
print(torch.cuda.is_available())  # Should print True
print(torch.cuda.get_device_name(0))  # Prints the GPU name

True
Tesla T4


#### Read in data

The previous scripts did the following
1. Scrape posts from a given subreddit (in this case r/dog and r/cat) within a given time frame
2. Delete posts not containing an image as the `post_hint`
3. Then use Reddit's API to pull additional data, such as `upvote_ratio`
4. Delete posts that have one upvote since this is indistinguishable from posts that received the default one upvote upon posting and nothing else.
5. Delete posts for which the URL links to an image that cannot be found
6. Delete posts for which an object-detection model cannot locate a dog (or cat).


In [4]:
dog = pd.read_csv('data/tbl/dog-final-data_upvote-ratio-model.csv', index_col = None)
cat = pd.read_csv('data/tbl/cat-final-data_upvote-ratio-model.csv', index_col = None)
# Make sure x and y are int
dog[['x', 'y']] = dog[['x', 'y']].applymap(int)
cat[['x', 'y']] = cat[['x', 'y']].applymap(int)
print(dog.shape)
print(cat.shape)

(2787, 29)
(4314, 29)


## Upvote Ratio Model Training


#### Preprocessing via Dataloader

1. Load image from URL
2. Check color format and convert to RGB if necessary
3. Crop size 224 from the center of the bounding box (if not possible obtain same size area from the border of image)
4. Resize to 224, 224
5. Convert to tensor

In [5]:
def download_img(url_):
  try:
    response = requests.get(url_)
    image = Image.open(BytesIO(response.content))
    return image
  except requests.RequestException:
    # Handle network errors here
    raise Exception(f"Error downloading image from {url_}")

def ensure_RGB(image_):
  if image_.mode != 'RGB':
    image_ = image_.convert('RGB')
  return image_

def crop_image(image_, x, y, output_size = (224, 224)):
  # Calculate the crop boundaries
  width, height = image_.size
  left = max(0, x - output_size[0] // 2)
  top = max(0, y - output_size[1] // 2)
  right = min(width, x + output_size[0] // 2)
  bottom = min(height, y + output_size[1] // 2)
  return image_.crop((left, top, right, bottom))


# The rest of the transformations use torch.transforms
torch_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

In [7]:
# Custom Dataset using image URLs
class CustomUrlDataset(Dataset):
    def __init__(self, URLs, targets, crop_Xs, crop_Ys, transform = None):
        self.img_urls = URLs
        self.X = crop_Xs
        self.Y = crop_Ys
        self.targets = targets
        self.transform = transform

    def __len__(self):
        return len(self.img_urls)

    def __getitem__(self, idx):
      # Get image-specific data
      img_url = self.img_urls[idx]
      x = self.X[idx]
      y = self.Y[idx]
      target = self.targets[idx]

      # Download the image
      image = download_img(img_url)

      # ensure color format is RGB
      image = ensure_RGB(image)

      # Crop around the target object
      image = crop_image(image, x, y)

      # Apply the pytorch transforms if supplied
      if self.transform:
          image = self.transform(image)

      return image, target

In [19]:
import time
# Download all the images in advance to avoid unexpected connection errors
for i in tqdm(range(dog.shape[0])):
  _id_ = dog.id[i]
  _url_ = dog.FEATURE[i]
  try:
    _resp_ = requests.get(_url_)
    _img_ = Image.open(BytesIO(_resp_.content))
    _img_ = ensure_RGB(_img_)
  except ConnectionResetError:
    time.sleep(5)
    # retry
    _resp_ = requests.get(_url_)
    _img_ = Image.open(BytesIO(_resp_.content))
    _img_ = ensure_RGB(_img_)
  _path_ = os.path.join('data/img/upvote_ratio_final/dog', str(_id_) + '.jpeg')
  # Ensure the save directory exists
  # Save the image as a JPG file
  _img_.save(_path_, "JPEG")



100%|██████████| 2787/2787 [10:36<00:00,  4.38it/s]
100%|██████████| 4314/4314 [18:35<00:00,  3.87it/s]


In [None]:
del _id_, _url_, _resp_, _img_, _path_
for i in tqdm(range(cat.shape[0])):
  _id_ = cat.id[i]
  _url_ = cat.FEATURE[i]
  try:
    _resp_ = requests.get(_url_)
    _img_ = Image.open(BytesIO(_resp_.content))
    _img_ = ensure_RGB(_img_)
  except ConnectionResetError:
    time.sleep(5)
    # retry
    _resp_ = requests.get(_url_)
    _img_ = Image.open(BytesIO(_resp_.content))
    _img_ = ensure_RGB(_img_)
  _path_ = os.path.join('data/img/upvote_ratio_final/cat', str(_id_) + '.jpeg')
  # Ensure the save directory exists
  # Save the image as a JPG file
  _img_.save(_path_, "JPEG")


In [6]:
def read_img(filename):
  return Image.open(filename)

# Custom Dataset using image paths
class CustomImgDataset(Dataset):
  def __init__(self, image_directory, image_idx, targets, crop_Xs, crop_Ys, transform=None):
    # parse arguments
    self.img_dir = image_directory
    self.img_idx = image_idx
    self.X = crop_Xs
    self.Y = crop_Ys
    self.targets = targets
    self.transform = transform

  def __len__(self):
    return len(self.img_idx)

  def __getitem__(self, idx):
    # Get image-specific data
    file_path = os.path.join(self.img_dir, str(self.img_idx[idx]) + '.jpeg')
    x = self.X[idx]
    y = self.Y[idx]
    target = self.targets[idx]

    # Read the image from file
    image = read_img(file_path)

    # Ensure color format is RGB
    image = ensure_RGB(image) # likely redundant but just to be safe.

    # Crop around the target object
    image = crop_image(image, x, y)

    # Apply the PyTorch transforms if supplied
    if self.transform:
        image = self.transform(image)

    return image, target

#### Set model params for `r/dog`

In [9]:
## Data setup ##

data = dog # data frame (from global environment via pd.read_csv above)
FEATURE = 'FEATURE' # variable name of image url in data
TARGET = 'upvote_ratio' # variable name of target in data
#IMG_URL_VEC = data[FEATURE] # vector of feature url
IMG_DIR = 'data/img/upvote_ratio_final/dog'
IMG_ID_VEC = data['id']
TARGET_VEC = torch.tensor(data[TARGET].values, dtype=torch.float32) # target vector
X = data['x'] # vector of x coordinates of cropping center pixel
Y = data['y'] # vector of y coordinates of cropping center pixel

#-----------#


## GPU setup ##

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

#-----------#


## Data Loaders ##
# USING IMAGE DIRECTORY METHOD
dataset = CustomImgDataset(IMG_DIR, IMG_ID_VEC, TARGET_VEC, X, Y, torch_transforms)
batch_size = 32
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

#-----------#


## VGG-16 model ##

# Get the pretrained model
model = models.vgg16(pretrained=True)
# Freeze the pretrained layers
for param in model.parameters():
    param.requires_grad = False
# Unfreeze and modify the last layer for regression
model.classifier[6] = nn.Linear(4096, 1)
# Move the model to the GPU if available
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model.to(device)

#-----------#


## Loss Function and Optimizer ##

# Mean Squared Error (MSE) Loss
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

#-----------#




#### Train the model

In [10]:
## Training Loop ##

# Final parameter setup and logging
num_epochs = 100
#num_epochs = 1
print_every = 5
#print_every = 1

#-----------#

# Train the model
for epoch in tqdm(range(num_epochs)):
    model.train()
    for images, targets in train_loader:
        optimizer.zero_grad()
        images, targets = images.to(device), targets.to(device)  # Move data to GPU
        if images is None or targets is None:
          continue
        outputs = model(images)
        loss = criterion(outputs.squeeze(), targets)
        loss.backward()
        optimizer.step()
    #print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")
    if (epoch + 1) % print_every == 0:  # Print every 5 epochs
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

#-----------#

  5%|▌         | 5/100 [05:33<1:44:52, 66.24s/it]

Epoch [5/100], Loss: 0.0732


 10%|█         | 10/100 [10:50<1:35:13, 63.48s/it]

Epoch [10/100], Loss: 0.0140


 15%|█▌        | 15/100 [16:01<1:28:19, 62.35s/it]

Epoch [15/100], Loss: 0.0305


 20%|██        | 20/100 [21:23<1:25:36, 64.21s/it]

Epoch [20/100], Loss: 0.1183


 25%|██▌       | 25/100 [26:38<1:18:41, 62.96s/it]

Epoch [25/100], Loss: 0.2442


 30%|███       | 30/100 [31:54<1:13:26, 62.94s/it]

Epoch [30/100], Loss: 0.0585


 35%|███▌      | 35/100 [37:07<1:08:33, 63.29s/it]

Epoch [35/100], Loss: 0.0754


 40%|████      | 40/100 [42:33<1:04:37, 64.63s/it]

Epoch [40/100], Loss: 0.1364


 45%|████▌     | 45/100 [47:55<58:42, 64.05s/it]  

Epoch [45/100], Loss: 0.2009


 50%|█████     | 50/100 [53:09<52:31, 63.03s/it]

Epoch [50/100], Loss: 0.0190


 55%|█████▌    | 55/100 [58:22<46:45, 62.35s/it]

Epoch [55/100], Loss: 0.0770


 60%|██████    | 60/100 [1:03:37<42:14, 63.37s/it]

Epoch [60/100], Loss: 0.0482


 65%|██████▌   | 65/100 [1:09:03<37:47, 64.78s/it]

Epoch [65/100], Loss: 0.0104


 70%|███████   | 70/100 [1:14:19<31:37, 63.25s/it]

Epoch [70/100], Loss: 0.1371


 75%|███████▌  | 75/100 [1:19:33<26:08, 62.73s/it]

Epoch [75/100], Loss: 0.0234


 80%|████████  | 80/100 [1:24:36<20:15, 60.80s/it]

Epoch [80/100], Loss: 0.0026


 85%|████████▌ | 85/100 [1:29:46<15:33, 62.25s/it]

Epoch [85/100], Loss: 0.1663


 90%|█████████ | 90/100 [1:34:55<10:19, 61.94s/it]

Epoch [90/100], Loss: 0.0439


 95%|█████████▌| 95/100 [1:40:07<05:12, 62.50s/it]

Epoch [95/100], Loss: 0.0202


100%|██████████| 100/100 [1:45:21<00:00, 63.22s/it]

Epoch [100/100], Loss: 0.0985





Save model

In [11]:
torch.save(model.state_dict(), 'models/dog/vanilla_upvote_ratio_model.pth')

#### Evaluate on a test image

In [12]:
# Use the test image of the golden retriever with the one ear
model.eval()
test_image_url = 'https://media.cnn.com/api/v1/images/stellar/prod/200313124810-02-rae-golden-retriever.jpg'
try:
    test_image = torch_transforms(ensure_RGB(download_img(test_image_url))).unsqueeze(0)
except requests.RequestException:
    raise Exception(f"Error processing test image")

with torch.no_grad():
    test_image = test_image.to(device)  # Move test image to GPU
    predicted_target = model(test_image).item()

print(f"Predicted Target: {predicted_target:.4f}")

Predicted Target: 0.6965


### With r/cat (using a training and validation sample)

In [9]:
## Data setup ##
batch_size = 32
train = cat.sample(frac=0.8) # 80% in train
test = cat.drop(train.index) # The other 20% in test
# The __getitem__ method in the dataloader gets data based on dataframe index
# so reset it here so that indices 0:shape[0] correspond to the data in order
train = train.reset_index(drop = True)
test = test.reset_index(drop = True)
FEATURE = 'FEATURE' # variable name of image url in data
TARGET = 'upvote_ratio' # variable name of target in data
#IMG_URL_VEC = data[FEATURE] # vector of feature url
IMG_DIR = 'data/img/upvote_ratio_final/cat'
# Train data:
IMG_ID_VEC_train = train['id']
TARGET_VEC_train = torch.tensor(train[TARGET].values, dtype=torch.float32) # target vector
X_train = train['x'] # vector of x coordinates of cropping center pixel
Y_train = train['y'] # vector of y coordinates of cropping center pixel
# Repeat for test
IMG_ID_VEC_test = test['id']
TARGET_VEC_test = torch.tensor(test[TARGET].values, dtype=torch.float32) # target vector
X_test = test['x'] # vector of x coordinates of cropping center pixel
Y_test = test['y']

#-----------#


## GPU setup ##

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

#-----------#


## Data Loaders ##
# USING IMAGE DIRECTORY METHOD
train_dataset = CustomImgDataset(IMG_DIR, IMG_ID_VEC_train, TARGET_VEC_train, X_train, Y_train, torch_transforms)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Repeat for test
test_dataset = CustomImgDataset(IMG_DIR, IMG_ID_VEC_test, TARGET_VEC_test, X_test, Y_test, torch_transforms)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
#-----------#


## VGG-16 model ##

# Get the pretrained model
model = models.vgg16(pretrained=True)
# Freeze the pretrained layers
for param in model.parameters():
    param.requires_grad = False
# Unfreeze and modify the last layer for regression
model.classifier[6] = nn.Linear(4096, 1)
# Move the model to the GPU if available
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model.to(device)

#-----------#


## Loss Function and Optimizer ##

# Mean Squared Error (MSE) Loss
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

#-----------#



#### Train r/cat model

In [10]:
## Training Loop ##

# Final parameter setup and logging
num_epochs = 100
print_every = 5
#num_epochs = 1
#print_every = 1

#-----------#

# Train the model
for epoch in tqdm(range(num_epochs)):
    model.train()
    for images, targets in train_loader:
        optimizer.zero_grad()
        images, targets = images.to(device), targets.to(device)  # Move data to GPU
        if images is None or targets is None:
          continue
        outputs = model(images)
        loss = criterion(outputs.squeeze(), targets)
        loss.backward()
        optimizer.step()
    #print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")
    if (epoch + 1) % print_every == 0:  # Print every 5 epochs
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

#-----------#

100%|██████████| 1/1 [28:05<00:00, 1685.78s/it]

Epoch [1/1], Loss: 0.0509





Save model

In [29]:
torch.save(model.state_dict(), 'models/cat/vanilla_upvote_ratio_model.pth')

In [28]:
# Validate on test set
model.eval()

all_predictions = []

for images, targets in tqdm(test_loader):
    with torch.no_grad():
        images = images.to(device)  # Move test images to GPU if available
        outputs = model(images)
        batch_predictions = outputs.squeeze(dim=1).cpu().numpy()  # Convert to NumPy array
        all_predictions.extend(batch_predictions.tolist())


test['y_pred'] = np.array(all_predictions)

print(f"RMSE: {np.sqrt(np.mean((test.y_pred - test.upvote_ratio)**2))}")

RMSE: 0.20316930251549695


In [None]:
# Try: predict number of comments (or comments / num_subscribers).
# The former is correlated .45 with score, but not correlated with year.
# Note that year is correlated with num subscribers.
# After predicting num comments, a second model can predict score, or
# the NN can predict score / num_comments.
# The second model can also be an NLP model, or accont for other tabular data.

# And add early stopping