In [None]:
# Install transformers straight from the GitHub repository (no release is pinned).
%pip install git+https://github.com/huggingface/transformers.git
# Pillow provides PIL.Image; redvid provides the Reddit video Downloader used below.
%pip install Pillow redvid
# PyTorch packages are pulled from the cu118 wheel index.
# NOTE(review): none of these installs pin a version, so a re-run may resolve different packages.
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
from transformers import AutoProcessor, Blip2ForConditionalGeneration
import torch

# Load the processor and the model from the same "Salesforce/blip2-opt-2.7b" checkpoint.
processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
# Weights are loaded as float16; generate_blip2 casts its inputs to float16 as well.
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16)


In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Move the model to the chosen device; `device` is read again by generate_blip2.
model.to(device)


In [16]:
import os
import subprocess
from urllib.parse import parse_qs, urlparse

import requests
from PIL import Image
from redvid import Downloader


def isMediaDomain(url):
  """Tell whether `url` points at one of the hosts listed in `mediaDomains`.

  Args:
    url: the post URL to test.

  Returns:
    True when any `mediaDomains` key occurs as a substring of `url`, or when
    `url` starts with 'self.' (checked once per key, so it only takes effect
    while the table is non-empty); False otherwise.
  """
  # NOTE(review): a 'self.' URL is reported as media here - confirm that is intended.
  return any(
      knownDomain in url or url.startswith('self.')
      for knownDomain in mediaDomains
  )

# Scratch file that downloaded videos are written to.
video_input_file = 'video.mp4'


# Scratch file that downloaded images are written to.
img_output_file = 'image.jpg'
# Frames extracted from videos are written to the same path as downloaded images.
video_output_file = img_output_file#'video.jpg'

def downloadRedditVideo(url):
  """Download a Reddit-hosted video with redvid and return a frame from it.

  The video is saved as `video_input_file` in the current directory and then
  handed to extractFrameFromVideo().

  Args:
    url: URL given to redvid's Downloader.

  Returns:
    The PIL image returned by extractFrameFromVideo().
  """
  # Remove a stale download first so a failed run cannot reuse the previous video.
  # Done in Python rather than by spawning `rm`, which is not available everywhere.
  if os.path.exists(video_input_file):
    os.remove(video_input_file)

  reddit = Downloader(max_q=True)
  reddit.log = False
  reddit.url = url
  reddit.path = "./"
  reddit.filename = video_input_file
  reddit.download()
  print("Downloaded " + video_input_file)

  # extractFrameFromVideo already opens the extracted frame; opening the file a
  # second time would only leave the first handle dangling.
  return extractFrameFromVideo()

def extractFrameFromVideo():
  """Extract the frame at the midpoint of `video_input_file` into `video_output_file`.

  ffprobe reads the video duration and ffmpeg writes the single frame.

  Returns:
    The PIL image opened from `video_output_file`.

  Raises:
    subprocess.CalledProcessError: if ffprobe exits with a non-zero status.
    FileNotFoundError: if ffmpeg did not write the output frame.
  """
  input_file = video_input_file
  output_file = video_output_file

  # Remove a stale frame so a failed extraction cannot return the previous image.
  if os.path.exists(output_file):
    os.remove(output_file)

  ffprobe_cmd = ['ffprobe', '-i', input_file, '-show_entries', 'format=duration',
                 '-v', 'quiet', '-of', 'csv=p=0']
  duration = float(subprocess.check_output(ffprobe_cmd).strip())
  middle_time = duration / 2

  # extract the frame
  ffmpeg_cmd = ['ffmpeg', '-i', input_file, '-ss', str(middle_time),
                '-vframes', '1', '-q:v', '2', output_file]
  exit_code = subprocess.call(ffmpeg_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
  # ffmpeg's output is silenced, so report a failed extraction explicitly
  # instead of announcing success and failing later inside Image.open.
  if not os.path.exists(output_file):
    raise FileNotFoundError(
        "ffmpeg (exit code " + str(exit_code) + ") wrote no frame from "
        + input_file + " to " + output_file)
  print("Extracted frame from " + input_file + " to " + output_file)

  image = Image.open(output_file)
  return image


def generate_blip2(image, context=None):
  """Generate text for `image` with the loaded BLIP-2 model.

  Args:
    image: the image passed to the processor.
    context: optional text prompt; when falsy only the image is passed.

  Returns:
    The first decoded sequence, stripped of surrounding whitespace.
  """
  processor_kwargs = {"images": image, "return_tensors": "pt"}
  if context:
    processor_kwargs["text"] = context

  # Inputs are moved to the model's device and cast to float16.
  model_inputs = processor(**processor_kwargs).to(device, torch.float16)
  output_ids = model.generate(**model_inputs, max_new_tokens=100)
  decoded = processor.batch_decode(output_ids, skip_special_tokens=True)
  return decoded[0].strip()

def downloadVideo(url):
  """Download a video and return a frame extracted from it.

  Reddit-hosted videos are delegated to downloadRedditVideo(); anything else is
  fetched over HTTP into `video_input_file` and passed to extractFrameFromVideo().

  Args:
    url: URL of the video.

  Returns:
    The PIL image returned by the frame extraction.

  Raises:
    requests.exceptions.RequestException: on a network failure, a timeout or an
      HTTP error status.
  """
  # 'v.redd.it' links do not contain the substring 'reddit', so test for them too.
  if 'reddit' in url or 'v.redd.it' in url:
    return downloadRedditVideo(url)

  # Without a timeout a stalled server blocks forever; without the status check
  # an HTTP error page would be written to disk as if it were the video.
  response = requests.get(url, timeout=30)
  response.raise_for_status()
  with open(video_input_file, 'wb') as handler:
    handler.write(response.content)

  return extractFrameFromVideo()


def downloadImage(url):
  """Download an image to `img_output_file` and open it.

  URLs with a '.gifv' extension are rewritten to '.mp4' and handled by
  downloadVideo() instead.

  Args:
    url: URL of the image.

  Returns:
    The PIL image opened from `img_output_file`, or the frame returned by
    downloadVideo() for '.gifv' links.

  Raises:
    requests.exceptions.RequestException: on a network failure, a timeout or an
      HTTP error status.
  """
  print("Downloading " + url)
  # Match the extension including its dot, so an id that merely contains
  # 'gifv' is neither rerouted nor rewritten.
  if '.gifv' in url:
    # change to mp4
    url = url.replace('.gifv', '.mp4')
    return downloadVideo(url)

  # Without a timeout a stalled server blocks forever; without the status check
  # an HTTP error page would be saved as if it were the image.
  response = requests.get(url, timeout=30)
  response.raise_for_status()
  with open(img_output_file, 'wb') as handler:
    handler.write(response.content)

  image = Image.open(img_output_file)
  return image


def downloadYoutubeThumbnail(url):
  """Download the 'maxresdefault' thumbnail of a YouTube video.

  Args:
    url: a YouTube link, either a watch URL carrying a `v` query parameter or a
      link whose last path segment is the video id (e.g. a youtu.be short link).

  Returns:
    The image returned by downloadImage() for the thumbnail URL.
  """
  parsed = urlparse(url)
  video_ids = parse_qs(parsed.query).get('v')
  if video_ids:
    # Prefer the `v` parameter: substring tests for 'youtu.be' also match watch
    # URLs such as '...watch?v=ID&feature=youtu.be' and would yield 'watch'.
    youtube_id = video_ids[0]
  else:
    # No `v` parameter: the id is the last path segment. urlparse has already
    # separated the query string and the fragment from the path.
    youtube_id = parsed.path.rstrip('/').split('/')[-1]
  print("youtube_id", youtube_id)
  thumbnail_url = 'https://img.youtube.com/vi/' + youtube_id + '/maxresdefault.jpg'
  print("thumbnail_url", thumbnail_url)
  return downloadImage(thumbnail_url)

def downloadRedditMedia(domain, url):
  """Fetch the media of a post through the matching `mediaDomains` handler.

  Args:
    domain: the post's domain; looked up in `mediaDomains` first.
    url: the post's URL; passed to the handler, and used as a fallback match
      when `domain` itself is not a key of `mediaDomains`.

  Returns:
    Whatever the selected handler returns, or None when no handler matches.
  """
  # Start from a clean slate so a failed download cannot reuse an old file.
  # Done in Python rather than by spawning `rm`, which is not available everywhere.
  for stale_file in (img_output_file, video_input_file):
    if os.path.exists(stale_file):
      os.remove(stale_file)

  handler = mediaDomains.get(domain)
  if handler is None:
    # isMediaDomain accepts a post when a key is a substring of its URL, so use
    # the same test here; otherwise e.g. an 'm.youtube.com' post would pass
    # that filter and still end up without a handler.
    handler = next((func for key, func in mediaDomains.items() if key in url), None)
  if handler is None:
    return None

  # call the right function based on domain
  return handler(url)

def classifyMedia(url):
  """Label a post URL as "text", "image", "video" or "article".

  Args:
    url: the post URL.

  Returns:
    "text" when `url` starts with 'self.'; otherwise "image" when the first
    `mediaDomains` key found in `url` maps to downloadImage, "video" when it
    maps to any other handler, and "article" when no key matches.
  """
  # A 'self.' URL used to take the label of the first table entry ("image");
  # report it as "text" instead.
  if url.startswith('self.'):
    return "text"

  for mediaDomain, handler in mediaDomains.items():
    if mediaDomain in url:
      # Everything that is not fetched by downloadImage counts as video.
      return "image" if handler is downloadImage else "video"
  return "article"

# Dispatch table: host fragment -> function that takes the post URL and fetches its media.
# downloadRedditMedia looks a post's domain up in this table; isMediaDomain and
# classifyMedia test the keys as substrings of the post URL.
mediaDomains = {
    "i.redd.it": downloadImage,
    "i.reddituploads.com": downloadImage,
    "v.redd.it": downloadRedditVideo,
    "i.imgur.com": downloadImage,
    "youtu.be": downloadYoutubeThumbnail,
    "youtube.com": downloadYoutubeThumbnail,
  }


In [None]:
# Load image.jpg from the working directory.
image = Image.open('image.jpg')
# Caption it with a question/answer prompt; the prompt ends on an open "Answer: "
# for the model to complete.
generate_blip2(image, "Question: What is the title of this picture? Answer: german riot police defeated and humiliated by some kind of mud wizard. Question: What is happening? Answer: ")

## Load subreddits

In [17]:
import glob

# Collect every JSON file directly under ./data; these are the inputs of getRedditSubreddits.
json_files = glob.glob('./data/*.json')
print(json_files)

import json

def getRedditSubreddits(json_files):
  """Load JSON post dumps and key them by subreddit name.

  Args:
    json_files: iterable of paths to JSON files. Each file holds a mapping of
      post entries, and the first entry's 'subreddit' field names the file's
      subreddit.

  Returns:
    dict mapping subreddit name -> the parsed content of its file. Files whose
    mapping is empty are skipped.
  """
  subreddits = {}
  for json_file in json_files:
    with open(json_file, encoding='utf-8') as f:
      data = json.load(f)
    if not data:
      # An empty dump has no entry to read the subreddit name from; skipping it
      # avoids the StopIteration that next(iter(data)) would raise.
      continue
    firstKey = next(iter(data))
    subredditName = data[firstKey]['subreddit']
    subreddits[subredditName] = data
  return subreddits

def saveredditSubreddits(subreddits, output_dir='./data/'):
  """Write each subreddit's posts to '<output_dir>/<name>_top_posts.json'.

  Args:
    subreddits: dict mapping subreddit name -> JSON-serialisable posts.
    output_dir: directory the files are written to; created when missing.
      Defaults to './data/'.
  """
  if output_dir:
    # A fresh checkout may not have the directory yet.
    os.makedirs(output_dir, exist_ok=True)
  for subreddit, posts in subreddits.items():
    target = os.path.join(output_dir, subreddit + '_top_posts.json')
    with open(target, 'w', encoding='utf-8') as outfile:
      json.dump(posts, outfile, indent=4)

# Load every dump into memory, keyed by subreddit name; later cells modify these posts in place.
subreddits = getRedditSubreddits(json_files)

print("subreddits", subreddits.keys())


['./data/facepalm_top_posts.json', './data/announcements_top_posts.json', './data/DunderMifflin_top_posts.json', './data/wholesomememes_top_posts.json', './data/science_top_posts.json', './data/BikiniBottomTwitter_top_posts.json', './data/freefolk_top_posts.json', './data/MadeMeSmile_top_posts.json', './data/StarWarsBattlefront_top_posts.json', './data/books_top_posts.json', './data/tifu_top_posts.json', './data/nextfuckinglevel_top_posts.json', './data/EarthPorn_top_posts.json', './data/gifs_top_posts.json', './data/AmItheAsshole_top_posts.json', './data/todayilearned_top_posts.json', './data/me_irl_top_posts.json', './data/WatchPeopleDieInside_top_posts.json', './data/thanosdidnothingwrong_top_posts.json', './data/Showerthoughts_top_posts.json', './data/Wellthatsucks_top_posts.json', './data/wallstreetbets_top_posts.json', './data/nottheonion_top_posts.json', './data/comics_top_posts.json', './data/PewdiepieSubmissions_top_posts.json', './data/awfuleverything_top_posts.json', './data

## Download images and caption them

In [37]:
# Caption every media post that has no text yet and store the caption as its text.
for subreddit_name, subreddit in subreddits.items():
  print("subreddit", subreddit_name)
  for post_id, post in subreddit.items():
    # Only posts with a media URL and no text are captioned.
    isValidUrlPost = post['url'] is not None and post['text'] is None and isMediaDomain(post['url'])
    if not isValidUrlPost:
      continue

    print(post['url'])
    print(post['title'])
    try:
      image = downloadRedditMedia(post['domain'], post['url'])
      if image is None:
        # No handler for this domain; there is nothing to caption.
        print("skipped, no downloader for domain", post['domain'])
        continue
      caption = generate_blip2(image)
    except (OSError, subprocess.CalledProcessError) as error:
      # Captions are only saved after the loop, so one failed download or an
      # unreadable file must not abort the whole run.
      print("skipped,", type(error).__name__, error)
      continue
    print(caption)
    # save caption
    post['text'] = caption


saveredditSubreddits(subreddits)
    

subreddit facepalm
subreddit announcements
subreddit DunderMifflin
https://i.imgur.com/KXLKw9W.jpg
Custom background for your Zoom and Teams calls. You’re welcome.
Downloading https://i.imgur.com/KXLKw9W.jpg
person
https://i.redd.it/sr47ivysjdm71.jpg
All in the feels
Downloading https://i.redd.it/sr47ivysjdm71.jpg
steve jobs and steve wozniak
https://i.redd.it/kdhjymipgyo71.jpg
Mindy Kaling shuttin’ ‘em down
Downloading https://i.redd.it/kdhjymipgyo71.jpg
the office gifs twitter
subreddit wholesomememes
https://i.redd.it/eihhjg3veeo31.jpg
What a considerate man
Downloading https://i.redd.it/eihhjg3veeo31.jpg
a man is sitting in a gas station and he's talking to his friend
https://i.redd.it/l9kr8x9xmmm41.jpg
Wholesome meeting from Tumblr
Downloading https://i.redd.it/l9kr8x9xmmm41.jpg
two men in a store with food on the shelves
https://i.redd.it/jr4p03glkb061.jpg
As real as it gets : )
Downloading https://i.redd.it/jr4p03glkb061.jpg
a cat and a man with a beard
https://i.redd.it/02bebmi

## Classify each post as image, video, article, or text

In [18]:
# Add a ['classification'] field to each post:
# posts with a URL get the label returned by classifyMedia, posts without a URL get 'text'.

for subreddit_name, subreddit in subreddits.items():
  print("subreddit", subreddit_name)
  for post_id, post in subreddit.items():
    if post['url'] is None:
      # Without a URL there is nothing for classifyMedia to inspect.
      post['classification'] = 'text'
      continue
    post['classification'] = classifyMedia(post['url'])
    print(post['classification'], post['url'], post['text'])

saveredditSubreddits(subreddits)

subreddit facepalm
image https://i.redd.it/8l6e0krhdc251.png a man is being held by a police officer in front of a crowd
video https://v.redd.it/udmx4ur0y7ca1 a man in a hooded robe is being pulled out of the mud
image https://i.redd.it/6mvpynwq0e1b1.png i had an ankle fracture surgery cancel this morning because his guy couldn't walk
video https://v.redd.it/l60f8aftxqv81 a man in a suit and tie sitting at a desk
video https://v.redd.it/hh4izuoxm11b1 a woman is looking at a laptop in a store
image https://i.redd.it/ypuj1e6u52f81.jpg a twee with a woman and a man on it
image https://i.redd.it/2hb9rutko0n61.jpg my father in law believes a youtube video that two doctors told him vaccines change your dna how is your weekend going?
image https://i.redd.it/omyb4g6wwby41.jpg a screenshot of facebook messages showing different people
image https://i.redd.it/7rks4qr3asg51.png the economist is viewed favourably by 99% of americans despite the fact that it loses money every year
video https://v.r

In [7]:
# Build one instruction-tuning record per post and write them to reddit.jsonl.
jsonl = []
for subreddit_name, subreddit in subreddits.items():
  for post_id, post in subreddit.items():
    _instruction = f"You are a Reddit user. \nYou want to post a submission on the subreddit /r/{subreddit_name}"
    _instruction = f"<<SYS>>\n{_instruction}\n<</SYS>>\n\n"
    _instruction = f"{_instruction}[INST] Remember you are a Reddit user. People may refer to you as OP or Original Poster. [/INST]"
    _input = f"OP: Title: {post['title']} \n Submission:"

    # A post without text must not end up as the literal string "None" in the
    # training text, so a missing text becomes an empty output.
    # NOTE(review): consider dropping such records instead - confirm with the training setup.
    _output = post['text'] or ''
    # if post['text'] doesn't start with \n then add it
    if _output and not _output.startswith('\n'):
      _output = '\n' + _output

    jsonl.append({
        "instruction": _instruction,
        "input": _input,
        "output": _output,
        "text": f"<s> {_instruction} {_input} {_output} </s>"
    })

with open('reddit.jsonl', 'w', encoding='utf-8') as outfile:
  outfile.writelines(json.dumps(entry) + '\n' for entry in jsonl)

    