In [1]:
import os
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import praw

from transformers import AutoTokenizer, pipeline

from thefuzz import fuzz,process
from word2number import w2n



In [2]:
# Load the Reddit API credentials from a JSON file kept one directory above the notebook.
with open('../reddit_api.json') as json_file:
    reddit_api_credentials = json.load(json_file)

# Build the client from the app credentials only (no username/password is supplied).
reddit_read_only = praw.Reddit(
    client_id=reddit_api_credentials['client_id'],
    client_secret=reddit_api_credentials['secret'],
    user_agent=reddit_api_credentials['user_agent'],
)

# Handle on the AskDocs subreddit.
subreddit = reddit_read_only.subreddit("AskDocs")

In [3]:
# Token-classification ("ner") pipeline for ages; it is given an explicitly
# loaded distilbert-base-cased tokenizer instead of one resolved from the model.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
age_extractor = pipeline(
    "ner",
    model="../models/age_token_classification",
    tokenizer=tokenizer,
)

# The two text classifiers use the same task and the same truncation/padding
# settings; only the model directory differs.
gender_extractor = pipeline(
    "text-classification",
    model="../models/gender_training",
    truncation=True,
    padding=True,
)
subject_extractor = pipeline(
    "text-classification",
    model="../models/subject_training",
    truncation=True,
    padding=True,
)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [4]:
def resolve_age(age_extracts):
    """Resolve the age tagger's entities for one text into a single age value.

    Args:
        age_extracts: List of entity dicts for a single text. Each dict is
            read through its ``'entity'`` tag (``'B-age'``, ``'I-age'`` or
            ``'B-age_unit'``) and its ``'word'`` text.

    Returns:
        ``int(word)`` when the first age token is made of digits, otherwise
        the result of ``w2n.word_to_num(word)``. ``None`` is returned when
        there is no age entity, when the first age token cannot be parsed
        either way, or when the first unit entity is closer to ``'months'``
        than to ``'years'``.
    """
    unit_entities = [item for item in age_extracts if item['entity'] == 'B-age_unit']
    age_entities = [item for item in age_extracts if item['entity'] in ('B-age', 'I-age')]

    if not age_entities:
        return None

    # With no unit entity the age is taken to be in years; ages whose unit
    # fuzzy-matches 'months' are discarded rather than converted.
    if unit_entities:
        resolved_unit = process.extract(
            unit_entities[0]['word'], ['years', 'months'], scorer=fuzz.ratio
        )[0][0]
        if resolved_unit == 'months':
            return None

    # Only the first age token is used.
    age_word = age_entities[0]['word']

    # Catch only parse failures so that KeyboardInterrupt, SystemExit and
    # genuine programming errors are no longer silently swallowed.
    try:
        return int(age_word)
    except (ValueError, TypeError):
        pass

    try:
        return w2n.word_to_num(age_word)
    except (ValueError, TypeError):
        return None

In [5]:
# Number of draws requested from the subreddit's random-post endpoint.
N_RANDOM_POSTS = 10

# subreddit.random() returns None when the subreddit does not support random
# posts, and separate draws can return the same submission. Missing draws are
# skipped and records are keyed by submission id so each post appears once;
# the sample may therefore hold fewer than N_RANDOM_POSTS rows.
random_posts_by_id = {}
for _ in range(N_RANDOM_POSTS):
    post = subreddit.random()
    if post is None:
        continue
    random_posts_by_id[post.id] = {
        "id": post.id,
        'post_text': f"{post.title}\n{post.selftext}",
        "score": post.score,
        'total_comments': post.num_comments,
        'post_url': post.url
    }

# random_posts and random_posts_df are built from the same records, so they
# stay aligned row for row. The explicit column list keeps the schema even
# when no post could be fetched.
random_posts = list(random_posts_by_id.values())
random_posts_df = pd.DataFrame(
    random_posts,
    columns=['id', 'post_text', 'score', 'total_comments', 'post_url'],
)

In [6]:
# Build the list of post texts once and feed the same list to all three models.
post_texts = [post['post_text'] for post in random_posts]

# Age: one list of token entities per post, reduced to a single value by resolve_age.
age_extracts = age_extractor(post_texts)
random_posts_df['resolved_age'] = list(map(resolve_age, age_extracts))

# Gender and subject: keep only the predicted label of each classification result.
for column_name, classifier in (
    ('resolved_gender', gender_extractor),
    ('resolved_subject', subject_extractor),
):
    random_posts_df[column_name] = [
        prediction['label'] for prediction in classifier(post_texts)
    ]

random_posts_df

Unnamed: 0,id,post_text,score,total_comments,post_url,resolved_age,resolved_gender,resolved_subject
0,157v9vu,Quick Question RE: interpretation of MRI image...,1,1,https://www.reddit.com/r/AskDocs/comments/157v...,36.0,Female,Other
1,157va8z,Is this Lyme or a spider bite/other?\n28 year ...,2,3,https://www.reddit.com/r/AskDocs/comments/157v...,28.0,Male,Self
2,157zo8z,"Mild abnormal uterine bleeding\n22f white, dia...",1,1,https://www.reddit.com/r/AskDocs/comments/157z...,22.0,Female,Other
3,157r418,Elevated lipase levels\n\n\nHi! I’m a 25 y/o F...,1,3,https://www.reddit.com/r/AskDocs/comments/157r...,25.0,Female,Other
4,158203x,Elective amputation?\n35F\n\nMy knee always hu...,1,1,https://www.reddit.com/r/AskDocs/comments/1582...,35.0,Female,Self
5,157vke2,"Mysterious Illness, please help.\nI am here ou...",1,1,https://www.reddit.com/r/AskDocs/comments/157v...,,Female,Self
6,15808b6,Itchy red spot the size of a quarter on my bac...,1,1,https://www.reddit.com/r/AskDocs/comments/1580...,26.0,Female,Other
7,157yd7j,Post Microdisectomy recommendations\nHi there!...,1,1,https://www.reddit.com/r/AskDocs/comments/157y...,32.0,Female,Self
8,1581iaz,"Dulling pain in abdomen\nHello,\n\nFor 2 month...",1,1,https://www.reddit.com/r/AskDocs/comments/1581...,,Female,Other
9,157x9fh,Arm pain near bicep and middle of arm (vein) a...,1,1,https://www.reddit.com/r/AskDocs/comments/157x...,21.0,Female,Other


In [7]:
# Merge this run's posts into the CSV of previously resolved posts and write
# the combined table back to the same file.
data_dir = '../data/'
resolved_posts_path = os.path.join(data_dir, 'resolved_random_posts.csv')

# Create the data directory on first use; listing or writing into a missing
# directory would raise FileNotFoundError.
os.makedirs(data_dir, exist_ok=True)

if os.path.isfile(resolved_posts_path):
    # Stored rows come first and keep='last' is used, so a post fetched again
    # in this run replaces its stored row. The index is rebuilt so the merged
    # frame has no repeated row labels.
    random_posts_df = (
        pd.concat(
            [pd.read_csv(resolved_posts_path), random_posts_df],
            ignore_index=True,
        )
        .drop_duplicates(subset='id', keep='last')
        .reset_index(drop=True)
    )

random_posts_df.to_csv(resolved_posts_path, index=False)

In [9]:
# Display the current contents of random_posts_df as this cell's output.
random_posts_df

Unnamed: 0,id,post_text,score,total_comments,post_url,resolved_age,resolved_gender,resolved_subject
0,157zejc,Pain suspected to be related to femoral anteve...,1,1,https://www.reddit.com/r/AskDocs/comments/157z...,19.0,Female,Other
1,1581gd3,Weird rash\nI (F26) broke out into this rash a...,1,2,https://www.reddit.com/r/AskDocs/comments/1581...,,Female,Other
2,157xtiz,Swollen lymph nodes in neck ultrasound inconcl...,1,1,https://www.reddit.com/r/AskDocs/comments/157x...,44.0,Male,Other
3,157yc3r,"Was given an iron infusion in- hospital, and f...",5,7,https://www.reddit.com/r/AskDocs/comments/157y...,37.0,Female,Other
4,157z5bl,"Unknown Neurological Problem\nHello,\n\nTo sta...",3,3,https://www.reddit.com/r/AskDocs/comments/157z...,34.0,Female,Other
5,15807s3,How long will I be unable to hear?\nI am 18F I...,3,2,https://www.reddit.com/r/AskDocs/comments/1580...,18.0,Female,Other
6,157u6qp,"Vertigo\n42 M, 6'0"" 180 lbs\nNon smoker, non d...",1,1,https://www.reddit.com/r/AskDocs/comments/157u...,42.0,Female,Self
7,157xf3x,Crackling sound under right ear when I puff up...,1,1,https://www.reddit.com/r/AskDocs/comments/157x...,13.0,Male,Self
8,157r5lm,red and bruised nose after starting to wear gl...,4,1,https://www.reddit.com/r/AskDocs/comments/157r...,24.0,Male,Self
9,157y3ix,High Iron/Iron Saturation: how bad would the s...,3,1,https://www.reddit.com/r/AskDocs/comments/157y...,39.0,Male,Self
