# Data Collection

## Imports

In [1]:
import pandas as pd
import datetime as dt
import time
import requests

## Scraping Function to Collect Data From Reddit

In [2]:
## Function Written by Mahdi Shadkam Farrokhi (General Assembly - NYC)
## Time window modified to scrape in 24 hour periods
## Data collected for 18 months

def query_pushshift(subreddit, kind = 'submission', hour_window = 24, n = 5):
    """Download posts for one subreddit from the Pushshift search API.

    Sends `n` GET requests to the `kind` search endpoint. Request `i`
    (1-based) asks for up to 500 items with `after` set to
    `hour_window * i` hours, and the function sleeps 3 seconds after every
    request. The returned pages are concatenated into one DataFrame.

    Parameters
    ----------
    subreddit : str
        Subreddit name; inserted into the query string as-is.
    kind : str, default 'submission'
        Endpoint name appended to the search URL. For 'submission' the
        result is narrowed to the SUBFIELDS columns, de-duplicated, and
        filtered to rows where `is_self` is True.
    hour_window : int, default 24
        Number of hours the `after` cutoff moves back with each request.
    n : int, default 5
        Number of requests to send. With `n <= 0` no request is made and
        an empty frame is returned.

    Returns
    -------
    pandas.DataFrame
        The collected rows plus a `timestamp` column holding the UTC
        calendar date of `created_utc`. Row labels are kept exactly as
        concatenation produces them, so they can repeat across pages.

    Raises
    ------
    AssertionError
        If any response has a status code other than 200.
    """
    SUBFIELDS = ['title', 'selftext', 'subreddit', 'created_utc', 'author', 'num_comments', 'score', 'is_self']

    # establish base url and stem
    BASE_URL = f"https://api.pushshift.io/reddit/search/{kind}" # also known as the "API endpoint"
    stem = f"{BASE_URL}?subreddit={subreddit}&size=500" # every request asks for 500 items

    # temp storage for the non-empty pages
    pages = []

    # NOTE(review): only `after` is sent, so each request covers everything
    # from its cutoff up to now and successive requests may return
    # overlapping rows (hence the de-duplication below). Confirm against the
    # Pushshift docs whether a matching `before` bound is wanted.
    for i in range(1, n + 1):
        URL = "{}&after={}h".format(stem, hour_window * i)
        response = requests.get(URL)
        if response.status_code != 200:
            # Explicit raise instead of `assert`: it survives `python -O` and
            # names the failing request, while keeping the exception type.
            raise AssertionError(
                "Pushshift request failed with status {}: {}".format(response.status_code, URL)
            )
        page = pd.DataFrame.from_dict(response.json()['data'])
        # A page without rows contributes nothing, so it is not stored.
        if not page.empty:
            pages.append(page)
        time.sleep(3) # pause between requests

    # pd.concat raises on an empty list, so fall back to an empty frame
    full = pd.concat(pages, sort=False) if pages else pd.DataFrame()

    # if submission
    if kind == "submission":
        # select desired columns (reindex also creates them when no rows
        # came back), then drop duplicates
        full = full.reindex(columns=SUBFIELDS).drop_duplicates()
        # select `is_self` == True; copy so the column assignment below
        # writes to an independent frame rather than a slice
        full = full.loc[full['is_self'] == True].copy()
    elif full.empty and "created_utc" not in full.columns:
        # nothing was collected: still provide the column `timestamp` is built from
        full["created_utc"] = pd.Series(dtype="float64")

    # create `timestamp` column; `created_utc` is an epoch value, so convert
    # it in UTC instead of the machine's local timezone
    full['timestamp'] = full["created_utc"].map(
        lambda ts: dt.datetime.fromtimestamp(ts, dt.timezone.utc).date()
    )

    print("Query Complete!")
    return full

## General Topics in Women's Health

### Subreddit 1: r/WomensHealth

In [3]:
# r/WomensHealth submissions: 540 requests, with the `after` cutoff moved
# back a further 24 hours on each one.
womens_health = query_pushshift(
    'WomensHealth',
    kind = 'submission',
    hour_window = 24,
    n = 540,
)

Query Complete!


### Subreddit 2: r/ObGyn

In [None]:
# r/obgyn submissions: 540 requests, with the `after` cutoff moved back a
# further 24 hours on each one.
obgyn = query_pushshift(
    'obgyn',
    kind = 'submission',
    hour_window = 24,
    n = 540,
)

### Subreddit 3: r/TheGirlSurvivalGuide

In [None]:
# r/thegirlsurvivalguide submissions: 540 requests, with the `after` cutoff
# moved back a further 24 hours on each one.
girl_survival_guide = query_pushshift(
    'thegirlsurvivalguide',
    kind = 'submission',
    hour_window = 24,
    n = 540,
)

### Combine Data Frames and Export to CSV
This cell is currently commented out to prevent overwriting of the collected data.

In [None]:
# Export step for the three general women's-health subreddits.
# The statements are kept commented out so that re-running the notebook does
# not overwrite the CSV that was already written; uncomment them to
# regenerate the file.
#
# # Concatenate Data Frames
# womens_health_subreddits = pd.concat([womens_health, obgyn])
# womens_health_subreddits = pd.concat([womens_health_subreddits, girl_survival_guide])
# womens_health_subreddits['subreddit'].value_counts()

# # Export to CSV
# womens_health_subreddits.to_csv('womens_health_2.csv')
print("Data has already been exported.")

## Fertility and Pregnancy

### Subreddit 4: r/TryingForABaby

In [None]:
# r/TryingForABaby submissions: 540 requests, with the `after` cutoff moved
# back a further 24 hours on each one.
trying_for_a_baby = query_pushshift(
    'TryingForABaby',
    kind = 'submission',
    hour_window = 24,
    n = 540,
)

### Subreddit 5: r/Pregnant

In [None]:
# r/Pregnant submissions: 540 requests, with the `after` cutoff moved back a
# further 24 hours on each one.
pregnant = query_pushshift(
    'Pregnant',
    kind = 'submission',
    hour_window = 24,
    n = 540,
)

### Subreddit 6: r/BabyBumps

In [None]:
# r/BabyBumps submissions: 540 requests, with the `after` cutoff moved back a
# further 24 hours on each one.
baby_bumps = query_pushshift(
    'BabyBumps',
    kind = 'submission',
    hour_window = 24,
    n = 540,
)

### Combine Data Frames and Export to CSV
This cell is currently commented out to prevent overwriting of the collected data.

In [None]:
# Export step for the three fertility/pregnancy subreddits.
# The statements are kept commented out so that re-running the notebook does
# not overwrite the CSV that was already written; uncomment them to
# regenerate the file.
#
# fertility_and_pregnancy = pd.concat([trying_for_a_baby, pregnant])
# fertility_and_pregnancy = pd.concat([fertility_and_pregnancy, baby_bumps])

# fertility_and_pregnancy.to_csv('fertility_and_pregnancy_2.csv')
print("Data has already been exported.")