In [1]:
import requests
from bs4 import BeautifulSoup, SoupStrainer
import time
import pandas as pd
import numpy as np
import glob

In [36]:
# Begin with one review file that holds a reasonable number of reviews.
# The file is '$'-delimited; the 'date' column is converted to timestamps on read.
reviewFile = 'Reviews/bupropion-HCl_reviews.csv'
reviewDF = pd.read_csv(
    reviewFile,
    sep='$',
    index_col=1,
    usecols=(1, 2, 3, 4, 5, 6, 7),
    converters={'date': pd.to_datetime},
)

In [37]:
# Inspect the loaded reviews (the rendered output below is a truncated preview of the frame)
reviewDF

Unnamed: 0_level_0,conditionInfo,reviewer,Effectiveness,Satisfaction,Ease of Use,Comment
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-07 09:20:01,Condition: Depression,Reviewer: 25-34 Male (Patient),4,5,5,No Script or health Insurance needed to place ...
2019-04-18 09:00:45,Condition: Depression,"Reviewer: cant sleep, 35-44 Male on Treatment...",3,3,4,"i just started this for a week now, all of a s..."
2019-01-04 11:44:46,Condition: Depression,"Reviewer: teechur, 45-54 Female on Treatment ...",4,3,5,I started on Bupropion to deal with my depress...
2019-01-03 10:05:42,Condition: Depression,"Reviewer: guigeek, 45-54 Female on Treatment ...",5,5,5,I have been taking Bupropion HCI XL 300 mg for...
2018-01-23 15:40:03,Condition: Depression,Reviewer: 45-54 Female on Treatment for 6 mon...,3,1,4,I have taken this for aprox 7 months. I have ...
...,...,...,...,...,...,...
2007-10-22 17:23:23,Condition: Depression,"Reviewer: k2skier, 25-34 Female on Treatment ...",4,5,5,Have tried many different antidepressant/mood ...
2007-10-14 12:56:40,Condition: Depression,"Reviewer: ruthie, 45-54 Female on Treatment f...",1,1,4,
2007-09-29 19:00:31,Condition: Depression,"Reviewer: robby 36, 65-74 Male on Treatment f...",4,4,4,"this drug caussed calmness, no anger, but a lo..."
2007-09-27 16:07:15,Condition: Depression,"Reviewer: FargoUT, 25-34 Male on Treatment fo...",4,4,5,"After the loss of a loved one, I fell into a s..."


In [38]:
def parse_reviewer(reviewer):
    """Split a raw reviewer string into its individual data points.

    The strings handled here look like
    ``'Reviewer: <name>, <age range> <gender> on Treatment for <duration> (<role>)'``
    where every part after the ``Reviewer:`` label is optional.

    Parameters
    ----------
    reviewer : str
        Raw reviewer text. Any non-string value (for example the float NaN
        pandas uses for an empty CSV cell) is treated as "nothing known".

    Returns
    -------
    dict
        Keys ``'Name'``, ``'Age'``, ``'Gender'`` and ``'Length of treatment'``
        (in that order). Each value is a string, or ``np.nan`` when that piece
        of information is absent.
    """
    # np.nan (lowercase) is used throughout: the np.NaN alias no longer exists
    # in NumPy 2.0 and raises AttributeError there.
    reviewer_info = {
        'Name': np.nan,
        'Age': np.nan,
        'Gender': np.nan,
        'Length of treatment': np.nan,
    }

    # Missing cells arrive as float NaN; there is nothing to parse in them.
    if not isinstance(reviewer, str):
        return reviewer_info

    # Drop the leading label (everything up to the first ':') when present.
    _label, colon, after_label = reviewer.partition(':')
    body = (after_label if colon else reviewer).strip()

    # A comma separates the optional name from the remaining details.
    name_part, comma, details = body.partition(',')
    if comma:
        name = name_part.strip()
        if name:
            reviewer_info['Name'] = name
    else:
        details = body
    details = details.strip()

    # Separate the demographic part from the treatment duration so that each
    # field is only searched for where it can legitimately occur.
    demographics, marker, duration = details.partition('on Treatment for ')

    # Age range: the first "<digits>-<digits>" token. Looking only at the
    # details (not the name) keeps hyphenated names from confusing the search,
    # and token matching works even when the range is the last thing in the string.
    for token in demographics.split():
        low, dash, high = token.partition('-')
        if dash and low.isdigit() and high.isdigit():
            reviewer_info['Age'] = token
            break

    # Gender: searched after the name so a name containing "Male"/"Female"
    # cannot be mistaken for the reviewer's gender.
    if 'Male' in demographics:
        reviewer_info['Gender'] = 'Male'
    elif 'Female' in demographics:
        reviewer_info['Gender'] = 'Female'

    # Treatment time: text after the marker, minus a trailing "(...)" suffix.
    if marker:
        duration = duration.strip()
        if duration.endswith(')'):
            open_paren = duration.rfind('(')
            if open_paren != -1:
                duration = duration[:open_paren].strip()
        if duration:
            reviewer_info['Length of treatment'] = duration

    # The dictionary is returned so a list of them can be made into a DataFrame
    return reviewer_info

In [39]:
# Turn every raw reviewer string into a dict of parsed fields, then assemble
# the dicts into a frame that shares the review frame's index
reviewers = [parse_reviewer(raw_reviewer) for raw_reviewer in reviewDF['reviewer']]
reviewersDF = pd.DataFrame(reviewers, index=reviewDF.index)

In [40]:
# The raw reviewer text has been parsed into separate fields, so remove
# the original column from the review frame
reviewDF = reviewDF.drop('reviewer', axis=1)

In [41]:
# Attach the parsed reviewer columns alongside the existing review columns
reviewDF = pd.concat((reviewDF, reviewersDF), axis='columns')

In [43]:
# Remove the "Condition:" label from conditionInfo, leaving only the condition name.
# The vectorised .str accessor passes missing values through as NaN, whereas
# calling .replace() on each element raises AttributeError on a float NaN cell.
conditions = (
    reviewDF['conditionInfo']
    .str.replace('Condition:', '', regex=False)  # literal match, all occurrences
    .str.strip()
    .tolist()  # keep `conditions` a plain list, assigned positionally below
)

reviewDF['conditionInfo'] = conditions

In [44]:
# Inspect the frame now that the reviewer text is split into Name / Age / Gender /
# Length of treatment and the "Condition:" label has been stripped
reviewDF

Unnamed: 0_level_0,conditionInfo,Effectiveness,Satisfaction,Ease of Use,Comment,Name,Age,Gender,Length of treatment
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020-01-07 09:20:01,Depression,4,5,5,No Script or health Insurance needed to place ...,,25-34,Male,
2019-04-18 09:00:45,Depression,3,3,4,"i just started this for a week now, all of a s...",cant sleep,35-44,Male,less than 1 month
2019-01-04 11:44:46,Depression,4,3,5,I started on Bupropion to deal with my depress...,teechur,45-54,Female,1 to less than 2 years
2019-01-03 10:05:42,Depression,5,5,5,I have been taking Bupropion HCI XL 300 mg for...,guigeek,45-54,Female,10 years or more
2018-01-23 15:40:03,Depression,3,1,4,I have taken this for aprox 7 months. I have ...,,45-54,Female,6 months to less than 1 year
...,...,...,...,...,...,...,...,...,...
2007-10-22 17:23:23,Depression,4,5,5,Have tried many different antidepressant/mood ...,k2skier,25-34,Female,6 months to less than 1 year
2007-10-14 12:56:40,Depression,1,1,4,,ruthie,45-54,Female,6 months to less than 1 year
2007-09-29 19:00:31,Depression,4,4,4,"this drug caussed calmness, no anger, but a lo...",robby 36,65-74,Male,2 to less than 5 years
2007-09-27 16:07:15,Depression,4,4,5,"After the loss of a loved one, I fell into a s...",FargoUT,25-34,Male,2 to less than 5 years


In [47]:
# Print each cleaned conditionInfo value, followed by a blank line, to eyeball the result
for condition in reviewDF['conditionInfo']:
    print(condition, '\n')

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 

Depression 
