# In this notebook I will:
* Parse the reviewer column into actually useful information
* Split each dataframe into reviews that have comment text and reviews that do not (saving the reviews without text to their own file)
* Save files with processed reviews

In [77]:
import pandas as pd
import numpy as np
import glob

# Haven't decided whether I like nltk or spacy better yet
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import wordnet, stopwords
stops = stopwords.words('english')
import spacy
import en_core_web_lg
nlp = en_core_web_lg.load()

# A method to process text in nltk:
# https://pythonhealthcare.org/2018/12/14/101-pre-processing-data-tokenization-stemming-and-removal-of-stop-words/

# same process in spacy
# https://spacy.io/usage/linguistic-features

In [107]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Just don't have time to learn these right now
#from sklearn.base import TransformerMixin
#from sklearn.pipeline import Pipeline

In [183]:
import matplotlib.pyplot as plt
import seaborn as sns

In [80]:
# Un-flag these words as stop words in spaCy's vocabulary so that negatives
# are kept for the sentiment analysis
for word in (u'nor', u'none', u'not', u'alone', u'no', u'never', u'cannot', u'always'):
    lexeme = nlp.vocab[word]
    lexeme.is_stop = False

In [146]:
def spacyTokenizer(s: str)-> list:
    """Tokenize a string with the module-level spaCy pipeline ``nlp``.

    The input is lower-cased and stripped before parsing. The result is the
    list of lemmas for tokens that are alphabetic, are not flagged as stop
    words, and whose lemma is not the '-PRON-' placeholder.
    """
    cleaned = s.lower().strip()
    return [
        tok.lemma_
        for tok in nlp(cleaned)
        if tok.is_alpha and not tok.is_stop and tok.lemma_ != '-PRON-'
    ]

# Processing the reviews files

In [229]:
import re

# A function to parse the reviewer information
def parse_reviewer(reviewer):
    """Split one raw reviewer string into name, age range, gender and treatment time.

    NOTE(review): the slicing assumes strings shaped like
    '<label>: <name>, <age range> <gender> on Treatment for <duration> (<role>)',
    where every part after the label is optional -- confirm against the raw data.

    Parameters
    ----------
    reviewer : str
        Raw value from the 'reviewer' column. Any non-string value (for
        example a missing cell read as NaN) yields an all-NaN result.

    Returns
    -------
    dict
        Keys 'Name', 'Age', 'Gender' and 'Length of treatment', in that order.
        A field that cannot be found is ``np.nan``.
    """
    # Put info in a dictionary that can be made into a dataframe.
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    reviewer_info = {
        'Name': np.nan,
        'Age': np.nan,
        'Gender': np.nan,
        'Length of treatment': np.nan,
    }
    if not isinstance(reviewer, str):
        return reviewer_info

    # str.find returns -1 when nothing matches; never feed that into slice maths.
    colon = reviewer.find(':')
    body_start = colon + 2 if colon != -1 else 0
    comma = reviewer.find(',')

    # Find name as unique identifier if present (a name is followed by a comma)
    if comma != -1:
        reviewer_info['Name'] = reviewer[body_start:comma]
        details = reviewer[comma + 1:]
    else:
        details = reviewer[body_start:]

    # Find age range as datapoint if present. Only the text after the name and
    # before the treatment clause is searched, so a hyphenated name or a hyphen
    # later in the string cannot be mistaken for the age range.
    treatment_at = details.find('on Treatment')
    demographics = details if treatment_at == -1 else details[:treatment_at]
    age_match = re.search(r'\d+-\d+', demographics)
    if age_match:
        reviewer_info['Age'] = age_match.group(0)

    # Find gender if present (whole words only, and never inside the name)
    gender_match = re.search(r'\b(Male|Female)\b', details)
    if gender_match:
        reviewer_info['Gender'] = gender_match.group(1)

    # Find treatment time: the text after 'on Treatment for ', up to a trailing
    # parenthesised suffix when the string ends with ')'
    marker = 'on Treatment for '
    marker_at = reviewer.find(marker)
    if marker_at != -1:
        start = marker_at + len(marker)
        end = len(reviewer)
        if reviewer.rstrip().endswith(')'):
            paren = reviewer.rfind('(')
            if paren >= start:
                end = paren
        reviewer_info['Length of treatment'] = reviewer[start:end].strip()

    return reviewer_info

def processReviewerColumn(reviewDF):
    """Replace the 'reviewer' column with the fields produced by parse_reviewer.

    Every value of ``reviewDF['reviewer']`` is parsed, the resulting records
    are assembled into a dataframe sharing ``reviewDF``'s index, and that
    dataframe is appended column-wise to ``reviewDF`` minus its 'reviewer'
    column. A new dataframe is returned.
    """
    # One parsed record per row, aligned on the original index
    parsed_records = [parse_reviewer(raw) for raw in reviewDF['reviewer']]
    reviewersDF = pd.DataFrame(parsed_records, index=reviewDF.index)

    # Swap the raw column for the parsed columns
    without_reviewer = reviewDF.drop(columns=['reviewer'])
    return pd.concat([without_reviewer, reviewersDF], axis=1)

In [279]:
import os
def compare_parsedAndEmpty(parsed,empty,file):
    """Save side-by-side histograms comparing reviews with and without comments.

    Parameters
    ----------
    parsed : pandas.DataFrame
        Reviews that have a comment; needs 'Satisfaction' and 'Effectiveness' columns.
    empty : pandas.DataFrame
        Reviews without a comment; needs the same two columns.
    file : str
        Path of the raw reviews file. The figure is written to a 'plots'
        directory next to it, named after the file with 'raw_reviews.csv'
        replaced by 'empty_compare.png'.
    """
    bins = np.arange(0.75, 5.5, 0.5)
    total = len(empty) + len(parsed)

    # Making a graph to compare the populations, one panel per rating column
    fig, axes = plt.subplots(1, 2, constrained_layout=True, figsize=(15, 5))
    for ax, column in zip(axes, ('Satisfaction', 'Effectiveness')):
        parsed.hist(column=column, color='skyblue', ax=ax, bins=bins,
                    density=True, label='With comment')
        empty.hist(column=column, color='orange', ax=ax, bins=bins,
                   density=True, alpha=0.5, label='No comment')
        # Title each panel with the column it actually shows
        ax.set_title('{}  (Empty: {:g}/{:g})'.format(column, len(empty), total))
        ax.legend(loc='best')
        ax.set_xlabel('Number of stars')
        ax.set_ylabel('% of reviews')

    # exist_ok avoids a FileExistsError when the directory is already there
    # but does not contain any PNG yet
    folder, filename = os.path.split(file)
    directory = os.path.join(folder, 'plots')
    os.makedirs(directory, exist_ok=True)

    savefile = filename.replace('raw_reviews.csv','empty_compare.png')
    fig.savefig(os.path.join(directory, savefile))
    # Close this specific figure rather than whichever one is current
    plt.close(fig)

In [280]:
# Collect every per-condition raw reviews file, skipping any path that
# contains 'allconditions'
files = [
    path
    for path in glob.glob('ProcessedReviews/*/*raw_reviews.csv')
    if 'allconditions' not in path
]

In [281]:
# Processing all the files
for file in files:
    # Reading in file
    df = pd.read_csv(file, sep='$', index_col=0)

    # Resetting the index to remove the indices that were split by condition
    # separation; drop=True discards the old index whatever it was named
    df = df.reset_index(drop=True)

    # Swap only the last 'raw' in the path (the one in '...raw_reviews.csv'),
    # so a directory or drug name containing 'raw' is left untouched
    stem, _, tail = file.rpartition('raw')

    # Dropping comments that contain no information
    parsedDF = processReviewerColumn(df).dropna(subset=['Comment'])
    parsedDF.to_csv(stem + 'parsed' + tail, sep='$')

    # Finding the empty reviews (rows of the unparsed dataframe whose comment
    # is missing); a boolean mask keeps the original columns and dtypes
    emptyReviews = df.loc[df['Comment'].isna()]
    if not emptyReviews.empty:
        emptyReviews.to_csv(stem + 'empty' + tail, sep='$')

        # Creating some plots to review later
        compare_parsedAndEmpty(parsedDF, emptyReviews, file)