In [209]:
## packages and imports

## outputs and loading
import warnings
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
warnings.filterwarnings("ignore")
import os
from operator import itemgetter
from tabulate import tabulate
from IPython.display import display, HTML

## dataframe
import pandas as pd
import numpy as np  


## preprocessing
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from sklearn.feature_extraction.text import CountVectorizer


## add punctuation and some application-specific words
## to stopword list
from nltk.stem import PorterStemmer
porter = PorterStemmer()
from nltk.tokenize import wordpunct_tokenize

## lda estimate and visualize
from gensim import corpora
import gensim
import pyLDAvis.gensim as gensimvis
import pyLDAvis

## functions
stopwords_standard = set(stopwords.words('english'))
def remove_stop(row, colname, stopword_dict):
    
    string_of_col = str(row[colname])
    try:
        processed_string = " ".join([i for i in wordpunct_tokenize(string_of_col) if 
                        i not in stopword_dict])  ## removed numeric
        return(processed_string)
    except:
        processed_string = "" # to handle data errors where not actually text
        return(processed_string)
    
def processtext(row, colname):
    
    string_of_col = str(row[colname])
    try:
        processed_string = " ".join([porter.stem(i.lower()) for i in wordpunct_tokenize(string_of_col) if 
                        i.lower().isalpha() and len(i) >=3])  
        return(processed_string)
    except:
        processed_string = "" # to handle data errors where not actually text eg someone left blank
        return(processed_string)
    
def create_dtm(list_of_strings, metadata):
    vectorizer = CountVectorizer(lowercase = True)
    dtm_sparse = vectorizer.fit_transform(list_of_strings)
    dtm_dense_named = pd.DataFrame(dtm_sparse.todense(), columns=vectorizer.get_feature_names())
    dtm_dense_named_withid = pd.concat([metadata.reset_index(), dtm_dense_named], axis = 1)
    return(dtm_dense_named_withid)




# 1. Analyze free responses from optimizing schools pilot

*Note of caution*: these are based on low N and are meant to be illustrative 

In [198]:
frdata_schools = pd.read_csv("../data/cleaned_fr_pilot.csv")


## Basic preprocessing 

In [153]:
frdata_schools['text_lower'] = frdata_schools.explain_fairness.astype(str).str.lower()

In [154]:
## convert to lowercase and remove stopwords
frdata_schools['text_lower_nostop'] = frdata_schools.apply(remove_stop,
                                    axis = 1,
                                   args = ["text_lower", stopwords_standard])

In [155]:
## remove punctuation, digits, and length2 or less
frdata_schools['text_preprocess'] = frdata_schools.apply(processtext,
                                    axis = 1,
                                   args = ["text_lower_nostop"])

## Example 1: LDA

In [156]:
## store in a list and re-tokenize
all_stemmed_text = frdata_schools.text_preprocess
text_preprocess_tokens = [wordpunct_tokenize(one_row) for one_row 
                         in all_stemmed_text]

## create dictionary (all unique words and counts)
dictionary = corpora.Dictionary(text_preprocess_tokens)

## filter out words that are in almost none or almost all documents
lower_thres = 0.01*frdata_schools.shape[0]
upper_thres = 0.99*frdata_schools.shape[0]
dictionary.filter_extremes(no_below=lower_thres, no_above=upper_thres)

## use the dictionary to create the corpus
corpus = [dictionary.doc2bow(text) for text in text_preprocess_tokens]
num_topics = 10

In [157]:
## estimate
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = num_topics, id2word=dictionary, passes=10,
                                          alpha = 'auto',
                                          per_word_topics = True)


In [158]:
topics = ldamodel.print_topics(num_words = 20)
pyLDAvis.enable_notebook()
lda_display = gensimvis.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(lda_display)

### Topics not capturing more nuanced moral differences

In [159]:
## see from viz that, for instance, topic 10 is focused on issues of counselor bias, topic 2 on issues with parent
## requests

## get document-specific topic probabilities
## has element for each document w/ tuples of topic probabilities
all_topics = ldamodel.get_document_topics(corpus, minimum_probability=0.0, per_word_topics=False)


In [199]:
efficient_nostories = frdata_schools.explain_fairness[frdata_schools.explain_fairness.astype(str).str.contains("stories")]
print("For an example response focused on algorithms as efficient, these are the topic probabilities:")
print("----------------------------------------------------------------------------")
print(efficient_nostories.to_string(index = False))
print(all_topics[efficient_nostories.index[0]])
focal_text = efficient_nostories.index

## get index of top topic 
top_topic = sorted(all_topics[efficient_nostories.index[0]], key = lambda x: x[1])[len(test)-1][0]
print("For this efficiency example, highest-probability topic is topic " + str(top_topic))

For an example response focused on algorithms as efficient, these are the topic probabilities:
----------------------------------------------------------------------------
I think it would be easier to apply predictive models than learning about each individual student. It would be easier in the sense of using time efficiently so you can create a school program or class that would help more students in a similar situation. It takes too much time to sit down with each student in order to know their individual stories.
[(0, 0.001819176), (1, 0.0023839534), (2, 0.0020805404), (3, 0.0029978217), (4, 0.0022326568), (5, 0.0018926469), (6, 0.002496051), (7, 0.0016892517), (8, 0.98022133), (9, 0.0021865289)]
For this efficiency example, highest-probability topic is topic 8


In [217]:
## see topic 0 is the top topic--- look at other responses with high-probability of that topic
## empty topic dictionary
topic_dict = {i: [] for i in range(len(all_topics[top_topic]))}  

## iterate over docs and append the vector of 
## document-specific probabilities to each key (one topic)
for docID in range(len(all_topics)):
    topic_vector = ldamodel[corpus[docID]]
    for topicID, prob in topic_vector:
        topic_dict[topicID].append([docID, prob])
     
## pull out that topic and pull topic responses
docs_fortopic = topic_dict[top_topic]
indices_top_5_resp = [el[0] for el in 
                                sorted(docs_fortopic, key=itemgetter(1))[len(docs_fortopic)-5:len(docs_fortopic)]]


## see that they're either talking about benefits of a predictive model, 
## esp relative to randomness, but not the focal responses ideas about
## efficiency (regardless of its normative desirability)
display(HTML(pd.DataFrame(frdata_schools.loc[frdata_schools.explain_fairness.index.isin(indices_top_5_resp),
                                ['binary_morefair',
                                'explain_fairness',
                                'pol_ideology',
                                'gender']]).to_html(index = \
            False)))


binary_morefair,explain_fairness,pol_ideology,gender
school counselor using a predictive model/algorithm,"The predictive model is more fair because the alternative, where the counselor decides based on parents' requests, means that some kids who need help won't get help because their parents may not be active in their lives enough to go out and request help. It also removes the chances of some kind of personal bias on the part of a counselor clouding their judgment.",Liberal,Male
school counselor using a predictive model/algorithm,I think it would be easier to apply predictive models than learning about each individual student. It would be easier in the sense of using time efficiently so you can create a school program or class that would help more students in a similar situation. It takes too much time to sit down with each student in order to know their individual stories.,Liberal,Female
school counselor using a predictive model/algorithm,"Random drawing for things like this are not the best way to do things due to the fact that a lot of kids don't need or want one on one counseling so they are taking up a spot from someone who actually needs it. Using a 100% algorithm model though is also not completly fair due to the fact that there will be outliners that get skipped and just because a stupid looks like they are ""ok"" on paper, that may not actually be the case in real life. The most fair way to do it would be a combination of the two system with something like 80% picked from model and then maybe 20% done through a stupid signup sheet (if there are too many then randomly draw from that 20%).",Slightly conservative,Male
school counselor using a predictive model/algorithm,Some parents who don't really need it quite as much might send a request and some parents who do need it might be too proud to send a request. with this model it would better determine who needs it more.,Liberal,Female
school counselor using a predictive model/algorithm,Income cutoff could be pretty subjective. Lets say you cut it off at 15000 but tons of students who come from 16000 income are having just as many issues and don't get help. Need to mix a variety of metrics to figure out who needs help.,Extremely Conservative,Male


## Example 2: dictionary-based method using moral foundations theory

In [219]:
## create input file that matches formatting
frdata_schools.explain_fairness.to_csv("fr_forscore.csv",
                                      index = False)


In [220]:
%%bash
emfdscore fr_forscore.csv school_mftscores.csv bow emfd all vice-virtue


Running eMFDscore
Total number of input texts to be scored: 212
Scoring completed.


Processed: 0 N/A% |                      | Elapsed Time: 0:00:00 ETA:  --:--:--Processed: 212 100% |❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤| Elapsed Time: 0:00:00 Time:  0:00:00


In [221]:

## pull up results for focal text
## see that it's generally low in all
## but is highest in loyalty,
## somewhat high in virtue
mfdt_res = pd.read_csv("school_mftscores.csv")
focal_mfdt = mfdt_res.iloc[focal_text]
print("For an example response focused on algorithms as efficient, these are the moral foundation scores:")
print("----------------------------------------------------------------------------")
print(efficient_nostories.to_string(index = False))
focal_mfdt


For an example response focused on algorithms as efficient, these are the moral foundation scores:
----------------------------------------------------------------------------
I think it would be easier to apply predictive models than learning about each individual student. It would be easier in the sense of using time efficiently so you can create a school program or class that would help more students in a similar situation. It takes too much time to sit down with each student in order to know their individual stories.


Unnamed: 0,care.virtue,fairness.virtue,loyalty.virtue,authority.virtue,sanctity.virtue,care.vice,fairness.vice,loyalty.vice,authority.vice,sanctity.vice,moral_nonmoral_ratio,f_var
129,0.04,0.041613,0.061589,0.025956,0.011745,0.039836,0.02459,0.022727,0.033713,0.056424,4.0,0.000237


In [227]:
## add index as row number to each
frdata_schools['doc'] = frdata_schools.index
mfdt_res['doc'] = mfdt_res.index

mfdt_wanswer = pd.merge(frdata_schools,
                       mfdt_res,
                       on = "doc",
                       how = "left")

## shows other answers that score similarly high in virtue 
## but that touch on very different moral concerns
prox_answer = mfdt_wanswer.sort_values(by = 'loyalty.virtue', ascending = False).reset_index()



In [228]:
display(HTML(pd.DataFrame(prox_answer.loc[prox_answer.index.isin(range(prox_answer[prox_answer.doc == focal_text[0]].index[0],
                                        prox_answer[prox_answer.doc == focal_text[0]].index[0]+5)),
            ['binary_morefair',
            'explain_fairness',
            'loyalty.virtue',
            'pol_ideology',
            'gender']]).to_html(index = False)))

binary_morefair,explain_fairness,loyalty.virtue,pol_ideology,gender
school counselor using a predictive model/algorithm,I think it would be easier to apply predictive models than learning about each individual student. It would be easier in the sense of using time efficiently so you can create a school program or class that would help more students in a similar situation. It takes too much time to sit down with each student in order to know their individual stories.,0.061589,Liberal,Female
school counselor using a predictive model/algorithm,I think it would be fair.,0.061347,Liberal,Male
school counselor using a predictive model/algorithm,I think it is more fair because students call fall into certain algorithms and models.,0.061264,Slightly conservative,Male
school counselor using a predictive model/algorithm,It helps gauge the general state of the class as a whole,0.061202,"Moderate, Middle of road",Female
school counselor using a predictive model/algorithm,"If initially set up correctly, all students will me measured using the same algorithm. There wouldn't be any chance that one student is more favored than the other other than based on their needs.",0.061062,Liberal,Female
