In [55]:
## packages and imports

## outputs and loading
import warnings
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
warnings.filterwarnings("ignore")
import os
from operator import itemgetter

## dataframe
import pandas as pd
import numpy as np  


## preprocessing
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from sklearn.feature_extraction.text import CountVectorizer


## add punctuation and some application-specific words
## to stopword list
from nltk.stem import PorterStemmer
porter = PorterStemmer()
from nltk.tokenize import wordpunct_tokenize

## lda
from gensim import corpora
import gensim

## functions
stopwords_standard = set(stopwords.words('english'))
def remove_stop(row, colname, stopword_dict):
    
    string_of_col = str(row[colname])
    try:
        processed_string = " ".join([i for i in wordpunct_tokenize(string_of_col) if 
                        i not in stopword_dict])  ## removed numeric
        return(processed_string)
    except:
        processed_string = "" # to handle data errors where not actually text
        return(processed_string)
    
def processtext(row, colname):
    
    string_of_col = str(row[colname])
    try:
        processed_string = " ".join([porter.stem(i.lower()) for i in wordpunct_tokenize(string_of_col) if 
                        i.lower().isalpha() and len(i) >=3])  
        return(processed_string)
    except:
        processed_string = "" # to handle data errors where not actually text eg someone left blank
        return(processed_string)
    
def create_dtm(list_of_strings, metadata):
    vectorizer = CountVectorizer(lowercase = True)
    dtm_sparse = vectorizer.fit_transform(list_of_strings)
    dtm_dense_named = pd.DataFrame(dtm_sparse.todense(), columns=vectorizer.get_feature_names())
    dtm_dense_named_withid = pd.concat([metadata.reset_index(), dtm_dense_named], axis = 1)
    return(dtm_dense_named_withid)




# 1. Analyze free responses from optimizing schools pilot

*Note of caution*: these are based on low N and are meant to be illustrative 

In [13]:
frdata_schools = pd.read_csv("../data/cleaned_fr_pilot.csv")


## Basic preprocessing 

In [16]:
frdata_schools['text_lower'] = frdata_schools.explain_fairness.astype(str).str.lower()

In [20]:
## convert to lowercase and remove stopwords
frdata_schools['text_lower_nostop'] = frdata_schools.apply(remove_stop,
                                    axis = 1,
                                   args = ["text_lower", stopwords_standard])

In [62]:
## remove punctuation, digits, and length2 or less
frdata_schools['text_preprocess'] = frdata_schools.apply(processtext,
                                    axis = 1,
                                   args = ["text_lower_nostop"])

## Example 1: LDA

In [29]:
## store in a list and re-tokenize
all_stemmed_text = frdata_schools.text_preprocess
text_preprocess_tokens = [wordpunct_tokenize(one_row) for one_row 
                         in all_stemmed_text]

## create dictionary (all unique words and counts)
dictionary = corpora.Dictionary(text_preprocess_tokens)

## filter out words that are in almost none or almost all documents
lower_thres = 0.01*frdata_schools.shape[0]
upper_thres = 0.99*frdata_schools.shape[0]
dictionary.filter_extremes(no_below=lower_thres, no_above=upper_thres)

## use the dictionary to create the corpus
corpus = [dictionary.doc2bow(text) for text in text_preprocess_tokens]
num_topics = 10

In [30]:
## estimate
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = num_topics, id2word=dictionary, passes=10,
                                          alpha = 'auto',
                                          per_word_topics = True)


In [33]:
topics = ldamodel.print_topics(num_words = 20)
    
import pyLDAvis.gensim as gensimvis
import pyLDAvis
pyLDAvis.enable_notebook()
lda_display = gensimvis.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(lda_display)

### Topics not capturing more nuanced moral differences

In [82]:
## see from viz that, for instance, topic 10 is focused on issues of counselor bias, topic 2 on issues with parent
## requests--- find high-probability docs for that topic

## get document-specific topic probabilities
## has element for each document w/ tuples of topic probabilities
all_topics = ldamodel.get_document_topics(corpus, minimum_probability=0.0, per_word_topics=False)


In [93]:
efficient_nostories = frdata_schools.explain_fairness[frdata_schools.explain_fairness.astype(str).str.contains("stories")]
print("For an example text focused on algorithms as efficient, these are the topics:")
print("----------------------------------------------------------------------------")
print(efficient_nostories)
print(all_topics[efficient_nostories.index[0]])
focal_text = efficient_nostories.index

For an example text focused on algorithms as efficient, these are the topics
----------------------------------------------------------------------------
129    I think it would be easier to apply predictive models than learning about each individual student. It would be easier in the sense of using time efficiently so you can create a school program or class that would help more students in a similar situation. It takes too much time to sit down with each student in order to know their individual stories.
Name: explain_fairness, dtype: object
[(0, 0.98026466), (1, 0.0018776644), (2, 0.0019383765), (3, 0.0020405487), (4, 0.0019668536), (5, 0.0019466605), (6, 0.0022311981), (7, 0.0024939054), (8, 0.0023879749), (9, 0.0028521467)]


In [84]:
## see topic 0 is the top topic--- look at other responses with high-probability of that topic
## empty topic dictionary
topic_dict = {i: [] for i in range(len(all_topics[0]))}  

## iterate over docs and append the vector of 
## document-specific probabilities to each key (one topic)
for docID in range(len(all_topics)):
    topic_vector = ldamodel[corpus[docID]]
    for topicID, prob in topic_vector:
        topic_dict[topicID].append([docID, prob])
     
## pull out that topic and pull topic responses
focal_topic = 0
top_0 = topic_dict[0]
indices_top_5_resp = [el[0] for el in 
                                sorted(top_0, key=itemgetter(1))[len(top_0)-5:len(parent_issues)]]


## see that they're either talking about benefits of a predictive model, 
## esp relative to randomness, but not the focal responses ideas about
## efficiency (regardless of its normative desirability)
frdata_schools.explain_fairness[frdata_schools.explain_fairness.index.isin(indices_top_5_resp)]

7      I think that drawing a random name is fair but it may not be the name of a student that will actually benefit from having a mentor. I think an algorithm is also fair and will most likely end up choosing students that will for sure benefit from a mentor.                                                                                                                                                         
60     Drawing students' names randomly would perhaps be the correct way to determine if the counselor is indeed helpful to students and helps them stay in school. However, I think the predictive model/algorithm is probably more ethical because it would provide more opportunities for students who would probably be more susceptible to missing school and ensure more of these kids would have access to a counselor
124    Be an excellent communicator. Being able to communicate ideas, thoughts, and feelings verbally is a trait that can never go unsung as a school counselor. with this i

## Example 2: moral dictionary

In [86]:
## create input file that matches formatting
frdata_schools.explain_fairness.to_csv("fr_forscore.csv",
                                      index = False)

In [88]:
%%bash
emfdscore fr_forscore.csv all-vv.csv bow emfd all vice-virtue

Running eMFDscore
Total number of input texts to be scored: 212
Scoring completed.


Processed: 0 N/A% |                      | Elapsed Time: 0:00:00 ETA:  --:--:--Processed: 202  95% |❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤ | Elapsed Time: 0:00:00 ETA:   0:00:00Processed: 212 100% |❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤| Elapsed Time: 0:00:00 Time:  0:00:00


In [130]:
mfdt_res = pd.read_csv("all-vv.csv")

## pull up results for focal text
## see that it's generally low in all
## but is highest in loyalty,
## somewhat high in virtue
focal_mfdt = mfdt_res.iloc[focal_text]
print("For an example text focused on algorithms as efficient, these are the moral foundation scores:")
print("----------------------------------------------------------------------------")
print(efficient_nostories)
focal_mfdt


For an example text focused on algorithms as efficient, these are the moral foundation scores:
----------------------------------------------------------------------------
129    I think it would be easier to apply predictive models than learning about each individual student. It would be easier in the sense of using time efficiently so you can create a school program or class that would help more students in a similar situation. It takes too much time to sit down with each student in order to know their individual stories.
Name: explain_fairness, dtype: object


Unnamed: 0,care.virtue,fairness.virtue,loyalty.virtue,authority.virtue,sanctity.virtue,care.vice,fairness.vice,loyalty.vice,authority.vice,sanctity.vice,moral_nonmoral_ratio,f_var
129,0.04,0.041613,0.061589,0.025956,0.011745,0.039836,0.02459,0.022727,0.033713,0.056424,4.0,0.000237


In [151]:
## add index as row number to each
answer_only = pd.DataFrame(frdata_schools.explain_fairness.copy())
answer_only['doc'] = answer_only.index
mfdt_res['doc'] = mfdt_res.index

mfdt_wanswer = pd.merge(answer_only,
                       mfdt_res,
                       on = "doc",
                       how = "left")

## shows other answers that score similarly high in virtue 
## but that touch on very different moral concerns
prox_answer = mfdt_wanswer.sort_values(by = 'loyalty.virtue', ascending = False).reset_index()

prox_answer.loc[prox_answer.index.isin(range(prox_answer[prox_answer.doc == focal_text[0]].index[0],
                                        prox_answer[prox_answer.doc == focal_text[0]].index[0]+5)),
            ['explain_fairness', 'loyalty.virtue']]

Unnamed: 0,explain_fairness,loyalty.virtue
75,I think it would be easier to apply predictive models than learning about each individual student. It would be easier in the sense of using time efficiently so you can create a school program or class that would help more students in a similar situation. It takes too much time to sit down with each student in order to know their individual stories.,0.061589
76,I think it would be fair.,0.061347
77,I think it is more fair because students call fall into certain algorithms and models.,0.061264
78,It helps gauge the general state of the class as a whole,0.061202
79,"If initially set up correctly, all students will me measured using the same algorithm. There wouldn't be any chance that one student is more favored than the other other than based on their needs.",0.061062
