In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("/Users/CalebPitts/Documents/Files/School/College/19-20-Year/Research/data/bio_data.csv")

In [3]:
df.shape

(169, 1343)

In [4]:
df.head()

Unnamed: 0,randomid,intervention,asia,roster6_21,roster6_29,roster7_2,courseformat,coursecode,departmentnamecode,coursenum,...,wcoursereg4,woactreg31,woactreg32,wcoursesat4,woactsat31,woactsat32,wdistractw5,wwhenw5,wstrategyw5,statusw5
0,1,1.0,1.0,1.0,1.0,1.0,1.0,5005.0,BIO SCI,9B,...,,,,,,,,,,no
1,2,1.0,0.0,1.0,1.0,1.0,1.0,5005.0,BIO SCI,9B,...,,,,,,,,,,no
2,3,1.0,0.0,1.0,1.0,1.0,1.0,5005.0,BIO SCI,9B,...,,,,,,,,,,no
3,4,2.0,0.0,1.0,1.0,1.0,1.0,5005.0,BIO SCI,9B,...,,,,,,,,,,no
4,5,1.0,0.0,1.0,1.0,1.0,1.0,5005.0,BIO SCI,9B,...,,,,,,,,,,no


In [5]:
df.columns

Index(['randomid', 'intervention', 'asia', 'roster6_21', 'roster6_29',
       'roster7_2', 'courseformat', 'coursecode', 'departmentnamecode',
       'coursenum',
       ...
       'wcoursereg4', 'woactreg31', 'woactreg32', 'wcoursesat4', 'woactsat31',
       'woactsat32', 'wdistractw5', 'wwhenw5', 'wstrategyw5', 'statusw5'],
      dtype='object', length=1343)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169 entries, 0 to 168
Columns: 1343 entries, randomid to statusw5
dtypes: float64(1054), int64(1), object(288)
memory usage: 1.7+ MB


In [7]:
object_cols = df.select_dtypes(include='object').columns

## Text Response Extraction Script

In [8]:
def get_data(file_path):
    df = pd.read_csv("{}/bio_data.csv".format(file_path))
    sheets = pd.read_excel('{}/codebook.xlsx'.format(file_path), sheet_name=None)
    
    return df, sheets

In [9]:
def combine_sheets(sheets):
    full_table = pd.DataFrame()
    for name, sheet in sheets.items():
        sheet['sheet'] = name
        full_table = full_table.append(sheet)
    
    return full_table.reset_index(drop=True)

In [10]:
def get_text_response_vars(cb, text_response_cols):
    return cb[cb["Response Values"].isin(text_response_cols)][["Name Stem", "Item"]]

In [11]:
def get_text_response_data(cb, df, trv):
    text_response_vars = trv["Name Stem"].tolist() + ["randomid"]
    
    selected_cols = []
    q_var_items = {}
    for var in text_response_vars:
        found = False
        for col in df.columns:
            if var in col:
                try:
                    q_item = cb.loc[var]["Item"]  # extracts question/item for text response var
                    q_var_items[str(col)] = q_item
                except KeyError:
                    print("key error for {}".format(var))
                selected_cols.append(df[col])
                found = True
        
        if not found:
            print("Did not find response match for {}".format(var))
    
    dftr = pd.DataFrame(selected_cols)
    dftr.drop_duplicates(inplace=True)
    dftr.columns = dftr.loc["randomid"]
    
    return dftr, q_var_items

In [12]:
def match_response_with_question(dftr, q_var_items):
    questions = []
    for var in dftr.index:
        try:
            question = q_var_items[var]
            questions.append(question)
        except KeyError:
            print("Key error encounted when trying to map vars to question items for {}".format(var))
            questions.append("None")
    
    dftr["Item"] = questions
    dftr.sort_index(inplace=True)
    
    return dftr

In [13]:
def identify_valid_text_responses(dftr):
    keep_indexes = []
    clean = dftr.loc[:, dftr.columns != 'Item']
    for index in clean.index:
        for val in clean.loc[index]:
            if not pd.isna(val):
                try:
                    int(val)
                except ValueError:
                    keep_indexes.append(index)  # can't convert value to int, meaning it is a free response, thus we keep this row
                    break
    return keep_indexes

In [14]:
df, sheets = get_data("/Users/CalebPitts/Documents/Files/School/College/19-20-Year/Research/data")  # extract data and codebook

In [15]:
text_response_cols = ["open-ended"]

cb = combine_sheets(sheets)  # get codebook with sheets combined (cb)
trv = get_text_response_vars(cb, text_response_cols)  # get text response variables (trv)
dftr, q_var_items = get_text_response_data(cb.set_index("Name Stem"), df, trv)  # get text responses from data frame (dftr)
dftr = match_response_with_question(dftr, q_var_items)
keep_indexes = identify_valid_text_responses(dftr)

final_responses = dftr[dftr.index.isin(keep_indexes)]
new_order = ["Item"] + final_responses.columns[:-1].tolist()
final_responses = final_responses[new_order]

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Did not find response match for firstname
Did not find response match for lastname
Did not find response match for studentid
Did not find response match for email
Did not find response match for phone
Did not find response match for firstname
Did not find response match for lastname
Did not find response match for studentid
Did not find response match for email
Did not find response match for phone
Did not find response match for firstname
Did not find response match for lastname
Did not find response match for studentid
Did not find response match for email
Did not find response match for phonemodel
Did not find response match for phonecarrier
Did not find response match for phonemodel
Did not find response match for phonecarrier
key error for randomid
Key error encounted when trying to map vars to question items for randomid


In [16]:
final_responses.shape

(173, 170)

In [17]:
final_responses.head()

randomid,Item,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,160.0,161.0,162.0,163.0,164.0,165.0,166.0,167.0,168.0,169.0
daddod1,"Name Stem daddo If so, what else did you do...",,,,,,,,,,...,,,,,,,,,,Reading quiz and discussion
daddod2,"Name Stem daddo If so, what else did you do...",,,,,,,,,This morning I read (and am reading! - I've ta...,...,,,,,,,,,,
daddod3,"Name Stem daddo If so, what else did you do...",,,,,,,packing,,,...,,,,,,,,,,
daddod4,"Name Stem daddo If so, what else did you do...",,,,,,,,,,...,,,,,,,,,,
daddod5,"Name Stem daddo If so, what else did you do...",,,,,,,,,,...,,,,Week 3 Quiz,,,Two other courses,,,


In [18]:
final_responses.to_csv("/Users/CalebPitts/Documents/Files/School/College/19-20-Year/Research/DMOL-master/data/clean/open_ended_responses.csv")