In [None]:
# IMPORT NECESSARY LIBRARIES AND MODULES
import datetime
import itertools
import os
import re
import time

import numpy as np
import pandas as pd
import scipy.stats as stats
import tqdm as tqdm

%set_env PYTHONHASHSEED=1

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.neural_network import MLPClassifier

from gensim.models import Doc2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.parsing.preprocessing import preprocess_string, strip_multiple_whitespaces
from gensim.corpora import Dictionary
from gensim.matutils import corpus2dense

In [None]:
#
# ALL DATA AND PRE-PROCESSING CODE IS UNAVAILABLE TO UNAUTHORIZED USERS DUE TO PRIVACY AND CONFIDENTIALITY RESTRICTIONS
#

In [None]:
# IMPORT RAW ORIGINAL DATA AND ESTABLISH TIMESTAMP FOR ANALYTIC PROCESSES
# Load the full transcript export into a DataFrame.
rawData = pd.read_json('ksu_full.json')
# Compact run timestamp: the current datetime with every non-word character
# removed. The raw string avoids the invalid "\W" escape of a normal literal.
today = re.sub(pattern=r"\W", repl="", string=str(datetime.datetime.today()))

In [None]:
# REQUISITE DATAFRAME COLUMNS AND CONTENTS
# "segCode" : (Optional) variable manifest from data cleaning stage indicating samples that are suited (0) or not suited (1) for analysis.
# "Start Date" : The datetime the chat transcript started
# "Question Type" : Labels for question-type categories as labelled by VRS operators
# "READ_1_vs_2" : Binary values of 0 or 1.  Values of 999 used to identify missing data
# "READ_2_vs_3" : Binary values of 0 or 1.  Values of 999 used to identify missing data
# "PatronTextString" : The full text of strictly patron-supplied text drawn from transcript

In [None]:
# CONVERT, FILTER, AND SORT DATA AS NEEDED
# 'Start Date' is stored as milliseconds; convert it to pandas timestamps.
rawData['Start Date'] = pd.to_datetime(rawData['Start Date'], unit='ms')
# Keep only the samples flagged as suited for analysis (segCode == 0).
rawData = rawData[rawData['segCode'] == 0]
# Order chronologically. The preceding sort_index() only acts as a tie-breaker
# when the sort that follows is stable, so request a stable algorithm
# explicitly (the default quicksort gives no such guarantee).
rawData = rawData.sort_index().sort_values(by='Start Date', kind='mergesort')

In [None]:
# GENERATE PATRON-TEXT SECTIONS FOR ANALYSIS
def getPatronSections(df,breaks):
    """Add truncated versions of each patron text to a copy of ``df``.

    The 'PatronTextString' column is split on runs of whitespace and the
    first ``breaks[0]``, ``breaks[1]`` and ``breaks[2]`` tokens are re-joined
    with single spaces into the columns 'TRUNC_5', 'TRUNC_10' and 'TRUNC_20'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'PatronTextString' column of strings.
    breaks : sequence of int
        At least three token counts. The column names are fixed, so they
        only describe the counts when ``breaks`` is ``[5, 10, 20]``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the three extra columns; the input is untouched.

    Raises
    ------
    IndexError
        If ``breaks`` has fewer than three entries.
    """
    st = time.time()

    df = df.copy()
    # Raw string: "\s" inside a normal literal is an invalid escape sequence.
    corpus = df['PatronTextString'].str.split(pat=r"\s{1,}")

    columnBreaks = (
        ('TRUNC_5', breaks[0]),
        ('TRUNC_10', breaks[1]),
        ('TRUNC_20', breaks[2]),
    )
    for column, wordCount in columnBreaks:
        # Column-wise slice + join instead of one .loc write per row; rows
        # whose text is missing stay missing instead of raising.
        df[column] = corpus.str[:wordCount].str.join(" ")

    et = time.time() - st
    print('{:.2f} : Splitting Patron Lines'.format(et))
    return(df)

# Token counts used for the TRUNC_5 / TRUNC_10 / TRUNC_20 columns.
patronSegmentsOptions = [5, 10, 20]
rawData = getPatronSections(rawData, patronSegmentsOptions)

In [None]:
# DEFINE REGULAR EXPRESSION PATTERNS FOR 'TAG' MODELLING PARAMETER
# Each entry is [tag name, list of regular expressions]. A text receives the
# tag when any of its expressions matches (getManualTags searches them
# case-insensitively). Literal strings go through re.escape; hand-written
# expressions are raw strings so that sequences such as \b and \W reach the
# regex engine unchanged.
manualTags = [
    ['tagURL',[
        re.escape('amazon.com'),
        re.escape('newfirstsearch'),
        re.escape('galegroup'),
        re.escape('ingentaconnect.com'),
        re.escape('proquest.com'),
        re.escape('ncbi.nlm.nih.gov'),
        re.escape('sciencedirect.com'),
        re.escape('springer.com'),
        re.escape('tandfonline.com'),
        re.escape('webofknowledge'),
        re.escape('wiley.com'),
        re.escape('books.google'),
        re.escape('google.com'),

        re.escape('apps.lib.k-state.edu/databases'),

        re.escape('er.lib.ksu.edu'),
        re.escape('er.lib.k-state.edu'),

        re.escape('getit.lib.ksu.edu'),
        re.escape('getit.lib.k-state.edu'),

        re.escape('guides.lib.ksu.edu'),
        re.escape('guides.lib.k-state.edu'),

        re.escape('catalog.lib.ksu.edu'),
        re.escape('catalog2.lib.ksu.edu'),
        re.escape('catalog.lib.k-state.edu'),
        re.escape('catalog2.lib.k-state.edu'),

        re.escape('primo.hosted.exlibrisgroup.com'),
        re.escape('na02.alma.exlibrisgroup'),

        re.escape('searchit.lib.ksu.edu'),
        re.escape('searchit.lib.k-state.edu'),

        # NOTE(review): this pair used to list 'lib.k-state.edu' twice; the
        # first is now the ksu.edu form, mirroring every other host pair
        # above - confirm this was the intent.
        re.escape('lib.ksu.edu'),
        re.escape('lib.k-state.edu'),

        re.escape('doi.org'),

        re.escape('http'),
        re.escape('www.'),]
    ],

    ['tagPRINTING',[
        'color print',
        'colored print',
        'print in color',
        'print something in color',
        r'\Win color\W',
        'cat cash',
        'printer',
        # Was a non-raw string, where "\b" is a backspace character rather
        # than a word boundary, so this expression could never match.
        r'(?<!3D\s)\bprinting',
        'double.{1}sided',
        'catcash',
        'add money',]
    ],

    ['tagSCANNER',[
        'scanner',
        r'\Wscan\W',]
    ],

    ['tagHOURS',[
        'open 24/7',
        'what time',
        'the hours',
        r'opens{0,1}\W',
        'will be open',
        'summer hours',
        'library hours',]
    ],


    ['tagLIBMATHPHYS',[
        re.escape('Math/Physics Library'),
        re.escape('math and physics library'),
        re.escape('Math Physic library'),
        re.escape('math/physics library'),
        re.escape('maths/phys library'),
        re.escape('math & phys library'),
        re.escape('math phys library'),]
    ],

    ['tagLIBWEIGEL',[
        'weigel',
        'wiegel',]
    ],

    ['tagLIBVETMED',[
        'vet med',
        'vetmed',]
    ],

    ['tagLIBHALE',[
        'Hale Library',
        r'(?<!help\s)hale',]
    ],

    ['tagLIBSTACKS',[
        'Library Stacks',
        'the stacks',
        'in Stacks',]
    ],

    ['tagTEXTBOOKS',[
        'the reserve',
        'on reserve',
        'course reserve',
        'reserve textbook',
        'have a specific textbook',
        'have the textbook',
        'have textbook',
        'this textbook',
        'this text book',]
    ],

    ['tagQUIET',[
        'quite loud',
        'super loud',
        'really loud',
        'very loud',
        'stop talking',
        'talking on',
        'music loud',
        'loud',
        'talking very',
        'talking extremely',
        'talking loud',
        'quiet floor',
        'Quiet Zone',
        'quiet floors',
        'floor to be quiet',
        'whisper quietly',
        'be quiet',]
    ],


    ['tagLIBLOCATION',[
        'first floor',
        '1st floor',
        'second floor',
        '2nd floor',
        'third floor',
        '3rd floor',
        'fourth floor',
        '4th floor',  # was the typo '4t floor', which cannot match "4th floor"
        'fifth floor',
        '5th floor',
        'hemisphere room',
        'Harry Potter room',
        'the hemi',]
    ],

    ['tagARTICLES',[
        'peer.{,1}review',
        'journal article',
        'scholarly article',
        'scholarly journal',
        'peer reviewed',
        re.escape('peer-reviewed'),
        'peerreviewed',
        'scholarly',
        'articles',]
    ],

    ['tagEVIDENCEBASED',[
        'evidence.based',
        'kinesiology',]
    ],

    ['tagJUVENILE',[
        'juv lit section',
        'Juvenile Literature',
        re.escape("juv. lit"),
        "children'{0,1}s collection",
        "children'{0,1}s lit",
        "children'{0,1}s stor",
        re.escape("children's boooks"),
        # Double-quoted: the single-quoted original was terminated early by
        # the apostrophe and made the whole cell a syntax error.
        "(?<!Germany on English )children'{0,1}s book",
        re.escape("children's picture"),
        'picture book',]
    ],


    ['tagCURRICULUM',[
        'curriculum materials',
        'curriculum books',]
    ],


    ['tagKNOWNITEMARTICLE',[
        r'doi\W\s{,1}\S+',
        r'doi:{0,1}\s{0,1}\d\S+',
        'this article',
        r'this\s\w+\sarticle',
        'this paper',
        r'doi\.\S+',
        r'doi\.org\S+',]
    ],


    ['tagKNOWNITEMBOOK',[
        r'[a-z]{1,2}\d{2,4}\s{0,1}\.[a-z]\d{1,}',
        'this book',]
    ],



    ['tagREFERENCE',[
        r'articles{0,1}\sabout',
        r'books{0,1}\sabout',
        'subject',
        'topic',
        'a paper on',
        'help me find an{0,1}',]
    ],
]

In [None]:
# DEFINE FUNCTIONS NECESSARY FOR EXPERIMENTAL MODELLING PROCESSES
def getManualTags(df,manualTagsList,section):
    """Attach the list of matching manual tags to every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column named by ``section``.
    manualTagsList : list
        Entries of the form ``[tag name, [regex, ...]]``. A tag applies to a
        row when any of its expressions is found in the row's text
        (case-insensitive search).
    section : str
        Name of the text column to search.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with a 'manualTags' column; each cell is the sorted
        list of distinct tag names that matched (empty when none did, or
        when the text is not a string).
    """
    st = time.time()

    df = df.copy()

    # Compile every expression once rather than once per row and pattern.
    compiledTags = [
        (entry[0], [re.compile(pattern, flags=re.IGNORECASE) for pattern in entry[1]])
        for entry in manualTagsList
    ]

    tagsPerRow = []
    for text in df[section]:
        if not isinstance(text, str):
            # Missing text cannot match anything.
            tagsPerRow.append([])
            continue
        # any() stops at the first matching expression of a tag; the set
        # removes repeated tag names and sorted() fixes the order.
        matched = {
            label
            for label, patterns in compiledTags
            if any(regex.search(text) for regex in patterns)
        }
        tagsPerRow.append(sorted(matched))

    # Positional assignment of one list per row.
    df['manualTags'] = tagsPerRow

    et = time.time() - st
    print('{:.2f} : Getting Manual Tags'.format(et))
    return(df)

In [None]:
def getTextsTokens(corpus,model):
    """Tokenize every text of ``corpus`` and build a gensim Dictionary.

    Each text is passed through gensim's ``preprocess_string`` with two
    filters: lower-casing and ``strip_multiple_whitespaces``.

    Parameters
    ----------
    corpus : pandas.Series
        The texts to tokenize.
    model : any
        Not used by this function.

    Returns
    -------
    tuple
        ``(tokenizedTexts, gensim_dictionary)``: a Series of token lists
        indexed like ``corpus``, and the Dictionary built from those lists.
    """
    started = time.time()

    def lowercase(text):
        return text.lower()

    filters = [lowercase, strip_multiple_whitespaces]

    texts = corpus.copy()
    tokenLists = pd.Series(
        [preprocess_string(text, filters) for text in texts],
        index=texts.index,
    )

    vocabulary = Dictionary(tokenLists)

    elapsed = time.time() - started
    print('{:.2f} : Tokenizing Texts'.format(elapsed))
    return(tokenLists,vocabulary)

In [None]:
def getTFIDFlimited(df, section, truncate, controlVocab, model):
    """Restrict the texts of ``df[section]`` to a TF-IDF-ranked vocabulary.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column ``section``.
    section : str
        Name of the text column.
    truncate : tuple
        ``(enabled, min_df, top_n)``. When ``enabled`` is true, only words
        of three or more characters with document frequency >= ``min_df``
        are counted and the ``top_n`` terms with the highest mean TF-IDF are
        kept. Otherwise every word of one or more characters is kept.
    controlVocab : list of str
        Terms that are always added to the kept vocabulary.
    model : any
        Passed through to ``getTextsTokens``.

    Returns
    -------
    tuple
        ``(df, countDF, tfidfDF, tfidfList, remove, vocab, gendict)`` where
        ``df`` is a copy with 'filteredTexts' and 'tokenizedTexts' added,
        ``countDF``/``tfidfDF`` are the dense count and TF-IDF tables,
        ``tfidfList`` is the mean TF-IDF per term in descending order,
        ``remove`` lists the terms on which the kept and full vocabularies
        differ, ``vocab`` is the kept vocabulary as a set and ``gendict``
        is the gensim Dictionary of the filtered texts.
    """
    st = time.time()
    df = df.copy()
    texts = df.loc[:,section].str.lower()

    def featureNames(vectorizer):
        # scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
        # get_feature_names(); support whichever is installed.
        if hasattr(vectorizer, 'get_feature_names_out'):
            return list(vectorizer.get_feature_names_out())
        return list(vectorizer.get_feature_names())

    fullVocab = featureNames(CountVectorizer(token_pattern=r"(?u)\b\w{1,}\b").fit(texts))

    if truncate[0]:
        count = CountVectorizer(
            token_pattern=r"(?u)\b\w{3,}\b",
            min_df=truncate[1],
        )
    else:
        count = CountVectorizer(
            token_pattern=r"(?u)\b\w{1,}\b",
        )

    counts = count.fit_transform(texts)
    terms = featureNames(count)
    countDF = pd.DataFrame(counts.toarray(),
                           index=df.index,
                           columns=terms)

    tfidf = TfidfTransformer()
    tfidfDF = pd.DataFrame(tfidf.fit_transform(counts).toarray(),
                           index=df.index,
                           columns=terms)

    tfidfList = tfidfDF.mean().sort_values(ascending=False)

    if truncate[0]:
        vocab = list(tfidfList.index[:truncate[2]])
    else:
        vocab = list(tfidfList.index)
    # Symmetric difference between the kept terms and the full vocabulary.
    remove = list(set(vocab) ^ set(fullVocab))

    vocab = set(vocab + list(controlVocab))

    # Keep only in-vocabulary tokens, splitting on every non-word character.
    filtered = [
        " ".join(token for token in re.split(r"\W", text) if token in vocab)
        for text in texts
    ]
    newTexts = pd.Series(filtered, index=texts.index, dtype="object")

    tokenizedTexts,gendict = getTextsTokens(corpus=newTexts,model=model)

    df['filteredTexts'] = newTexts
    df['tokenizedTexts'] = tokenizedTexts

    et = time.time() - st
    print('{:.2f} : Truncating by TFIDF, maybe'.format(et))
    return(df,countDF,tfidfDF,tfidfList,remove,vocab,gendict)

In [None]:
def doc2vecModel(df,random,vecs,tagState):
    """Train a Doc2Vec model on ``df`` and infer one vector per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide, per row, 'tokenizedTexts' (list of tokens), 'Id' and,
        when ``tagState`` is true, 'manualTags' (list of tag names).
    random : int
        Seed passed to Doc2Vec.
    vecs : str
        "D2V_75" or "D2V_150"; selects a vector size of 75 or 150.
    tagState : bool
        When true each document is tagged with its Id plus its manual tags,
        otherwise with its Id only.

    Returns
    -------
    tuple
        ``(model, vectorDF)``: the trained model and a DataFrame of the
        inferred vectors, indexed like ``df``.

    Raises
    ------
    ValueError
        If ``vecs`` is not one of the supported names.
    """
    st = time.time()
    df = df.copy()

    vectorSizes = {"D2V_75": 75, "D2V_150": 150}
    if vecs not in vectorSizes:
        raise ValueError("vecs must be one of {}, got {!r}".format(sorted(vectorSizes), vecs))
    vector = vectorSizes[vecs]

    rand = random
    # The former `%set_env PYTHONHASHSEED=rand` stored the literal text
    # "rand". Setting the variable here cannot change the hash seed of the
    # interpreter that is already running; it is only inherited by child
    # processes. For reproducible hashing, set it before starting the kernel.
    os.environ['PYTHONHASHSEED'] = str(rand)

    documentsTrain = []
    for i in df.index:
        docTags = [str(df.loc[i,'Id'])]
        if tagState:
            docTags = docTags + list(df.loc[i,'manualTags'])
        documentsTrain.append(TaggedDocument(df.loc[i,'tokenizedTexts'], docTags))

    iterations = 100

    model = Doc2Vec(
        documentsTrain,
        dm=0,
        dbow_words=1,
        vector_size=vector,
        window=5,
        min_count=1,
        workers=1,
        seed=rand,
        # Was misspelt `epocs`, so the requested number of training epochs
        # was never applied.
        epochs=iterations,
        hs=1,
        negative=0
               )

    indic = []
    dat = []
    for i in df.index:
        indic.append(i)
        # NOTE(review): gensim 4 renamed `steps` to `epochs` for
        # infer_vector - confirm against the installed gensim version.
        dat.append(model.infer_vector((df.loc[i,'tokenizedTexts']), steps=iterations))

    vectorDF = pd.DataFrame(dat,index=indic)

    et = time.time() - st
    print('{:.2f} : Getting D2V Model / DF'.format(et))
    return(model, vectorDF)

In [None]:
def trainModel(df,reps,targetLabels,random):
    """Fit an MLP classifier on document vectors and score it on its own data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``targetLabels`` column (rows equal to 999 are
        excluded) and a 'manualTags' column of lists.
    reps : pandas.DataFrame
        Feature vectors indexed like ``df``.
    targetLabels : str
        Name of the label column.
    random : int
        ``random_state`` of the classifier.

    Returns
    -------
    tuple
        ``(neural_model, RocAucScore, RocAucScore_list, RocAucScore_NOTlist,
        AccScore, AccScore_list, AccScore_NOTlist, n_tagged, n_untagged,
        labelled_df, X)``. The ``_list`` scores cover rows with at least one
        manual tag, the ``_NOTlist`` scores rows with none. ``labelled_df``
        is the labelled part of ``df`` with the prediction columns added and
        ``X`` is the feature matrix used for fitting.
    """
    st = time.time()
    df = df.copy()
    rand = random
    # The former `%set_env PYTHONHASHSEED=rand` stored the literal text
    # "rand"; this value is only inherited by child processes.
    os.environ['PYTHONHASHSEED'] = str(rand)

    labels = df.loc[df[targetLabels]!=999,targetLabels].astype(int).copy()

    X = reps.loc[labels.index,:]
    y = labels

    neural_model = MLPClassifier(hidden_layer_sizes=(10,),max_iter=100,random_state=rand)
    neural_model.fit(X,y)
    yprob = neural_model.predict_proba(X)[:,1]
    yprob_pred = neural_model.predict(X)
    RocAucScore = roc_auc_score(y_true=y,y_score=yprob)
    AccScore = accuracy_score(y_true=y,y_pred=yprob_pred)

    # Split the labelled rows by whether they carry any manual tag.
    listIndex = []
    notIndex = []
    for i in labels.index:
        if len(df.loc[i,'manualTags']) > 0:
            listIndex.append(int(i))
        else:
            notIndex.append(int(i))

    def scoreSubset(rowIndex, probColumn, predColumn):
        # Uses the `reps` argument; the previous code read a notebook-level
        # variable named `representation` here instead.
        subsetX = reps.loc[rowIndex,:]
        subsetProb = neural_model.predict_proba(subsetX)[:,1]
        subsetPred = neural_model.predict(subsetX)
        auc = roc_auc_score(y_true=labels.loc[rowIndex],y_score=subsetProb)
        acc = accuracy_score(y_true=labels.loc[rowIndex],y_pred=subsetPred)
        df.loc[rowIndex,probColumn] = subsetProb
        df.loc[rowIndex,predColumn] = subsetPred
        return auc, acc

    RocAucScore_list, AccScore_list = scoreSubset(listIndex, 'PredictProbList', 'PredictPredList')
    RocAucScore_NOTlist, AccScore_NOTlist = scoreSubset(notIndex, 'PredictProbNOTList', 'PredictPredNOTList')

    df.loc[labels.index,'PredictProb'] = yprob
    df.loc[labels.index,'PredictPred'] = yprob_pred

    et = time.time() - st
    print('{:.2f} : Running MLP Fit and Eval'.format(et))
    return(neural_model,RocAucScore,RocAucScore_list,RocAucScore_NOTlist,AccScore,AccScore_list,AccScore_NOTlist,len(listIndex),len(notIndex),df.loc[labels.index,:],X)

In [None]:
def testModel(df,targetLabels,d2vModel,dictionary,neural_model):
    """Score a fitted classifier on hold-out rows.

    A vector is inferred with ``d2vModel`` for every labelled row of ``df``
    (rows whose ``targetLabels`` value is 999 are excluded) and classified
    with ``neural_model``. ROC-AUC and accuracy are computed for all
    labelled rows, for rows with at least one manual tag and for rows with
    none. ``dictionary`` is not used.

    Returns
    -------
    tuple
        ``(RocAucScore, RocAucScore_list, RocAucScore_NOTlist, AccScore,
        AccScore_list, AccScore_NOTlist, n_tagged, n_untagged, labelled_df,
        X)`` where ``labelled_df`` carries the prediction columns and ``X``
        holds the inferred vectors.
    """
    started = time.time()

    df = df.copy()

    labels = df.loc[df[targetLabels]!=999,targetLabels].astype(int).copy()
    df_test = df.loc[labels.index,].copy()

    # One inferred document vector per labelled row, in row order.
    representation = pd.DataFrame(
        [d2vModel.infer_vector((df_test.loc[row,'tokenizedTexts']), steps=100) for row in df_test.index],
        index=list(df_test.index),
    )

    X = representation
    yTest_prob = neural_model.predict_proba(X)[:,1]
    yTest_pred = neural_model.predict(X)
    RocAucScore = roc_auc_score(y_true=labels,y_score=yTest_prob)
    AccScore = accuracy_score(y_true=labels,y_pred=yTest_pred)

    # Partition the labelled rows by presence of manual tags.
    labelledRows = df.loc[labels.index,].index
    listIndex = [int(row) for row in labelledRows if len(df.loc[row,'manualTags']) > 0]
    notIndex = [int(row) for row in labelledRows if not len(df.loc[row,'manualTags']) > 0]

    def evaluate(rows, probColumn, predColumn):
        features = representation.loc[rows,:]
        probabilities = neural_model.predict_proba(features)[:,1]
        predictions = neural_model.predict(features)
        auc = roc_auc_score(y_true=labels.loc[rows],y_score=probabilities)
        acc = accuracy_score(y_true=labels.loc[rows],y_pred=predictions)
        df.loc[rows,probColumn] = probabilities
        df.loc[rows,predColumn] = predictions
        return auc, acc

    RocAucScore_list, AccScore_list = evaluate(listIndex, 'PredictProbList', 'PredictPredList')
    RocAucScore_NOTlist, AccScore_NOTlist = evaluate(notIndex, 'PredictProbNOTList', 'PredictPredNOTList')

    df.loc[labels.index,'PredictProb'] = yTest_prob
    df.loc[labels.index,'PredictPred'] = yTest_pred

    elapsed = time.time() - started
    print('{:.2f} : Testing Model with Holdout Data'.format(elapsed))
    return(RocAucScore,RocAucScore_list,RocAucScore_NOTlist,AccScore,AccScore_list,AccScore_NOTlist,len(listIndex),len(notIndex),df.loc[labels.index,:],X)

In [None]:
def prepareTestingData(df,vocab,section):
    """Tokenize hold-out texts, keeping only tokens found in ``vocab``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column ``section``.
    vocab : collection of str
        Tokens to keep (membership is tested with ``in``).
    section : str
        Name of the text column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with a 'DocTags' column of empty lists and a
        'tokenizedTexts' column holding, per row, the lower-cased tokens
        (split on every non-word character) that occur in ``vocab``.
        Rows whose text is not a string get an empty token list.
    """
    st = time.time()

    df = df.copy()
    df['DocTags'] = [[] for _ in range(df.shape[0])]

    # Raw string: "\W" inside a normal literal is an invalid escape sequence.
    splitter = re.compile(r"\W")

    testTokens = []
    for text in df[section]:
        if isinstance(text, str):
            testTokens.append([token for token in splitter.split(text.lower()) if token in vocab])
        else:
            testTokens.append([])

    # One list per row, assigned positionally.
    df['tokenizedTexts'] = testTokens

    et = time.time() - st
    print('{:.2f} : Preparing Test Holdout Data'.format(et))
    return(df)

In [None]:
# DEFINE FUNCTIONS NECESSARY FOR EVALUATION OF MODEL PERFORMANCE ON DATA SUBSETS
# Values of the "Question Type" column for which focused scores are computed.
qtypes = [
    'Reference', 'Reserves', 'Technical', 'Circulation', 'Misc',
    'Building', 'Directional', 'KREx', 'ResearchConsultation',
    'NewPrairiePress', 'KAPI', 'Copyright', 'Unknown',
]

# Manual tag names for which focused scores are computed.
tag_labels = ["tag" + suffix for suffix in (
    "URL", "PRINTING", "SCANNER", "HOURS",
    "LIBMATHPHYS", "LIBWEIGEL", "LIBVETMED", "LIBHALE", "LIBSTACKS",
    "TEXTBOOKS", "QUIET", "LIBLOCATION", "ARTICLES", "EVIDENCEBASED",
    "JUVENILE", "CURRICULUM", "KNOWNITEMARTICLE", "KNOWNITEMBOOK",
    "REFERENCE",
)]

def ROCAUC(df, col, labelledsection):
    """Score the stored predictions on the rows where ``df[col] == 1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``col``, ``labelledsection``, 'PredictProb' and
        'PredictPred'.
    col : str
        Marker column; rows with value 1 form the subset.
    labelledsection : str
        Column holding the true labels.

    Returns
    -------
    tuple
        ``(rocauc, acc)``. ``rocauc`` is None when the subset contains a
        single distinct label value; both are None when scoring raises
        (for example on an empty subset).
    """
    subset = df[col]==1
    # Read the label column named by the argument; the previous code used a
    # notebook-level variable `labelledSection` (capital S) instead.
    ytrue = df.loc[subset,labelledsection]
    yprob = df.loc[subset,'PredictProb']
    ypred = df.loc[subset,'PredictPred']

    singleClass = len(ytrue.unique()) == 1
    try:
        rocauc = None if singleClass else roc_auc_score(y_true=ytrue,y_score=yprob)
        acc = accuracy_score(y_true=ytrue,y_pred=ypred)
    except Exception:
        # Best effort: a subset that cannot be scored yields no scores.
        # `Exception` (not a bare except) lets KeyboardInterrupt propagate.
        rocauc = None
        acc = None
    return(rocauc, acc)


def getFocusedROCAUC(df_train,df_test,qtypes,tag_labels,labelledsection):
    """Compute ROC-AUC / accuracy per question type and per manual tag.

    Both frames receive one 0/1 marker column per entry of ``qtypes`` (set
    where "Question Type" equals the entry) and per entry of ``tag_labels``
    (set where the entry is in the row's 'manualTags'). ``ROCAUC`` is then
    evaluated on each marker column.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames with "Question Type", 'manualTags', ``labelledsection`` and
        the prediction columns read by ``ROCAUC``.
    qtypes : list of str
        Question-type names.
    tag_labels : list of str
        Manual tag names.
    labelledsection : str
        Column holding the true labels.

    Returns
    -------
    tuple
        ``(df_train, df_test, <test scores>, <train scores>)``. Each score
        is a ``(rocauc, acc)`` pair. Within each group the pairs follow
        ``qtypes + tag_labels`` in reverse order (last tag first, first
        question type last), which is the order the caller unpacks.
    """
    st = time.time()

    markerColumns = list(qtypes) + list(tag_labels)

    def addMarkers(df):
        df = df.copy()
        markers = pd.DataFrame(0, index=df.index, columns=markerColumns)
        # Same index-on-index outer merge as before, so row order is unchanged.
        df = pd.merge(df, markers, left_index=True, right_index=True, how="outer")

        for qtype in qtypes:
            df.loc[df["Question Type"]==qtype,qtype] = 1

        for i in df.index:
            for tag in df.loc[i,"manualTags"]:
                df.loc[i,tag] = 1
        return df

    df_train = addMarkers(df_train)
    df_test = addMarkers(df_test)

    trainScores = [
        ROCAUC(df = df_train, col = column, labelledsection = labelledsection)
        for column in markerColumns
    ]
    testScores = [
        ROCAUC(df = df_test, col = column, labelledsection = labelledsection)
        for column in markerColumns
    ]

    et = time.time() - st
    print('{:.2f} : Getting Focused ROC-AUC Scores'.format(et))
    return(df_train, df_test, *reversed(testScores), *reversed(trainScores))

In [None]:
# ESTABLISH EXPERIMENTAL DESIGN STRUCTURE AND SPECIFY MODELLING PARAMETERS



# RAND: seeds 0..19
randomOptions = list(np.arange(20))

# SPLIT: number of trailing rows held out for testing
testingSplits = [2000]

# TAG: whether manual tags are added as document tags
manualTagOptions = [True, False]

# DICT: (truncate vocabulary?, min_df, number of top TF-IDF terms kept)
dictOptions = [(False, 1, 300000), (True, 2, 3000)]

# TRUNC: truncated patron-text column to model
patronSectionOptions = ['TRUNC_10', 'TRUNC_20']

# READ: label column to predict
labelledSectionOptions = ['READ_1_vs_2', 'READ_2_vs_3']

# D2V: Doc2Vec configuration name
modelOptions = ['D2V_75', 'D2V_150']


# Factors of the full design (testingSplits is not part of this list).
optionsList = [
    randomOptions, manualTagOptions, dictOptions,
    patronSectionOptions, labelledSectionOptions, modelOptions,
]

optionsLen = list(map(len, optionsList))

# Number of parameter combinations across the listed factors.
testtotal = np.prod(optionsLen)
iterationcounter = 0

parameterList = []

In [None]:
# EXECUTE STEP-WISE CONSTRUCTION AND EVALUATION OF EVERY MODEL
for z in tqdm.tqdm(randomOptions):
    randomSeed=z

    for s in testingSplits:
        split = s
        testsplit = rawData.shape[0]-split
        trainData = rawData[:testsplit]
        testData = rawData[testsplit:]        
        trainShape = trainData.shape[0]
        testShape = testData.shape[0]

        for e in patronSectionOptions:
            patronSection=e 
            for f in manualTagOptions:
                getTags = f
                trainDataTagged = getManualTags(df=trainData,
                                                manualTagsList=manualTags,
                                                section=patronSection,
                                               )
                testDataTagged = getManualTags(df=testData,
                                               manualTagsList=manualTags,
                                               section=patronSection,
                                              )

                for g in labelledSectionOptions:
                    labelledSection=g
                    
                    for h in modelOptions:
                        model=h              

                        for m in dictOptions:
                            truncate=m 
                            start_time = time.time()
                            trainDataTRUNC,countDF,tfidfDF,tfidfList,remove,vocab,gendict = getTFIDFlimited(
                                df=trainDataTagged,
                                truncate=truncate,
                                section=patronSection,
                                controlVocab=customVocab,
                                model=model,
                            )
                            d2vModel, representation = doc2vecModel(
                                df=trainDataTRUNC,
                                vecs=h,
                                random=randomSeed,
                                tagState=getTags
                            )
                            MLPmodel,trainAUC,trainAUClist,trainAUCNOTlist,trainAccScore,trainAccScore_list,trainAccScore_NOTlist,trainLlen,trainNLlen,trainDataFinal,trainFinalRep = trainModel(
                                df=trainDataTRUNC,
                                reps=representation,
                                targetLabels=labelledSection,
                                random=randomSeed,
                            )
                            testDataPrepped = prepareTestingData(
                                df=testDataTagged,
                                vocab=vocab,
                                section=patronSection,
                            )
                            testAUC,testAUClist,testAUCNOTlist,testAccScore,testAccScore_list,testAccScore_NOTlist,testLlen,testNLlen,testDataFinal,testFinalRep = testModel(
                                df=testDataPrepped,
                                targetLabels=labelledSection,
                                d2vModel=d2vModel,
                                dictionary=gendict,
                                neural_model=MLPmodel,
                            )            
                            trainDataFinal, testDataFinal, ts_tagREFERENCE, ts_tagKNOWNITEMBOOK, ts_tagKNOWNITEMARTICLE, ts_tagCURRICULUM, ts_tagJUVENILE, ts_tagEVIDENCEBASED, ts_tagARTICLES, ts_tagLIBLOCATION, ts_tagQUIET, ts_tagTEXTBOOKS, ts_tagLIBSTACKS, ts_tagLIBHALE, ts_tagLIBVETMED, ts_tagLIBWEIGEL, ts_tagLIBMATHPHYS, ts_tagHOURS, ts_tagSCANNER, ts_tagPRINTING, ts_tagURL, ts_Unknown, ts_Copyright, ts_KAPI, ts_NewPrairiePress, ts_ResearchConsultation, ts_KREx, ts_Directional, ts_Building, ts_Misc, ts_Circulation, ts_Technical, ts_Reserves, ts_Reference, tr_tagREFERENCE, tr_tagKNOWNITEMBOOK, tr_tagKNOWNITEMARTICLE, tr_tagCURRICULUM, tr_tagJUVENILE, tr_tagEVIDENCEBASED, tr_tagARTICLES, tr_tagLIBLOCATION, tr_tagQUIET, tr_tagTEXTBOOKS, tr_tagLIBSTACKS, tr_tagLIBHALE, tr_tagLIBVETMED, tr_tagLIBWEIGEL, tr_tagLIBMATHPHYS, tr_tagHOURS, tr_tagSCANNER, tr_tagPRINTING, tr_tagURL, tr_Unknown, tr_Copyright, tr_KAPI, tr_NewPrairiePress, tr_ResearchConsultation, tr_KREx, tr_Directional, tr_Building, tr_Misc, tr_Circulation, tr_Technical, tr_Reserves, tr_Reference = getFocusedROCAUC(
                                df_train = trainDataFinal,
                                df_test = testDataFinal,
                                qtypes = qtypes,
                                tag_labels = tag_labels,
                                labelledsection = labelledSection,
                            )
                            end_time = time.time()
                            total_time = end_time-start_time
                            # Flat record of this iteration's settings and metrics. The values are appended
                            # to parameterList as a plain list and the keys later become the DataFrame
                            # columns, so the insertion order of this dict defines the column order.
                            modelParameters = {
                                # Settings of the current iteration.
                                'TRUNC':patronSection,
                                'TAG':getTags,
                                'DICT':str(truncate),
                                'D2V':model,
                                'READ':labelledSection,

                                # Overall ROC AUC.
                                'AUC_train':trainAUC,
                                'AUC_test':testAUC,

                                'AUC_train_TAG':trainAUClist,
                                'AUC_test_TAG':testAUClist,
                                'AUC_train_Not_TAG':trainAUCNOTlist,
                                'AUC_test_Not_TAG':testAUCNOTlist,

                                # Overall accuracy.
                                'ACC_train':trainAccScore,
                                'ACC_test':testAccScore,

                                # Train keys carry train values and test keys carry test values,
                                # mirroring the AUC_*_TAG / AUC_*_Not_TAG entries above.
                                'ACC_train_TAG':trainAccScore_list,
                                'ACC_test_TAG':testAccScore_list,
                                'ACC_train_Not_TAG':trainAccScore_NOTlist,
                                'ACC_test_Not_TAG':testAccScore_NOTlist,

                                'AUC_train_TAG_LEN':trainLlen,
                                'AUC_train_TAG_N_LEN':trainNLlen,
                                'AUC_test_TAG_LEN':testLlen,
                                'AUC_test_TAG_N_LEN':testNLlen,

                                # Per-category results: element [0] of each tr_/ts_ value is stored
                                # under an AUC_* key, element [1] under the matching ACC_* key.
                                "AUC_tr_Reference": tr_Reference[0],
                                "AUC_tr_Reserves": tr_Reserves[0],
                                "AUC_tr_Technical": tr_Technical[0],
                                "AUC_tr_Circulation": tr_Circulation[0],
                                "AUC_tr_Misc": tr_Misc[0],
                                "AUC_tr_Building": tr_Building[0],
                                "AUC_tr_Directional": tr_Directional[0],
                                "AUC_tr_KREx": tr_KREx[0],
                                "AUC_tr_ResearchConsultation": tr_ResearchConsultation[0],
                                "AUC_tr_NewPrairiePress": tr_NewPrairiePress[0],
                                "AUC_tr_KAPI": tr_KAPI[0],
                                "AUC_tr_Copyright": tr_Copyright[0],
                                "AUC_tr_Unknown": tr_Unknown[0],
                                "AUC_tr_tagURL": tr_tagURL[0],
                                "AUC_tr_tagPRINTING": tr_tagPRINTING[0],
                                "AUC_tr_tagSCANNER": tr_tagSCANNER[0],
                                "AUC_tr_tagHOURS": tr_tagHOURS[0],
                                "AUC_tr_tagLIBMATHPHYS": tr_tagLIBMATHPHYS[0],
                                "AUC_tr_tagLIBWEIGEL": tr_tagLIBWEIGEL[0],
                                "AUC_tr_tagLIBVETMED": tr_tagLIBVETMED[0],
                                "AUC_tr_tagLIBHALE": tr_tagLIBHALE[0],
                                "AUC_tr_tagLIBSTACKS": tr_tagLIBSTACKS[0],
                                "AUC_tr_tagTEXTBOOKS": tr_tagTEXTBOOKS[0],
                                "AUC_tr_tagQUIET": tr_tagQUIET[0],
                                "AUC_tr_tagLIBLOCATION": tr_tagLIBLOCATION[0],
                                "AUC_tr_tagARTICLES": tr_tagARTICLES[0],
                                "AUC_tr_tagEVIDENCEBASED": tr_tagEVIDENCEBASED[0],
                                "AUC_tr_tagJUVENILE": tr_tagJUVENILE[0],
                                "AUC_tr_tagCURRICULUM": tr_tagCURRICULUM[0],
                                "AUC_tr_tagKNOWNITEMARTICLE": tr_tagKNOWNITEMARTICLE[0],
                                "AUC_tr_tagKNOWNITEMBOOK": tr_tagKNOWNITEMBOOK[0],
                                "AUC_tr_tagREFERENCE": tr_tagREFERENCE[0],
                                "AUC_ts_Reference": ts_Reference[0],
                                "AUC_ts_Reserves": ts_Reserves[0],
                                "AUC_ts_Technical": ts_Technical[0],
                                "AUC_ts_Circulation": ts_Circulation[0],
                                "AUC_ts_Misc": ts_Misc[0],
                                "AUC_ts_Building": ts_Building[0],
                                "AUC_ts_Directional": ts_Directional[0],
                                "AUC_ts_KREx": ts_KREx[0],
                                "AUC_ts_ResearchConsultation": ts_ResearchConsultation[0],
                                "AUC_ts_NewPrairiePress": ts_NewPrairiePress[0],
                                "AUC_ts_KAPI": ts_KAPI[0],
                                "AUC_ts_Copyright": ts_Copyright[0],
                                "AUC_ts_Unknown": ts_Unknown[0],
                                "AUC_ts_tagURL": ts_tagURL[0],
                                "AUC_ts_tagPRINTING": ts_tagPRINTING[0],
                                "AUC_ts_tagSCANNER": ts_tagSCANNER[0],
                                "AUC_ts_tagHOURS": ts_tagHOURS[0],
                                "AUC_ts_tagLIBMATHPHYS": ts_tagLIBMATHPHYS[0],
                                "AUC_ts_tagLIBWEIGEL": ts_tagLIBWEIGEL[0],
                                "AUC_ts_tagLIBVETMED": ts_tagLIBVETMED[0],
                                "AUC_ts_tagLIBHALE": ts_tagLIBHALE[0],
                                "AUC_ts_tagLIBSTACKS": ts_tagLIBSTACKS[0],
                                "AUC_ts_tagTEXTBOOKS": ts_tagTEXTBOOKS[0],
                                "AUC_ts_tagQUIET": ts_tagQUIET[0],
                                "AUC_ts_tagLIBLOCATION": ts_tagLIBLOCATION[0],
                                "AUC_ts_tagARTICLES": ts_tagARTICLES[0],
                                "AUC_ts_tagEVIDENCEBASED": ts_tagEVIDENCEBASED[0],
                                "AUC_ts_tagJUVENILE": ts_tagJUVENILE[0],
                                "AUC_ts_tagCURRICULUM": ts_tagCURRICULUM[0],
                                "AUC_ts_tagKNOWNITEMARTICLE": ts_tagKNOWNITEMARTICLE[0],
                                "AUC_ts_tagKNOWNITEMBOOK": ts_tagKNOWNITEMBOOK[0],
                                "AUC_ts_tagREFERENCE": ts_tagREFERENCE[0],

                                "ACC_tr_Reference": tr_Reference[1],
                                "ACC_tr_Reserves": tr_Reserves[1],
                                "ACC_tr_Technical": tr_Technical[1],
                                "ACC_tr_Circulation": tr_Circulation[1],
                                "ACC_tr_Misc": tr_Misc[1],
                                "ACC_tr_Building": tr_Building[1],
                                "ACC_tr_Directional": tr_Directional[1],
                                "ACC_tr_KREx": tr_KREx[1],
                                "ACC_tr_ResearchConsultation": tr_ResearchConsultation[1],
                                "ACC_tr_NewPrairiePress": tr_NewPrairiePress[1],
                                "ACC_tr_KAPI": tr_KAPI[1],
                                "ACC_tr_Copyright": tr_Copyright[1],
                                "ACC_tr_Unknown": tr_Unknown[1],
                                "ACC_tr_tagURL": tr_tagURL[1],
                                "ACC_tr_tagPRINTING": tr_tagPRINTING[1],
                                "ACC_tr_tagSCANNER": tr_tagSCANNER[1],
                                "ACC_tr_tagHOURS": tr_tagHOURS[1],
                                "ACC_tr_tagLIBMATHPHYS": tr_tagLIBMATHPHYS[1],
                                "ACC_tr_tagLIBWEIGEL": tr_tagLIBWEIGEL[1],
                                "ACC_tr_tagLIBVETMED": tr_tagLIBVETMED[1],
                                "ACC_tr_tagLIBHALE": tr_tagLIBHALE[1],
                                "ACC_tr_tagLIBSTACKS": tr_tagLIBSTACKS[1],
                                "ACC_tr_tagTEXTBOOKS": tr_tagTEXTBOOKS[1],
                                "ACC_tr_tagQUIET": tr_tagQUIET[1],
                                "ACC_tr_tagLIBLOCATION": tr_tagLIBLOCATION[1],
                                "ACC_tr_tagARTICLES": tr_tagARTICLES[1],
                                "ACC_tr_tagEVIDENCEBASED": tr_tagEVIDENCEBASED[1],
                                "ACC_tr_tagJUVENILE": tr_tagJUVENILE[1],
                                "ACC_tr_tagCURRICULUM": tr_tagCURRICULUM[1],
                                "ACC_tr_tagKNOWNITEMARTICLE": tr_tagKNOWNITEMARTICLE[1],
                                "ACC_tr_tagKNOWNITEMBOOK": tr_tagKNOWNITEMBOOK[1],
                                "ACC_tr_tagREFERENCE": tr_tagREFERENCE[1],
                                "ACC_ts_Reference": ts_Reference[1],
                                "ACC_ts_Reserves": ts_Reserves[1],
                                "ACC_ts_Technical": ts_Technical[1],
                                "ACC_ts_Circulation": ts_Circulation[1],
                                "ACC_ts_Misc": ts_Misc[1],
                                "ACC_ts_Building": ts_Building[1],
                                "ACC_ts_Directional": ts_Directional[1],
                                "ACC_ts_KREx": ts_KREx[1],
                                "ACC_ts_ResearchConsultation": ts_ResearchConsultation[1],
                                "ACC_ts_NewPrairiePress": ts_NewPrairiePress[1],
                                "ACC_ts_KAPI": ts_KAPI[1],
                                "ACC_ts_Copyright": ts_Copyright[1],
                                "ACC_ts_Unknown": ts_Unknown[1],
                                "ACC_ts_tagURL": ts_tagURL[1],
                                "ACC_ts_tagPRINTING": ts_tagPRINTING[1],
                                "ACC_ts_tagSCANNER": ts_tagSCANNER[1],
                                "ACC_ts_tagHOURS": ts_tagHOURS[1],
                                "ACC_ts_tagLIBMATHPHYS": ts_tagLIBMATHPHYS[1],
                                "ACC_ts_tagLIBWEIGEL": ts_tagLIBWEIGEL[1],
                                "ACC_ts_tagLIBVETMED": ts_tagLIBVETMED[1],
                                "ACC_ts_tagLIBHALE": ts_tagLIBHALE[1],
                                "ACC_ts_tagLIBSTACKS": ts_tagLIBSTACKS[1],
                                "ACC_ts_tagTEXTBOOKS": ts_tagTEXTBOOKS[1],
                                "ACC_ts_tagQUIET": ts_tagQUIET[1],
                                "ACC_ts_tagLIBLOCATION": ts_tagLIBLOCATION[1],
                                "ACC_ts_tagARTICLES": ts_tagARTICLES[1],
                                "ACC_ts_tagEVIDENCEBASED": ts_tagEVIDENCEBASED[1],
                                "ACC_ts_tagJUVENILE": ts_tagJUVENILE[1],
                                "ACC_ts_tagCURRICULUM": ts_tagCURRICULUM[1],
                                "ACC_ts_tagKNOWNITEMARTICLE": ts_tagKNOWNITEMARTICLE[1],
                                "ACC_ts_tagKNOWNITEMBOOK": ts_tagKNOWNITEMBOOK[1],
                                "ACC_ts_tagREFERENCE": ts_tagREFERENCE[1],

                                # Run bookkeeping: wall-clock time, seed, vocabulary and dataset sizes.
                                'CycleTime':total_time,
                                'RAND':randomSeed,
                                'Custom Vocab Len':len(customVocab),
                                'Full Vocab Len':len(vocab),
                                'Testing Split':testsplit,
                                'Training Data Full':trainShape,
                                'Testing Data Full':testShape,
                                'Training Data Final':trainDataFinal.shape[0],
                                'Testing Data Final':testDataFinal.shape[0],
                            }

                            parameterList.append(list(modelParameters.values()))
                            print(modelParameters.values())
                            iterationcounter += 1
                            print("{} out of {} complete".format(iterationcounter,testtotal))

# One row per completed iteration; column names come from the keys of the last modelParameters dict.
parameterDataFrame = pd.DataFrame(data=parameterList, columns=[*modelParameters])

In [None]:
# SAVE AND STORE RESULTS LOCALLY
# Create the output folder first: to_csv raises if the target directory is missing,
# which would otherwise lose the results that were just computed.
import os
os.makedirs('dataruns', exist_ok=True)

# All three files share the session timestamp so they can be reloaded together.
parameterDataFrame.to_csv('dataruns/{}_Run.csv'.format(today))
trainDataFinal.to_csv('dataruns/{}_trainDataFinal.csv'.format(today))
testDataFinal.to_csv('dataruns/{}_testDataFinal.csv'.format(today))

In [None]:
# REOPEN DATA IN NEW DATAFRAME OBJECTS
# `timestamp` selects which saved run to analyse; by default it is the current session's run.
timestamp = today

# The save step writes the results table as "<timestamp>_Run.csv", whereas this cell asked only for
# "<timestamp>_preserveRun.csv". Prefer a preserved copy when one exists, otherwise fall back to the
# file the save step actually produced so the notebook runs top to bottom.
try:
    preservedDataFrame = pd.read_csv('dataruns/{}_preserveRun.csv'.format(timestamp),index_col=0)
except FileNotFoundError:
    preservedDataFrame = pd.read_csv('dataruns/{}_Run.csv'.format(timestamp),index_col=0)
trainDataFinal = pd.read_csv('dataruns/{}_trainDataFinal.csv'.format(timestamp),index_col=0)
testDataFinal = pd.read_csv('dataruns/{}_testDataFinal.csv'.format(timestamp),index_col=0)

In [None]:
# WRANGLE DATA INTO CONVENIENT FORMS AND PRODUCE DESCRIPTIVE STATISTICS

# Category columns of trainDataFinal / testDataFinal. Their column sums are used below as the
# per-category sample counts ("n"), and their names form the per-category score column names.
catcolumns = [
    'Reference',
    'Reserves',
    'Technical',
    'Circulation',
    'Misc',
    'Building',
    'Directional',
    'KREx',
    'ResearchConsultation',
    'NewPrairiePress',
    'KAPI',
    'Copyright',
    'Unknown',
    'tagURL',
    'tagPRINTING',
    'tagSCANNER',
    'tagHOURS',
    'tagLIBMATHPHYS',
    'tagLIBWEIGEL',
    'tagLIBVETMED',
    'tagLIBHALE',
    'tagLIBSTACKS',
    'tagTEXTBOOKS',
    'tagQUIET',
    'tagLIBLOCATION',
    'tagARTICLES',
    'tagEVIDENCEBASED',
    'tagJUVENILE',
    'tagCURRICULUM',
    'tagKNOWNITEMARTICLE',
    'tagKNOWNITEMBOOK',
    'tagREFERENCE'
]

# Suffixes of the whole-dataset and tagged / not-tagged score columns.
overallSuffixes = ['train', 'test', 'train_TAG', 'train_Not_TAG', 'test_TAG', 'test_Not_TAG']

# Score columns to summarise, derived from catcolumns so the two lists cannot drift apart.
# Order: for AUC and then ACC, the six overall columns, then one "tr_" column per category,
# then one "ts_" column per category.
newc = [
    '{}_{}'.format(metric, suffix)
    for metric in ('AUC', 'ACC')
    for suffix in (
        overallSuffixes
        + ['tr_' + category for category in catcolumns]
        + ['ts_' + category for category in catcolumns]
    )
]

# Slice the score columns once and reuse the slice for every summary statistic.
scoreColumns = preservedDataFrame.loc[:, newc]

statDF = pd.DataFrame(index=newc)

statDF["model_count"] = scoreColumns.count()
statDF["min"] = scoreColumns.min()
statDF["max"] = scoreColumns.max()
statDF["mean"] = scoreColumns.mean()
statDF["std"] = scoreColumns.std()

# t statistic of each column's mean score against the reference value 0.5.
statDF["t-score"] = (statDF["mean"] - 0.5) / (statDF["std"] / np.sqrt(statDF["model_count"]))

# Upper-tail p-value. The survival function is used instead of 1 - cdf because it stays accurate
# for very small p-values, where 1 - cdf rounds to exactly 0.
# NOTE(review): the degrees of freedom are model_count - 6 rather than the usual n - 1 of a
# one-sample t-test; the reason for the extra 5 is not visible here - confirm the intent.
statDF["p-value"] = stats.t.sf(statDF["t-score"], df=statDF["model_count"]-1-5)

# Per-category sample counts, re-labelled so they align with the AUC_tr_* / AUC_ts_* rows of statDF.
train_category_counts = trainDataFinal.loc[:, catcolumns].sum().add_prefix("AUC_tr_").to_frame("n")
test_category_counts = testDataFinal.loc[:, catcolumns].sum().add_prefix("AUC_ts_").to_frame("n")

counts = pd.concat([train_category_counts,test_category_counts])

# Attach the counts to the statistics; rows without a matching count get NaN in "n".
statDF = pd.merge(statDF,counts, left_index=True,right_index=True, how="outer")

statDF.to_csv("dataruns/{}_statblock.csv".format(timestamp))

In [None]:
# SET DEFAULT SEABORN PLOTTING SETTINGS
sns.set()

# GENERATE PLOTS FOR ROC AUC PERFORMANCE
# FOCUS: HIGH LEVEL SUBSETS OF SAMPLES
# Each entry: [score column, legend name, sample count shown in the legend, KDE line width].
traintest = [
    ["AUC_train", "Training Data", 10162, 4],
    ["AUC_test", "Testing Data", 1753, 4],
]

plt.figure(figsize=(15,10))

# Draw one shaded density curve per subset onto the same axes.
for scoreColumn, subsetName, sampleCount, lineWidth in traintest:
    ax = sns.distplot(
        a=preservedDataFrame[scoreColumn],
        hist=False,
        norm_hist=True,
        label="{} ({} Samples)".format(subsetName, sampleCount),
        kde_kws={"shade": True, "lw": lineWidth},
    )

# AUC scores lie between 0 and 1, so pin the x axis to that range.
ax.set_xlim(0,1)
ax.set_xlabel("AUC Score", fontsize="x-large")
ax.set_ylabel("Density", fontsize="x-large")

plt.title(
    "Distribution of ROC AUC Scores for full Training and Testing Datasets",
    fontdict={"fontsize": "x-large"},
)

# Dashed reference line at an AUC of 0.5.
plt.axvline(x=0.5, ls="--", c="gray", label="0.5 Cutoff")
plt.legend(fontsize="large")
plt.savefig("plots/traintest_{}.png".format(timestamp))
plt.show()

In [None]:
# GENERATE PLOTS FOR ROC AUC PERFORMANCE
# FOCUS: SUBSETS OF SAMPLES SEGMENTED BY SELECTED QUESTION TYPES (TRAINING DATA)
# Each entry: [score column, legend name, sample count shown in the legend, KDE line width].
# Named qtypePlotSpecs rather than qtypes so that this cell does not overwrite the `qtypes`
# object that the model-evaluation loop passes to getFocusedROCAUC.
qtypePlotSpecs = [
    ["AUC_tr_Building", "Building", 709, 2],
    ["AUC_tr_Circulation", "Circulation", 473, 2],
    ["AUC_tr_Directional", "Directional", 200, 2],
    ["AUC_tr_Misc", "Misc", 917, 3],
    ["AUC_tr_Reference", "Reference", 6743, 4],
    ["AUC_tr_Technical", "Technical", 907, 3],
]

plt.figure(figsize=(15,10))

# Draw one density curve per question type onto the same axes.
for scoreColumn, questionType, sampleCount, lineWidth in qtypePlotSpecs:
    ax = sns.distplot(
        a=preservedDataFrame[scoreColumn],
        hist=False,
        norm_hist=True,
        label="{} ({} Samples)".format(questionType, sampleCount),
        kde_kws={"lw": lineWidth},
    )

# AUC scores lie between 0 and 1, so pin the x axis to that range.
ax.set_xlim(0,1)
ax.set_xlabel("AUC Score", fontsize="x-large")
ax.set_ylabel("Density", fontsize="x-large")

plt.title(
    "TRAINING DATA - Distribution of ROC AUC Scores for Select Question Types",
    fontdict={"fontsize": "x-large"},
)

# Dashed reference line at an AUC of 0.5.
plt.axvline(x=0.5, ls="--", c="gray", label="0.5 Cutoff")
plt.legend(fontsize="x-large", loc="upper left", shadow=True)
plt.savefig("plots/qtypes_training_{}.png".format(timestamp))
plt.show()

In [None]:
# GENERATE PLOTS FOR ROC AUC PERFORMANCE
# FOCUS: SUBSETS OF SAMPLES SEGMENTED BY SELECTED QUESTION TYPES (TESTING DATA)
# Each entry: [score column, legend name, sample count shown in the legend, KDE line width].
# Named qtypePlotSpecs rather than qtypes so that this cell does not overwrite the `qtypes`
# object that the model-evaluation loop passes to getFocusedROCAUC.
qtypePlotSpecs = [
    ["AUC_ts_Building", "Building", 70, 2],
    ["AUC_ts_Circulation", "Circulation", 88, 2],
    ["AUC_ts_Directional", "Directional", 35, 2],
    ["AUC_ts_Misc", "Misc", 172, 3],
    ["AUC_ts_Reference", "Reference", 1167, 4],
    ["AUC_ts_Technical", "Technical", 179, 3],
]

plt.figure(figsize=(15,10))

# Draw one density curve per question type onto the same axes.
for scoreColumn, questionType, sampleCount, lineWidth in qtypePlotSpecs:
    ax = sns.distplot(
        a=preservedDataFrame[scoreColumn],
        hist=False,
        norm_hist=True,
        label="{} ({} Samples)".format(questionType, sampleCount),
        kde_kws={"lw": lineWidth},
    )

# AUC scores lie between 0 and 1, so pin the x axis to that range.
ax.set_xlim(0,1)
ax.set_xlabel("AUC Score", fontsize="x-large")
ax.set_ylabel("Density", fontsize="x-large")

plt.title(
    "TESTING DATA - Distribution of ROC AUC Scores for Select Question Types",
    fontdict={"fontsize": "x-large"},
)

# Dashed reference line at an AUC of 0.5.
plt.axvline(x=0.5, ls="--", c="gray", label="0.5 Cutoff")
plt.legend(fontsize="x-large", loc="upper left", shadow=True)
plt.savefig("plots/qtypes_testing_{}.png".format(timestamp))
plt.show()