In [96]:
import numpy as np
import pandas as pd
import re
import time
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.neural_network import MLPClassifier
from scipy.stats import ttest_ind

%set_env PYTHONHASHSEED=1
# %matplotlib inline
import itertools
import tqdm as tqdm

from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec, Doc2Vec, LdaModel, TfidfModel
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import common_corpus, common_dictionary
from gensim.parsing.preprocessing import strip_punctuation, strip_short, strip_non_alphanum, strip_tags, strip_multiple_whitespaces, remove_stopwords
from gensim.parsing.preprocessing import preprocess_string
from gensim.corpora import Dictionary
from gensim.parsing.preprocessing import STOPWORDS
from gensim.matutils import corpus2dense

env: PYTHONHASHSEED=1


In [98]:
today = re.sub(string=str(datetime.datetime.today()), pattern="\W", repl="")

In [99]:
today

'20190528085750160508'

In [101]:
rawDdata = pd.read_json('full.json')
rawDdata.shape

(14897, 34)

In [102]:
rawDdata['Start Date'] =  pd.to_datetime(rawDdata['Start Date'],unit='ms')
rawDdata['End Date'] =  pd.to_datetime(rawDdata['End Date'],unit='ms')
rawDdata.to_csv("fiddlingRaw.csv")

In [103]:
rawDdata = rawDdata[rawDdata['segCode']==0]
rawDdata.shape

(14604, 34)

In [104]:
rawDdata = rawDdata.sort_index()

In [105]:
rawDdata.loc[13751,]

Id                                                                       39359102
Start Date                                                    2018-02-28 00:00:00
Start Time                                                               16:59:00
End Date                                                      2018-03-06 00:00:00
End Time                                                                     NULL
Duration                                                                     NULL
Entered by                                                          Jason Coleman
Notes                                                                        NULL
Question                                             Trouble locating QL49 .J4453
Answer                          4:59 25870693270869823292870615 transfer from ...
Notes FULL                                                                   NULL
Library Dept./Branch/Service                                    Hale Library Help
Where were you? 

In [106]:
rawDdata = rawDdata.sort_values(by='Start Date')

In [None]:
rawDdata.head(10)

In [108]:
def getPatronSections(df,breaks):
    st = time.time()
    
    df = df.copy()
    corpus = df['PatronTextString'].str.split(pat="\s{1,}").copy()
    
    for i in corpus.index:
        
        df.loc[i,'First5'] = " ".join(corpus.loc[i][:breaks[0]])
        df.loc[i,'First10'] = " ".join(corpus.loc[i][:breaks[1]])
        df.loc[i,'First20'] = " ".join(corpus.loc[i][:breaks[2]])
    
    et = time.time() - st
    print('{:.2f} : Splitting Patron Lines'.format(et))    
    return(df)  

patronSegmentsOptions = [5,10,20]
rawDdata = getPatronSections(df=rawDdata,breaks=patronSegmentsOptions)

32.71 : Splitting Patron Lines


In [109]:
rawDdata.loc[13751,]

Id                                                                       39359102
Start Date                                                    2018-02-28 00:00:00
Start Time                                                               16:59:00
End Date                                                      2018-03-06 00:00:00
End Time                                                                     NULL
Duration                                                                     NULL
Entered by                                                          Jason Coleman
Notes                                                                        NULL
Question                                             Trouble locating QL49 .J4453
Answer                          4:59 25870693270869823292870615 transfer from ...
Notes FULL                                                                   NULL
Library Dept./Branch/Service                                    Hale Library Help
Where were you? 

In [110]:
rawDdata.dtypes

Id                                       int64
Start Date                      datetime64[ns]
Start Time                              object
End Date                        datetime64[ns]
End Time                                object
Duration                                object
Entered by                              object
Notes                                   object
Question                                object
Answer                                  object
Notes FULL                              object
Library Dept./Branch/Service            object
Where were you?                         object
Who answers                             object
Who Asked?                              object
How many in the group?                  object
Question Format                         object
Question Type                           object
Referred to?                            object
READ                                    object
Time Spent                              object
Class/Discipl

In [111]:
rawDdata.columns

Index(['Id', 'Start Date', 'Start Time', 'End Date', 'End Time', 'Duration',
       'Entered by', 'Notes', 'Question', 'Answer', 'Notes FULL',
       'Library Dept./Branch/Service', 'Where were you?', 'Who answers',
       'Who Asked?', 'How many in the group?', 'Question Format',
       'Question Type', 'Referred to?', 'READ', 'Time Spent',
       'Class/Discipline', 'tags', 'Room reservation', 'Reported to:',
       'READ_1_vs_2', 'READ_2_vs_3', 'Transcript', 'TransLength', 'segTrans',
       'segCode', 'PatronTextString', 'StaffTextString', 'AllTextString',
       'First5', 'First10', 'First20'],
      dtype='object')

In [115]:
manualTags = [
    ['tagURL',[
        re.escape('amazon.com'),
        re.escape('newfirstsearch'),
        re.escape('galegroup'),
        re.escape('ingentaconnect.com'),
        re.escape('proquest.com'),
        re.escape('ncbi.nlm.nih.gov'),
        re.escape('sciencedirect.com'),
        re.escape('springer.com'),
        re.escape('tandfonline.com'),
        re.escape('webofknowledge'),
        re.escape('wiley.com'),
        re.escape('books.google'),
        re.escape('google.com'),

        re.escape('apps.lib.k-state.edu/databases'),

        re.escape('er.lib.ksu.edu'),
        re.escape('er.lib.k-state.edu'),

        re.escape('getit.lib.ksu.edu'),
        re.escape('getit.lib.k-state.edu'),

        re.escape('guides.lib.ksu.edu'),
        re.escape('guides.lib.k-state.edu'),

        re.escape('catalog.lib.ksu.edu'),
        re.escape('catalog2.lib.ksu.edu'),
        re.escape('catalog.lib.k-state.edu'),
        re.escape('catalog2.lib.k-state.edu'),

        re.escape('primo.hosted.exlibrisgroup.com'),
        re.escape('na02.alma.exlibrisgroup'),

        re.escape('searchit.lib.ksu.edu'),
        re.escape('searchit.lib.k-state.edu'),

        re.escape('lib.k-state.edu'),
        re.escape('lib.k-state.edu'),

        re.escape('doi.org'),

        re.escape('http'),
        re.escape('www.'),]
    ],
    
    ['tagPRINTING',[
        'color print',
        'colored print',
        'print in color',
        'print something in color',
        "\Win color\W",
        "cat cash",
        'printer',
        '(?<!3D\s)\bprinting',
        'double.{1}sided',
        'catcash',
        'cat cash',
        'add money',]
    ],    
    
    ['tagSCANNER',[
        'scanner',
        '\Wscan\W',]
    ],      
    
    ['tagHOURS',[
        'open 24/7',
        'what time',
        'the hours',
        'opens{0,1}\W',
        'will be open',
        'summer hours',
        'library hours',]
    ],    
    
    
    ['tagLIBMATHPHYS',[
        re.escape('Math/Physics Library'),
        re.escape('math and physics library'),
        re.escape('Math Physic library'),
        re.escape('math/physics library'),
        re.escape('maths/phys library'),
        re.escape('math & phys library'),
        re.escape('math phys library'),]
    ],
    
    ['tagLIBWEIGEL',[
        'weigel',
        'wiegel',]
    ],
    
    ['tagLIBVETMED',[
        'vet med',
        'vetmed',]
    ],
    
    ['tagLIBHALE',[
        'Hale Library',
        "(?<!help\s)hale",]
    ],
    
    ['tagLIBSTACKS',[
        'Library Stacks',
        'the stacks',
        'in Stacks',]
    ],
    
    ['tagTEXTBOOKS',[
        'the reserve',
        'on reserve',
        'course reserve',
        'reserve textbook',
        'have a specific textbook',
        'have the textbook',
        'have textbook',
        'this textbook',
        'this text book',]
    ],
    
    ['tagQUIET',[
        'quite loud',
        'super loud',
        'really loud',
        'very loud',
        'stop talking',
        'talking on',
        'music loud',
        "loud",
        "talking very",
        "talking extremely",
        "talking loud",
        'quiet floor',
        '"quiet" floor',
        'Quiet Zone',
        'quiet floors',
        'floor to be quiet',
        'whisper quietly',
        'be quiet',
        'floor to be quiet',]
    ],
    
    
    ['tagLIBLOCATION',[
        'first floor',
        '1st floor',
        'second floor',
        '2nd floor',
        'third floor',
        '3rd floor',
        'fourth floor',
        '4t floor',
        'fifth floor',
        '5th floor',
        'hemisphere room',
        'Harry Potter room',
        'the hemi',]
    ],
    
    ['tagARTICLES',[
        "peer.{,1}review",
        'journal article',
        'scholarly article',
        'scholarly journal',
        '"scholarly article',
        'peer reviewed',
        re.escape('peer-reviewed'),
        'peerreviewed',
        'scholarly',
        'articles',]
    ],
    
    ['tagEVIDENCEBASED',[
        "evidence.based",
        "kinesiology",]
    ],
    
    ['tagJUVENILE',[
        "juv lit section",
        "Juvenile Literature",
        re.escape("juv. lit"),
        "children'{0,1}s collection",
        "children'{0,1}s lit",
        "children'{0,1}s stor",
        re.escape("children's boooks"),
        "(?<!Germany on English )children'{0,1}s book",
        re.escape("children's picture"),
        "picture book",]
    ],
    
    
    ['tagCURRICULUM',[
        "curriculum materials",
        "curriculum books",]
    ],
    
    
    ['tagKNOWNITEMARTICLE',[
        "doi\W\s{,1}\S+",
        "doi:{0,1}\s{0,1}\d\S+",
        "this article",
        "this\s\w+\sarticle",
        'this paper',
        'doi\.\S+',
        'doi:{0,1}\s{0,1}\d\S+',
        'doi\.org\S+',]
    ],
    
    
    ['tagKNOWNITEMBOOK',[
        "[a-z]{1,2}\d{2,4}\s{0,1}\.[a-z]\d{1,}",
        'this book',]
    ],
    
    
    
    ['tagREFERENCE',[
        "articles{0,1}\sabout",
        "books{0,1}\sabout",
        "subject",
        "topic",
        "a paper on",
        "help me find an{0,1}"]
    ],
    
    
]

rollupTags = [
    ['tagEASIER',[
        'tagKNOWNITEMBOOK',
        'tagLIBHALE',
        'tagLIBLOCATION',
        'tagLIBMATHPHYS',
        'tagLIBSTACKS',
        'tagLIBWEIGEL',
        'tagHOURS',
        'tagPRINTING',
        'tagQUIET',
        'tagSCANNER',
        'tagTEXTBOOKS',]
    ],
    
    ['tagHARDER',[
        'tagARTICLES',
        'tagCURRICULUM',
        'tagEVIDENCEBASED',
        'tagJUVENILE',
        'tagKNOWNITEMARTICLE',
        'tagREFERENCE',
        'tagURL',]
    ],
]

In [116]:
def getManualTags(df,collectTags,manualTagsList,section,rollups):
    st = time.time()
    
    df = df.copy()
    df['manualTags'] = [[] for i in range(df.shape[0])] 
    
    if collectTags == True:
        taglist = pd.Series()
        for i in df.index:
            tags = []
            for k in manualTagsList:
                if k[0] in tags:
                    continue
                for m in k[1]:
                    if re.search(pattern=m,flags=re.IGNORECASE,string=df.loc[i,section]):
                        tags.append(k[0])                
            if rollups == True:
                for n in rollupTags:
                    if n[0] in tags:
                        continue
                    for o in n[1]:
                        if o in tags:
                            tags.append(n[0])
            tags = list(np.unique(tags))
            taglist.loc[i] = tags
        df.loc[taglist.index,'manualTags'] = taglist
    
    et = time.time() - st
    print('{:.2f} : Getting Manual Tags'.format(et))
    if rollups:
        print('Including Rollup Tags')
    return(df)

In [120]:
def getTextsTokens(df,corpus,model):
    st = time.time()
    
    corpus = corpus.copy()
    df = df.copy()
    CUSTOM_FILTERS = [lambda x: x.lower(),
                  strip_multiple_whitespaces,
#                   strip_non_alphanum,
#                   strip_short,
                 ]
    
    tokenizedTexts = [preprocess_string(x, CUSTOM_FILTERS) for x in corpus]
    tokenizedTexts = pd.Series(tokenizedTexts,index=corpus.index)
    
    
#     if model == 'lda':
#         for i in df.index:
#             for k in df.loc[i,'DocTags']:
#                 tokenizedTexts.loc[i] = tokenizedTexts.loc[i].append(k)
    
    gensim_dictionary = Dictionary(tokenizedTexts)
    
    et = time.time() - st
    print('{:.2f} : Tokenizing Texts'.format(et))
    return(tokenizedTexts,gensim_dictionary)

In [121]:
def getTFIDFlimited(df, section, truncate, controlVocab, model):
    st = time.time()
    df = df.copy()
    texts = df.loc[:,section].copy()
    
    fullVocab = CountVectorizer().fit(texts).get_feature_names()
    texts = texts.str.lower().copy()
    
    if truncate[0] == True:
        count = CountVectorizer(
            token_pattern=r"(?u)\b\w{3,}\b",
            min_df=truncate[1],
        )
    elif truncate[0] == False:
        count = CountVectorizer(
            token_pattern=r"(?u)\b\w{1,}\b",
        )
    
    countDF = pd.DataFrame(count.fit_transform(texts).todense(),
                           index=df.index,
                           columns=count.get_feature_names())
        
    tfidf = TfidfTransformer()
    tfidfDF = pd.DataFrame(tfidf.fit_transform(countDF).todense(),
                           index=df.index,
                           columns=count.get_feature_names())    
    countList = countDF.sum().sort_values(ascending=False)
    tfidfList = tfidfDF.mean().sort_values(ascending=False)
    
    
    if truncate[0] == True:
        vocab = list(set(countList.index[:truncate[2]]).union(set(tfidfList.index[:truncate[2]])))
#         vocab = list(tfidfList.index[:truncate[2]])
        remove = list(set(count.get_feature_names()) ^ set(fullVocab))
        
    else:
        vocab = list(set(countList.index).union(set(tfidfList.index)))
        remove = list(set(count.get_feature_names()) ^ set(fullVocab))
        
    vocab = vocab + controlVocab
    vocab = set(list(np.unique(vocab)))
        
    newTexts = pd.Series()
    for i in texts.index:
        k = re.split(string=texts.loc[i],pattern="\W")
        j = []
        for m in k:
            if m in vocab:
                j.append(m)

        newTexts.loc[i] = " ".join(j)
    
    tokenizedTexts,gendict = getTextsTokens(df=df,corpus=newTexts,model=model)    
    
    df.loc[newTexts.index,'filteredTexts'] = newTexts
    df.loc[tokenizedTexts.index,'tokenizedTexts'] = tokenizedTexts
    
    et = time.time() - st
    print('{:.2f} : Truncating by TFIDF, maybe'.format(et))              
    return(df,countDF,countList,tfidfList,remove,vocab,gendict)

In [122]:
def getLDAmatrix(df,dictionary,collectTags,random):
    st = time.time()
    
    rand = random
    %set_env PYTHONHASHSEED=rand
    
    df = df.copy()
    tokenizedTexts = list(df['tokenizedTexts'])
        
    tokenizedList_bow = [dictionary.doc2bow(x) for x in tokenizedTexts]
    
    ntop = 75
    iterations = 100
    
    ldaModel = LdaModel(
        corpus=tokenizedList_bow,
        id2word=dictionary, 
        num_topics=ntop, 
        random_state=rand,
        chunksize=2000,
        passes=5,
        update_every=1,
        alpha=1/ntop,
        eta=1/ntop, 
        decay=0.7,
        offset=1.0, 
        eval_every=100, 
        iterations=iterations, 
        gamma_threshold=0.001,
        minimum_probability=0.01,
        minimum_phi_value=0.01
    )
    
    
    ldaDocMatrix = [i for i in ldaModel.get_document_topics(tokenizedList_bow)]
    ldaDF = pd.DataFrame(corpus2dense(ldaDocMatrix, num_terms=ldaModel.num_topics).transpose(), index=df.index)
    
    if collectTags:
        counter = CountVectorizer(binary=True)

        tagstrings = df['manualTags'].str.join(" ")    
        tagDF = pd.DataFrame(counter.fit_transform(tagstrings).todense(),
                             columns=counter.get_feature_names(),
                             index=df.index)

        ldacolumntags = counter.get_feature_names()
        ldaDF = ldaDF.merge(tagDF,left_index=True,right_index=True,suffixes=(False,False))
    else:
        ldacolumntags = None
    
    et = time.time() - st
    print('{:.2f} : Getting LDA Model / DF'.format(et))
    return(ldaModel, ldaDF, ldacolumntags)

In [123]:
def doc2vecModel(df,random):
    st = time.time()
    df = df.copy()
    
    rand = random
    %set_env PYTHONHASHSEED=rand
    
    documentsTrain = [TaggedDocument(df.loc[i,'tokenizedTexts'], (
        [str(df.loc[i,'Id'])]+df.loc[i,'manualTags'])) for i in df.index]
    
    iterations = 100
    
    %set_env PYTHONHASHSEED=rand
    model = Doc2Vec(
        documentsTrain,
        dm=0,
        dbow_words=1,
        vector_size=75, 
        window=5, 
        min_count=1, 
        workers=1,
        seed=rand,
        epocs=iterations,
        hs=1,
        negative=0
               )
    
#     vectorDF = pd.DataFrame(model.docvecs.doctag_syn0, index=list(model.docvecs.doctags.keys()))
#     vectorDF = vectorDF.sort_index()
#     vectorDF = vectorDF[:len(df.index)].copy()
    
#     docid = [int(z[2:]) for z in vectorDF.index]
#     vectorDF.index = docid
#     vectorDF = vectorDF.sort_index()
    %set_env PYTHONHASHSEED=rand
    indic = []
    dat = []
    for i in df.index:
#         print(i)
        indic.append(i)
#         dat.append(model.infer_vector((df.loc[i,'tokenizedTexts']+df.loc[i,'manualTags']), steps=iterations))
        dat.append(model.infer_vector((df.loc[i,'tokenizedTexts']), steps=iterations))
        
    vectorDF = pd.DataFrame(dat,index=indic)
    
    et = time.time() - st
    print('{:.2f} : Getting D2V Model / DF'.format(et))
    return(model, vectorDF)    

In [124]:
def trainModel(df,reps,targetLabels,random,tagState):
    st = time.time()
    df = df.copy()
    reps = reps
    rand = random
    %set_env PYTHONHASHSEED=rand
    
    labels = df.loc[df[targetLabels]!=999,targetLabels].astype(int).copy()
    
    indices = labels.index
    X = reps.loc[indices,:]
    y = labels
    
    neural_model = MLPClassifier(hidden_layer_sizes=(10,),max_iter=100,random_state=rand)
    neural_model.fit(X,y)
    yprob = neural_model.predict_proba(X)[:,1]
    yprob_pred = neural_model.predict(X)
    RocAucScore = roc_auc_score(y_true=y,y_score=yprob)
    
    
    listIndex = []
    notIndex = []
    if tagState:
        for i in df.loc[labels.index,].index:
            if len(df.loc[i,'manualTags']) > 0:
                listIndex.append(int(i))
            else:
                notIndex.append(int(i))
    
        Xlist = representation.loc[listIndex,:]
        y_prob_list = neural_model.predict_proba(Xlist)[:,1]
        yprob_pred_list = neural_model.predict(Xlist)        
        
        RocAucScore_list = roc_auc_score(y_true=labels.loc[listIndex],y_score=y_prob_list)
        df.loc[listIndex,'PredictProbList'] = y_prob_list
        df.loc[listIndex,'PredictPredList'] = yprob_pred_list

        
        
        Xlist = representation.loc[notIndex,:]
        y_prob_list = neural_model.predict_proba(Xlist)[:,1]
        yprob_pred_list = neural_model.predict(Xlist)
        
        RocAucScore_NOTlist = roc_auc_score(y_true=labels.loc[notIndex],y_score=y_prob_list)
        df.loc[notIndex,'PredictProbNOTList'] = y_prob_list
        df.loc[notIndex,'PredictPredNOTList'] = yprob_pred_list
    
    
    
    # DEFAULT STATE
    else:
        RocAucScore_list = RocAucScore
        RocAucScore_NOTlist = RocAucScore
        df.loc[labels.index,'PredictProbList'] = yprob
        df.loc[labels.index,'PredictPredList'] = yprob_pred
    
    
    df.loc[labels.index,'PredictProb'] = yprob
    df.loc[labels.index,'PredictPred'] = yprob_pred
    
    et = time.time() - st
    print('{:.2f} : Running MLP Fit and Eval'.format(et))
    return(neural_model,RocAucScore,RocAucScore_list,RocAucScore_NOTlist,len(listIndex),len(notIndex),df.loc[labels.index,:],X)

In [125]:
def testModel(df,targetLabels,modelType,dictionary,neural_model,collectTags,ldajoincolumns,tagState):
    st = time.time()
    
    df = df.copy()
    
    labels = df.loc[df[targetLabels]!=999,targetLabels].astype(int).copy()
    df_test = df.loc[labels.index,].copy()
    
    if modelType == 'lda':
        tokenizedTexts = list(df_test['tokenizedTexts'])
        tokenizedList_bow = [dictionary.doc2bow(x) for x in tokenizedTexts]
        
        ldaDocMatrix = [i for i in ldaModel.get_document_topics(tokenizedList_bow)]
        representation = pd.DataFrame(corpus2dense(ldaDocMatrix, num_terms=ldaModel.num_topics).transpose(), index=df_test.index)
        
        
        if collectTags:
            counter = CountVectorizer(vocabulary=ldajoincolumns,binary=True)
            tagstrings = df_test['manualTags'].str.join(" ")
            tagDF = pd.DataFrame(counter.fit_transform(tagstrings).todense(),
                                 columns=ldajoincolumns,
                                 index=df_test.index)
            representation = representation.merge(tagDF,left_index=True,right_index=True,suffixes=(False,False))
            
    
    if modelType == 'd2v':
        indic = []
        dat = []
        for i in df_test.index:
#             print(i)
            indic.append(i)
#             dat.append(d2vModel.infer_vector((df_test.loc[i,'tokenizedTexts']+df_test.loc[i,'manualTags']), steps=100))
            dat.append(d2vModel.infer_vector((df_test.loc[i,'tokenizedTexts']), steps=100))

        representation = pd.DataFrame(dat,index=indic)
    
    X = representation
    yTest_prob = neural_model.predict_proba(X)[:,1]
    yTest_pred = neural_model.predict(X)
    RocAucScore = roc_auc_score(y_true=labels,y_score=yTest_prob)
    
    listIndex = []
    notIndex = []
    if tagState:
        for i in df.loc[labels.index,].index:
            if len(df.loc[i,'manualTags']) > 0:
                listIndex.append(int(i))
            else:
                notIndex.append(int(i))
    
        Xlist = representation.loc[listIndex,:]
        y_prob_list = neural_model.predict_proba(Xlist)[:,1]
        yprob_pred_list = neural_model.predict(Xlist)        
        
        RocAucScore_list = roc_auc_score(y_true=labels.loc[listIndex],y_score=y_prob_list)
        df.loc[listIndex,'PredictProbList'] = y_prob_list
        df.loc[listIndex,'PredictPredList'] = yprob_pred_list

        
        
        Xlist = representation.loc[notIndex,:]
        y_prob_list = neural_model.predict_proba(Xlist)[:,1]
        yprob_pred_list = neural_model.predict(Xlist)
        
        RocAucScore_NOTlist = roc_auc_score(y_true=labels.loc[notIndex],y_score=y_prob_list)
        df.loc[notIndex,'PredictProbNOTList'] = y_prob_list
        df.loc[notIndex,'PredictPredNOTList'] = yprob_pred_list
    
    
    
    # DEFAULT STATE
    else:
        RocAucScore_list = RocAucScore
        RocAucScore_NOTlist = RocAucScore
        df.loc[labels.index,'PredictProbList'] = yTest_prob
        df.loc[labels.index,'PredictPredList'] = yTest_pred
    
    
    df.loc[labels.index,'PredictProb'] = yTest_prob
    df.loc[labels.index,'PredictPred'] = yTest_pred
    
    et = time.time() - st
    print('{:.2f} : Testing Model with Holdout Data'.format(et))
    return(RocAucScore,RocAucScore_list,RocAucScore_NOTlist,len(listIndex),len(notIndex),df.loc[labels.index,:],X)
    

In [126]:
def prepareTestingData(df,vocab,section):
    st = time.time()
    
    df = df.copy()    
    df['DocTags'] = [[] for i in range(df.shape[0])] 
    
    
    splitStrings = df.loc[:,section].str.lower()
    splitStrings = splitStrings.str.split("\W")
    
    testTokens = pd.Series()
    for i in splitStrings.index:
        j = []
        for m in splitStrings.loc[i]:
            if m in vocab:
                j.append(m)

        testTokens.loc[i] = j
    df.loc[testTokens.index,'tokenizedTexts'] = testTokens
    
    
    et = time.time() - st
    print('{:.2f} : Preparing Test Holdout Data'.format(et))
    return(df)

In [127]:
def getFocusedROCAUC(df_test,df_train,qtype,labelledsection,tagState):
    st = time.time()
    df_test = df_test.copy()
    df_train = df_train.copy()
    
    df_train = df_train.loc[df_train['READ']!='Unknown',]
    ytrue = df_train.loc[df_train['Question Type']==qtype,labelledSection]
    ypred = df_train.loc[df_train['Question Type']==qtype,'PredictProb']
    trainRocAucScore = roc_auc_score(y_true=ytrue,y_score=ypred)
    
    df_test = df_test.loc[df_test['READ']!='Unknown',]
    ytrue = df_test.loc[df_test['Question Type']==qtype,labelledSection]
    ypred = df_test.loc[df_test['Question Type']==qtype,'PredictProb']
    testRocAucScore = roc_auc_score(y_true=ytrue,y_score=ypred)
    
    if tagState:
        trainList = []
        for i in df_train.index:
            if len(df_train.loc[i,'manualTags']) > 0:
                trainList.append(i)
        testList = []
        for i in df_test.index:
            if len(df_test.loc[i,'manualTags']) > 0:
                trainList.append(i)
    
#         trainList = list(set(consolidatedList).intersection(set(df_train.index)))
#         testList = list(set(consolidatedList).intersection(set(df_test.index)))

        df_train_list = df_train.loc[trainList,].copy()
        ytrue_list = df_train_list.loc[df_train_list['Question Type']==qtype,labelledSection]
        ypred_list = df_train_list.loc[df_train_list['Question Type']==qtype,'PredictProbList']
        try:
            trainRocAucScore_list = roc_auc_score(y_true=ytrue_list,y_score=ypred_list)
        except:
            ypred_list = df_train_list.loc[df_train_list['Question Type']==qtype,'PredictPred']
            trainRocAucScore_list = "Acc_" + str(accuracy_score(y_true=ytrue_list,y_pred=ypred_list))


        df_test_list = df_test.loc[testList,].copy()
        ytrue_list = df_test_list.loc[df_test_list['Question Type']==qtype,labelledSection]
        ypred_list = df_test_list.loc[df_test_list['Question Type']==qtype,'PredictProbList']
        try:
            testRocAucScore_list = roc_auc_score(y_true=ytrue_list,y_pred=ypred_list)
        except:
            ypred_list = df_test_list.loc[df_test_list['Question Type']==qtype,'PredictPredList']
            testRocAucScore_list = "Acc_" + str(accuracy_score(y_true=ytrue_list,y_pred=ypred_list))
        trainListShape = len(trainList)
        testListShape = len(testList)

        
    else:
        trainRocAucScore_list = None
        testRocAucScore_list = None
        trainListShape = 0
        testListShape = 0
    
    
    et = time.time() - st
    print('{:.2f} : Getting Focused ROC-AUC Scores ({})'.format(et,qtype))
    return(trainRocAucScore,testRocAucScore,trainRocAucScore_list,testRocAucScore_list,trainListShape,testListShape)

In [128]:
# custom vocab builder
customVocab = []

# random states
randomOptions = list(np.arange(0,20,1))
# randomOptions = [0]

#test Split

testingSplits = [
    2000,
]


manualTagOptions = [
    True,
    False,
]
rollupsOptions = [
    True,
    False,
]


#tfidf limits
dictOptions = [    
    (False, 1, 300000),
    (True, 2, 3000),
]

#which patron section to look at
patronSectionOptions = [
    'First10',
    'First20',
]

#labels
labelledSectionOptions = [
    'READ_1_vs_2',
    'READ_2_vs_3',
]

#model
modelOptions = [
    'lda',
    'd2v',
]


optionsList = [
    randomOptions,
    manualTagOptions,
    rollupsOptions,
    dictOptions,
    patronSectionOptions,
    labelledSectionOptions,
    modelOptions,
]

optionsLen = [len(x) for x in optionsList]

testtotal = np.prod(optionsLen)
iterationcounter = 0

parameterList = []

In [None]:
for z in tqdm.tqdm(randomOptions):
    randomSeed=z


    for s in testingSplits:
        split = s
        testsplit = rawDdata.shape[0]-split
        trainData = rawDdata[:testsplit]
        testData = rawDdata[testsplit:]
        
        trainShape = trainData.shape[0]
        testShape = testData.shape[0]

        for e in patronSectionOptions:
            patronSection=e 

            for f in manualTagOptions:
                getTags = f

                for r in rollupsOptions:
                    if getTags == True:
                        rollup = r
                    elif getTags == False:
                        if r == True:
                            continue
                        else:
                            rollup = r
                    trainDataTagged = getManualTags(df=trainData,
                                                    collectTags=getTags,
                                                    manualTagsList=manualTags,
                                                    section=patronSection,
                                                    rollups=rollup,
                                                   )
                    testDataTagged = getManualTags(df=testData,
                                                   collectTags=getTags,
                                                   manualTagsList=manualTags,
                                                   section=patronSection,
                                                   rollups=rollup,
                                                  )

                    for g in labelledSectionOptions:
                        labelledSection=g

                        for h in modelOptions:
                            model=h              

                            for m in dictOptions:
                                truncate=m 

                                start_time = time.time()

                                trainDataTRUNC,countDF,countList,tfidfList,remove,vocab,gendict = getTFIDFlimited(
                                    df=trainDataTagged,
                                    truncate=truncate,
                                    section=patronSection,
                                    controlVocab=customVocab,
                                    model=model,
                                )
       

                                if model == 'lda':
                                    ldaModel, representation, ldacolumntags = getLDAmatrix(
                                        df=trainDataTRUNC,
                                        dictionary=gendict,
                                        collectTags=getTags,
                                        random=randomSeed)

                                if model == 'd2v':
                                    d2vModel, representation = doc2vecModel(
                                        df=trainDataTRUNC,
                                        random=randomSeed)                                                
                                    ldacolumntags=None



                                MLPmodel,AUCscore,AUCscorelist,AUCscoreNOTlist,trainLlen,trainNLlen,trainDataFinal,trainFinalRep = trainModel(
                                    df=trainDataTRUNC,
                                    reps=representation,
                                    targetLabels=labelledSection,
                                    random=randomSeed,
                                    tagState=getTags,)


                                testDataPrepped = prepareTestingData(
                                    df=testDataTagged,
                                    vocab=vocab,
                                    section=patronSection,)


                                testAUC,testAUClist,testAUCNOTlist,testLlen,testNLlen,testDataFinal,testFinalRep = testModel(
                                    df=testDataPrepped,
                                    targetLabels=labelledSection,
                                    modelType=model,
                                    dictionary=gendict,
                                    neural_model=MLPmodel,
                                    ldajoincolumns=ldacolumntags,
                                    collectTags=getTags,
                                    tagState=getTags,)
    


                                end_time = time.time()
                                total_time = end_time-start_time

                                modelParameters = {
                                    
                                    'TRUNC':patronSection,
                                    'O-Core':getTags,
                                    'O-Core+Super':rollup,
                                    'DICT':str(truncate),
                                    'REPRESENT':model,
                                    'READ':labelledSection,
                                    'AUC_train':AUCscore,
                                    'AUC_test':testAUC,

                                    'AUC_train_Onto':AUCscorelist,
                                    'AUC_test_Onto':testAUClist,
                                    'AUC_train_Not_Onto':AUCscoreNOTlist,
                                    'AUC_test_Not_Onto':testAUCNOTlist,

                                    'AUC_train_Onto_LEN':trainLlen,
                                    'AUC_train_OntoN_LEN':trainNLlen,
                                    'AUC_test_Onto_LEN':testLlen,
                                    'AUC_test_OntoN_LEN':testNLlen,
                                    
                                    
                                    'CycleTime':total_time,
                                    'RAND':randomSeed,
                                    'Custom Vocab Len':len(customVocab),
                                    'Full Vocab Len':len(vocab),
                                    'Testing Split':testsplit,
                                    'Training Data Full':trainShape,
                                    'Testing Data Full':testShape,
                                    'Training Data Final':trainDataFinal.shape[0],
                                    'Testing Data Final':testDataFinal.shape[0],
                                                  }

                                parameterList.append(list(modelParameters.values()))
                                print(modelParameters.values())
                                iterationcounter += 1
                                print("{} out of {} complete".format(iterationcounter,testtotal))

parameterDataFrame = pd.DataFrame(parameterList,columns=list(modelParameters.keys()))

In [131]:
parameterDataFrame.head()

Unnamed: 0,TRUNC,O-Core,O-Core+Super,DICT,REPRESENT,READ,AUC_train,AUC_test,AUC_train_Onto,AUC_test_Onto,...,AUC_test_OntoN_LEN,CycleTime,RAND,Custom Vocab Len,Full Vocab Len,Testing Split,Training Data Full,Testing Data Full,Training Data Final,Testing Data Final
0,First10,True,True,"(False, 1, 300000)",lda,READ_1_vs_2,0.712627,0.658789,0.81698,0.729856,...,1413,36.252905,0,0,7422,12604,12604,2000,10162,1753
1,First10,True,True,"(True, 2, 3000)",lda,READ_1_vs_2,0.706274,0.662781,0.813895,0.731202,...,1413,28.376017,0,0,2967,12604,12604,2000,10162,1753
2,First10,True,True,"(False, 1, 300000)",d2v,READ_1_vs_2,0.72606,0.681149,0.798695,0.726875,...,1413,31.990057,0,0,7422,12604,12604,2000,10162,1753
3,First10,True,True,"(True, 2, 3000)",d2v,READ_1_vs_2,0.709961,0.670547,0.791796,0.730337,...,1413,25.85572,0,0,2967,12604,12604,2000,10162,1753
4,First10,True,True,"(False, 1, 300000)",lda,READ_2_vs_3,0.703218,0.644213,0.824265,0.768131,...,1413,35.527993,0,0,7422,12604,12604,2000,10162,1753


In [132]:
parameterDataFrame.columns

Index(['TRUNC', 'O-Core', 'O-Core+Super', 'DICT', 'REPRESENT', 'READ',
       'AUC_train', 'AUC_test', 'AUC_train_Onto', 'AUC_test_Onto',
       'AUC_train_Not_Onto', 'AUC_test_Not_Onto', 'AUC_train_Onto_LEN',
       'AUC_train_OntoN_LEN', 'AUC_test_Onto_LEN', 'AUC_test_OntoN_LEN',
       'CycleTime', 'RAND', 'Custom Vocab Len', 'Full Vocab Len',
       'Testing Split', 'Training Data Full', 'Testing Data Full',
       'Training Data Final', 'Testing Data Final'],
      dtype='object')

In [133]:
parameterDataFrame.head()

Unnamed: 0,TRUNC,O-Core,O-Core+Super,DICT,REPRESENT,READ,AUC_train,AUC_test,AUC_train_Onto,AUC_test_Onto,...,AUC_test_OntoN_LEN,CycleTime,RAND,Custom Vocab Len,Full Vocab Len,Testing Split,Training Data Full,Testing Data Full,Training Data Final,Testing Data Final
0,First10,True,True,"(False, 1, 300000)",lda,READ_1_vs_2,0.712627,0.658789,0.81698,0.729856,...,1413,36.252905,0,0,7422,12604,12604,2000,10162,1753
1,First10,True,True,"(True, 2, 3000)",lda,READ_1_vs_2,0.706274,0.662781,0.813895,0.731202,...,1413,28.376017,0,0,2967,12604,12604,2000,10162,1753
2,First10,True,True,"(False, 1, 300000)",d2v,READ_1_vs_2,0.72606,0.681149,0.798695,0.726875,...,1413,31.990057,0,0,7422,12604,12604,2000,10162,1753
3,First10,True,True,"(True, 2, 3000)",d2v,READ_1_vs_2,0.709961,0.670547,0.791796,0.730337,...,1413,25.85572,0,0,2967,12604,12604,2000,10162,1753
4,First10,True,True,"(False, 1, 300000)",lda,READ_2_vs_3,0.703218,0.644213,0.824265,0.768131,...,1413,35.527993,0,0,7422,12604,12604,2000,10162,1753


In [134]:
parameterDataFrame.iloc[:,10:20].head()

Unnamed: 0,AUC_train_Not_Onto,AUC_test_Not_Onto,AUC_train_Onto_LEN,AUC_train_OntoN_LEN,AUC_test_Onto_LEN,AUC_test_OntoN_LEN,CycleTime,RAND,Custom Vocab Len,Full Vocab Len
0,0.68072,0.63751,1832,8330,340,1413,36.252905,0,0,7422
1,0.673145,0.644253,1832,8330,340,1413,28.376017,0,0,2967
2,0.70406,0.665981,1832,8330,340,1413,31.990057,0,0,7422
3,0.684704,0.652069,1832,8330,340,1413,25.85572,0,0,2967
4,0.670593,0.602206,1832,8330,340,1413,35.527993,0,0,7422


In [137]:
parameterDataFrame.to_csv('{}_Run.csv'.format(today).format(today))

In [138]:
preservedDataFrame = pd.read_csv('{}_Run.csv'.format(today),index_col=0)
preservedDataFrame.to_csv('{}_preserveRun.csv'.format(today))

In [139]:
preservedDataFrame = pd.read_csv('{}_preserveRun.csv'.format(today),index_col=0)

In [140]:
preservedDataFrame.shape

(96, 25)

In [141]:
preservedDataFrame.columns

Index(['TRUNC', 'O-Core', 'O-Core+Super', 'DICT', 'REPRESENT', 'READ',
       'AUC_train', 'AUC_test', 'AUC_train_Onto', 'AUC_test_Onto',
       'AUC_train_Not_Onto', 'AUC_test_Not_Onto', 'AUC_train_Onto_LEN',
       'AUC_train_OntoN_LEN', 'AUC_test_Onto_LEN', 'AUC_test_OntoN_LEN',
       'CycleTime', 'RAND', 'Custom Vocab Len', 'Full Vocab Len',
       'Testing Split', 'Training Data Full', 'Testing Data Full',
       'Training Data Final', 'Testing Data Final'],
      dtype='object')

In [142]:
preservedDataFrame['O-Core'] = np.where(preservedDataFrame['O-Core'], 1, 0)
preservedDataFrame['O-Core+Super'] = np.where(preservedDataFrame['O-Core+Super'], 1, 0)
preservedDataFrame['TRUNC'] = np.where(preservedDataFrame['TRUNC'] == 'First20', 2,
                                                 np.where(preservedDataFrame['TRUNC'] == 'First10', 1, 0))
preservedDataFrame['DICT'] = np.where(preservedDataFrame['DICT'] == """(True, 2, 3000)""", 1, 0)

preservedDataFrame['READ'] = np.where(preservedDataFrame['READ'] == 'READ_1_vs_2', 0, 1)
preservedDataFrame['REPRESENT'] = np.where(preservedDataFrame['REPRESENT'] == 'lda', 0, 1)

In [143]:
preservedDataFrame.columns

Index(['TRUNC', 'O-Core', 'O-Core+Super', 'DICT', 'REPRESENT', 'READ',
       'AUC_train', 'AUC_test', 'AUC_train_Onto', 'AUC_test_Onto',
       'AUC_train_Not_Onto', 'AUC_test_Not_Onto', 'AUC_train_Onto_LEN',
       'AUC_train_OntoN_LEN', 'AUC_test_Onto_LEN', 'AUC_test_OntoN_LEN',
       'CycleTime', 'RAND', 'Custom Vocab Len', 'Full Vocab Len',
       'Testing Split', 'Training Data Full', 'Testing Data Full',
       'Training Data Final', 'Testing Data Final'],
      dtype='object')

In [144]:
preservedDataFrame.index

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
            34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
            51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
            68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
            85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95],
           dtype='int64')

In [145]:
preservedDataFrame.head(10)

Unnamed: 0,TRUNC,O-Core,O-Core+Super,DICT,REPRESENT,READ,AUC_train,AUC_test,AUC_train_Onto,AUC_test_Onto,...,AUC_test_OntoN_LEN,CycleTime,RAND,Custom Vocab Len,Full Vocab Len,Testing Split,Training Data Full,Testing Data Full,Training Data Final,Testing Data Final
0,1,1,1,0,0,0,0.712627,0.658789,0.81698,0.729856,...,1413,36.252905,0,0,7422,12604,12604,2000,10162,1753
1,1,1,1,1,0,0,0.706274,0.662781,0.813895,0.731202,...,1413,28.376017,0,0,2967,12604,12604,2000,10162,1753
2,1,1,1,0,1,0,0.72606,0.681149,0.798695,0.726875,...,1413,31.990057,0,0,7422,12604,12604,2000,10162,1753
3,1,1,1,1,1,0,0.709961,0.670547,0.791796,0.730337,...,1413,25.85572,0,0,2967,12604,12604,2000,10162,1753
4,1,1,1,0,0,1,0.703218,0.644213,0.824265,0.768131,...,1413,35.527993,0,0,7422,12604,12604,2000,10162,1753
5,1,1,1,1,0,1,0.697165,0.632639,0.822627,0.739773,...,1413,28.378,0,0,2967,12604,12604,2000,10162,1753
6,1,1,1,0,1,1,0.723824,0.649519,0.782196,0.7456,...,1413,30.565002,0,0,7422,12604,12604,2000,10162,1753
7,1,1,1,1,1,1,0.70408,0.647882,0.770538,0.716672,...,1413,25.69,0,0,2967,12604,12604,2000,10162,1753
8,1,1,0,0,0,0,0.704254,0.658059,0.812978,0.716202,...,1413,36.218996,0,0,7422,12604,12604,2000,10162,1753
9,1,1,0,1,0,0,0.701096,0.650616,0.813877,0.722212,...,1413,28.417001,0,0,2967,12604,12604,2000,10162,1753


In [146]:
preservedDataFrame.shape

(96, 25)

In [147]:
preservedDataFrame.columns

Index(['TRUNC', 'O-Core', 'O-Core+Super', 'DICT', 'REPRESENT', 'READ',
       'AUC_train', 'AUC_test', 'AUC_train_Onto', 'AUC_test_Onto',
       'AUC_train_Not_Onto', 'AUC_test_Not_Onto', 'AUC_train_Onto_LEN',
       'AUC_train_OntoN_LEN', 'AUC_test_Onto_LEN', 'AUC_test_OntoN_LEN',
       'CycleTime', 'RAND', 'Custom Vocab Len', 'Full Vocab Len',
       'Testing Split', 'Training Data Full', 'Testing Data Full',
       'Training Data Final', 'Testing Data Final'],
      dtype='object')

In [148]:
deltaColumns = [("D_"+x) for x in preservedDataFrame.iloc[:,6:].columns]

In [149]:
combos = list(itertools.combinations(list(preservedDataFrame.index),2))
headers = ['TRUNC','O-Core','O-Core+Super','READ', 'REPRESENT','DICT','RAND']

combolist = [list(x) for x in combos]

pairwiseDataFrame = pd.DataFrame(columns=deltaColumns, index=range(len(combolist)))

m1 = []
m2 = []

for i in combolist:
    m1.append(i[0])
    m2.append(i[1])
    
pairwiseDataFrame['M1'] = m1
pairwiseDataFrame['M2'] = m2

In [150]:
pairwiseDataFrame.shape

(4560, 21)

In [151]:
for i in headers:
    print(i)

TRUNC
O-Core
O-Core+Super
READ
REPRESENT
DICT
RAND


In [152]:
pairwiseDataFrame.columns

Index(['D_AUC_train', 'D_AUC_test', 'D_AUC_train_Onto', 'D_AUC_test_Onto',
       'D_AUC_train_Not_Onto', 'D_AUC_test_Not_Onto', 'D_AUC_train_Onto_LEN',
       'D_AUC_train_OntoN_LEN', 'D_AUC_test_Onto_LEN', 'D_AUC_test_OntoN_LEN',
       'D_CycleTime', 'D_RAND', 'D_Custom Vocab Len', 'D_Full Vocab Len',
       'D_Testing Split', 'D_Training Data Full', 'D_Testing Data Full',
       'D_Training Data Final', 'D_Testing Data Final', 'M1', 'M2'],
      dtype='object')

In [153]:
pairwiseDataFrame.head()

Unnamed: 0,D_AUC_train,D_AUC_test,D_AUC_train_Onto,D_AUC_test_Onto,D_AUC_train_Not_Onto,D_AUC_test_Not_Onto,D_AUC_train_Onto_LEN,D_AUC_train_OntoN_LEN,D_AUC_test_Onto_LEN,D_AUC_test_OntoN_LEN,...,D_RAND,D_Custom Vocab Len,D_Full Vocab Len,D_Testing Split,D_Training Data Full,D_Testing Data Full,D_Training Data Final,D_Testing Data Final,M1,M2
0,,,,,,,,,,,...,,,,,,,,,0,1
1,,,,,,,,,,,...,,,,,,,,,0,2
2,,,,,,,,,,,...,,,,,,,,,0,3
3,,,,,,,,,,,...,,,,,,,,,0,4
4,,,,,,,,,,,...,,,,,,,,,0,5


In [154]:
pairwiseDataFrame.shape

(4560, 21)

In [155]:
breakpoints = np.arange(0,1900000,1000)

In [156]:
breakpoints

array([      0,    1000,    2000, ..., 1897000, 1898000, 1899000])

In [None]:
for k in range(len(breakpoints)-1):
    print("BREAKPOINT {}-{}".format(breakpoints[k],breakpoints[k+1]))
    
    start = breakpoints[k]
    end = breakpoints[k+1]    
    
    fseries = pd.Series()

    for i in tqdm.tqdm(pairwiseDataFrame.index[start:end]):
        m1 = pairwiseDataFrame.loc[i,'M1']
        m2 = pairwiseDataFrame.loc[i,'M2']

        factors = str()
        counter = 0

        if preservedDataFrame.loc[m1,headers[0]] != preservedDataFrame.loc[m2,headers[0]]:
            factors = factors + headers[0] + " "
            counter += 1  

        if preservedDataFrame.loc[m1,headers[1]] != preservedDataFrame.loc[m2,headers[1]]:
            factors = factors + headers[1] + " "
            counter += 1  

        if preservedDataFrame.loc[m1,headers[2]] != preservedDataFrame.loc[m2,headers[2]]:
            factors = factors + headers[2] + " "
            counter += 1   

        if preservedDataFrame.loc[m1,headers[3]] != preservedDataFrame.loc[m2,headers[3]]:
            factors = factors + headers[3] + " "
            counter += 1  

        if preservedDataFrame.loc[m1,headers[4]] != preservedDataFrame.loc[m2,headers[4]]:
            factors = factors + headers[4] + " "
            counter += 1  

        if preservedDataFrame.loc[m1,headers[5]] != preservedDataFrame.loc[m2,headers[5]]:
            factors = factors + headers[5] + " "
            counter += 1  

        if preservedDataFrame.loc[m1,headers[6]] != preservedDataFrame.loc[m2,headers[6]]:
            factors = factors + headers[6] + " "
            counter += 1 
            
        fseries.loc[i] = factors


    pairwiseDataFrame.loc[fseries.index,'Factor'] = fseries


In [159]:
pairwiseDataFrame.head()

Unnamed: 0,D_AUC_train,D_AUC_test,D_AUC_train_Onto,D_AUC_test_Onto,D_AUC_train_Not_Onto,D_AUC_test_Not_Onto,D_AUC_train_Onto_LEN,D_AUC_train_OntoN_LEN,D_AUC_test_Onto_LEN,D_AUC_test_OntoN_LEN,...,D_Custom Vocab Len,D_Full Vocab Len,D_Testing Split,D_Training Data Full,D_Testing Data Full,D_Training Data Final,D_Testing Data Final,M1,M2,Factor
0,,,,,,,,,,,...,,,,,,,,0,1,DICT
1,,,,,,,,,,,...,,,,,,,,0,2,REPRESENT
2,,,,,,,,,,,...,,,,,,,,0,3,REPRESENT DICT
3,,,,,,,,,,,...,,,,,,,,0,4,READ
4,,,,,,,,,,,...,,,,,,,,0,5,READ DICT


In [160]:
pairwiseDataFrame['Factor']

0                                          DICT 
1                                     REPRESENT 
2                                REPRESENT DICT 
3                                          READ 
4                                     READ DICT 
5                                READ REPRESENT 
6                           READ REPRESENT DICT 
7                                  O-Core+Super 
8                             O-Core+Super DICT 
9                        O-Core+Super REPRESENT 
10                  O-Core+Super REPRESENT DICT 
11                            O-Core+Super READ 
12                       O-Core+Super READ DICT 
13                  O-Core+Super READ REPRESENT 
14             O-Core+Super READ REPRESENT DICT 
15                          O-Core O-Core+Super 
16                     O-Core O-Core+Super DICT 
17                O-Core O-Core+Super REPRESENT 
18           O-Core O-Core+Super REPRESENT DICT 
19                     O-Core O-Core+Super READ 
20                O-

In [161]:
pairwiseDataFrame.to_csv('{}_pairwise.csv'.format(today))

In [162]:
pairwiseDataFrame = pd.read_csv('{}_pairwise.csv'.format(today),index_col=0)

In [163]:
pairwiseDataFrame.head()

Unnamed: 0,D_AUC_train,D_AUC_test,D_AUC_train_Onto,D_AUC_test_Onto,D_AUC_train_Not_Onto,D_AUC_test_Not_Onto,D_AUC_train_Onto_LEN,D_AUC_train_OntoN_LEN,D_AUC_test_Onto_LEN,D_AUC_test_OntoN_LEN,...,D_Custom Vocab Len,D_Full Vocab Len,D_Testing Split,D_Training Data Full,D_Testing Data Full,D_Training Data Final,D_Testing Data Final,M1,M2,Factor
0,,,,,,,,,,,...,,,,,,,,0,1,DICT
1,,,,,,,,,,,...,,,,,,,,0,2,REPRESENT
2,,,,,,,,,,,...,,,,,,,,0,3,REPRESENT DICT
3,,,,,,,,,,,...,,,,,,,,0,4,READ
4,,,,,,,,,,,...,,,,,,,,0,5,READ DICT


In [164]:
splits = pairwiseDataFrame['Factor'].str.split()

In [165]:
pairwiseDataFrame['FactorLength'] = [len(x) for x in splits]

In [166]:
pairwiseDataFrame.head()

Unnamed: 0,D_AUC_train,D_AUC_test,D_AUC_train_Onto,D_AUC_test_Onto,D_AUC_train_Not_Onto,D_AUC_test_Not_Onto,D_AUC_train_Onto_LEN,D_AUC_train_OntoN_LEN,D_AUC_test_Onto_LEN,D_AUC_test_OntoN_LEN,...,D_Full Vocab Len,D_Testing Split,D_Training Data Full,D_Testing Data Full,D_Training Data Final,D_Testing Data Final,M1,M2,Factor,FactorLength
0,,,,,,,,,,,...,,,,,,,0,1,DICT,1
1,,,,,,,,,,,...,,,,,,,0,2,REPRESENT,1
2,,,,,,,,,,,...,,,,,,,0,3,REPRESENT DICT,2
3,,,,,,,,,,,...,,,,,,,0,4,READ,1
4,,,,,,,,,,,...,,,,,,,0,5,READ DICT,2


In [167]:
pairwiseDataFrame.shape

(4560, 23)

In [168]:
pairwiseDataFrame['Neighbor'] = np.where(pairwiseDataFrame['FactorLength']==1,1,0)

In [169]:
pairwiseDataFrame.head()

Unnamed: 0,D_AUC_train,D_AUC_test,D_AUC_train_Onto,D_AUC_test_Onto,D_AUC_train_Not_Onto,D_AUC_test_Not_Onto,D_AUC_train_Onto_LEN,D_AUC_train_OntoN_LEN,D_AUC_test_Onto_LEN,D_AUC_test_OntoN_LEN,...,D_Testing Split,D_Training Data Full,D_Testing Data Full,D_Training Data Final,D_Testing Data Final,M1,M2,Factor,FactorLength,Neighbor
0,,,,,,,,,,,...,,,,,,0,1,DICT,1,1
1,,,,,,,,,,,...,,,,,,0,2,REPRESENT,1,1
2,,,,,,,,,,,...,,,,,,0,3,REPRESENT DICT,2,0
3,,,,,,,,,,,...,,,,,,0,4,READ,1,1
4,,,,,,,,,,,...,,,,,,0,5,READ DICT,2,0


In [170]:
pairwiseDataFrame.shape

(4560, 24)

In [171]:
pairwiseDataFrame.to_csv('{}_pairwise.csv'.format(today))

In [172]:
pairwiseDataFrame = pd.read_csv('{}_pairwise.csv'.format(today),index_col=0)

In [173]:
filteredPairWise = pairwiseDataFrame.loc[pairwiseDataFrame['Neighbor']==1,].copy()

In [174]:
filteredPairWise.shape

(304, 24)

In [175]:
filteredPairWise['Factor'] = filteredPairWise['Factor'].str.strip()

In [176]:
filteredPairWise.head()

Unnamed: 0,D_AUC_train,D_AUC_test,D_AUC_train_Onto,D_AUC_test_Onto,D_AUC_train_Not_Onto,D_AUC_test_Not_Onto,D_AUC_train_Onto_LEN,D_AUC_train_OntoN_LEN,D_AUC_test_Onto_LEN,D_AUC_test_OntoN_LEN,...,D_Testing Split,D_Training Data Full,D_Testing Data Full,D_Training Data Final,D_Testing Data Final,M1,M2,Factor,FactorLength,Neighbor
0,,,,,,,,,,,...,,,,,,0,1,DICT,1,1
1,,,,,,,,,,,...,,,,,,0,2,REPRESENT,1,1
3,,,,,,,,,,,...,,,,,,0,4,READ,1,1
7,,,,,,,,,,,...,,,,,,0,8,O-Core+Super,1,1
23,,,,,,,,,,,...,,,,,,0,24,TRUNC,1,1


In [177]:
filteredPairWise.shape

(304, 24)

In [178]:
filteredPairWise.columns

Index(['D_AUC_train', 'D_AUC_test', 'D_AUC_train_Onto', 'D_AUC_test_Onto',
       'D_AUC_train_Not_Onto', 'D_AUC_test_Not_Onto', 'D_AUC_train_Onto_LEN',
       'D_AUC_train_OntoN_LEN', 'D_AUC_test_Onto_LEN', 'D_AUC_test_OntoN_LEN',
       'D_CycleTime', 'D_RAND', 'D_Custom Vocab Len', 'D_Full Vocab Len',
       'D_Testing Split', 'D_Training Data Full', 'D_Testing Data Full',
       'D_Training Data Final', 'D_Testing Data Final', 'M1', 'M2', 'Factor',
       'FactorLength', 'Neighbor'],
      dtype='object')

In [179]:
preservedDataFrame.columns

Index(['TRUNC', 'O-Core', 'O-Core+Super', 'DICT', 'REPRESENT', 'READ',
       'AUC_train', 'AUC_test', 'AUC_train_Onto', 'AUC_test_Onto',
       'AUC_train_Not_Onto', 'AUC_test_Not_Onto', 'AUC_train_Onto_LEN',
       'AUC_train_OntoN_LEN', 'AUC_test_Onto_LEN', 'AUC_test_OntoN_LEN',
       'CycleTime', 'RAND', 'Custom Vocab Len', 'Full Vocab Len',
       'Testing Split', 'Training Data Full', 'Testing Data Full',
       'Training Data Final', 'Testing Data Final'],
      dtype='object')

In [180]:
for i in tqdm.tqdm(filteredPairWise.index):
    for k in filteredPairWise.columns[:-5]:
        m1 = filteredPairWise.loc[i,'M1']
        m2 = filteredPairWise.loc[i,'M2']
        factor = filteredPairWise.loc[i,'Factor']
        try:
            if preservedDataFrame.loc[m1,factor] > preservedDataFrame.loc[m2,factor]:
                filteredPairWise.loc[i,k] = preservedDataFrame.loc[m1,k[2:]] - preservedDataFrame.loc[m2,k[2:]]
            else:
                filteredPairWise.loc[i,k] = preservedDataFrame.loc[m2,k[2:]] - preservedDataFrame.loc[m1,k[2:]]
        except:
            continue



100%|██████████| 304/304 [00:02<00:00, 123.83it/s]


In [181]:
filteredPairWise.head()

Unnamed: 0,D_AUC_train,D_AUC_test,D_AUC_train_Onto,D_AUC_test_Onto,D_AUC_train_Not_Onto,D_AUC_test_Not_Onto,D_AUC_train_Onto_LEN,D_AUC_train_OntoN_LEN,D_AUC_test_Onto_LEN,D_AUC_test_OntoN_LEN,...,D_Testing Split,D_Training Data Full,D_Testing Data Full,D_Training Data Final,D_Testing Data Final,M1,M2,Factor,FactorLength,Neighbor
0,-0.006353,0.003992,-0.003085,0.001346,-0.007575,0.006743,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,1,DICT,1,1
1,0.013433,0.022359,-0.018285,-0.002981,0.02334,0.028472,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,2,REPRESENT,1,1
3,-0.009409,-0.014576,0.007286,0.038275,-0.010127,-0.035304,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,4,READ,1,1
7,0.008373,0.000731,0.004001,0.013654,0.010689,-0.002168,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,8,O-Core+Super,1,1
23,0.011228,0.033588,-0.017959,0.003631,-0.000128,0.029411,1201.0,-1201.0,209.0,-209.0,...,0.0,0.0,0.0,0.0,0.0,0,24,TRUNC,1,1


In [182]:
filteredPairWise.shape

(304, 24)

In [183]:
filteredPairWise.to_csv('{}_filteredPairWise.csv'.format(today))

In [184]:
filteredPairWise.loc[filteredPairWise['Factor']=='O-Core',].describe()[1:3].transpose()*100

Unnamed: 0,mean,std
D_AUC_train,0.866374,0.82597
D_AUC_test,0.238595,0.783248
D_AUC_train_Onto,7.646677,3.262931
D_AUC_test_Onto,6.42681,2.847169
D_AUC_train_Not_Onto,-1.675274,0.740432
D_AUC_test_Not_Onto,-2.486323,1.080627
D_AUC_train_Onto_LEN,243250.0,61010.861011
D_AUC_train_OntoN_LEN,772950.0,61010.861011
D_AUC_test_Onto_LEN,44450.0,10617.210617
D_AUC_test_OntoN_LEN,130850.0,10617.210617


In [185]:
filteredPairWise.loc[:,['Factor','D_AUC_train','D_AUC_test','D_AUC_train__Onto','D_AUC_test__Onto',]].sort_values(by='D_AUC_test',ascending=False).groupby('Factor').describe()

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


Unnamed: 0_level_0,D_AUC_test,D_AUC_test,D_AUC_test,D_AUC_test,D_AUC_test,D_AUC_test,D_AUC_test,D_AUC_test,D_AUC_test__Onto,D_AUC_test__Onto,...,D_AUC_train,D_AUC_train,D_AUC_train__Onto,D_AUC_train__Onto,D_AUC_train__Onto,D_AUC_train__Onto,D_AUC_train__Onto,D_AUC_train__Onto,D_AUC_train__Onto,D_AUC_train__Onto
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Factor,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
DICT,48.0,-0.003197,0.010142,-0.034942,-0.009256,-0.003648,0.004984,0.014587,0.0,,...,0.001,0.012956,0.0,,,,,,,
O-Core,32.0,0.002386,0.007832,-0.010105,-0.003225,0.00159,0.005636,0.021869,0.0,,...,0.015727,0.029259,0.0,,,,,,,
O-Core+Super,32.0,0.001154,0.005003,-0.010379,-0.002449,0.001353,0.004869,0.012165,0.0,,...,0.001584,0.009733,0.0,,,,,,,
RAND,48.0,0.004587,0.010032,-0.018929,-0.000477,0.00492,0.010752,0.027761,0.0,,...,0.002707,0.027287,0.0,,,,,,,
READ,48.0,-0.023203,0.009845,-0.038659,-0.031895,-0.023738,-0.014459,0.001918,0.0,,...,0.013142,0.023327,0.0,,,,,,,
REPRESENT,48.0,0.012002,0.010824,-0.015564,0.006547,0.012238,0.019525,0.031153,0.0,,...,0.037894,0.066563,0.0,,,,,,,
TRUNC,48.0,0.035339,0.012115,0.008978,0.026802,0.037289,0.043767,0.060891,0.0,,...,0.04882,0.061448,0.0,,,,,,,


In [186]:
filteredPairWise.loc[filteredPairWise['Factor']=='O-Core',].shape

(32, 24)

In [187]:
filteredPairWise.sort_values(by='D_AUC_test',ascending=False).groupby('Factor').describe().to_csv("{}_consolidated.csv".format(today))

# ONLY RUN THIS CODE WHEN PRODUCING A SINGLE MODEL-RUN

### This section is useful for analyzing appropriate LDA topic model sizes and Doc2Vec vector performance

In [188]:
# for i in trainDataFinal['filteredTexts'][:20]:
#     print(i)
#     print()
    
# print(trainDataFinal.loc[11639,"tokenizedTexts"])

In [189]:
# tokenizedTexts = trainDataFinal.loc[:,"tokenizedTexts"].copy()

# cdict = Dictionary(tokenizedTexts)

# tokenizedList_bow = [cdict.doc2bow(x) for x in tokenizedTexts]

# x = list(np.arange(2,200,10))
# perp = []

# for i in tqdm.tqdm(x):
#     print(i)
#     ldaModel = LdaModel(
#         corpus=tokenizedList_bow,
#         id2word=cdict, 
#         num_topics=i,
#         random_state=42,
#         passes=5,
#         alpha=1.0/i,
#         eta=1.0/i, 
#         eval_every=1000, 
#         iterations=5, 
#     )
#     p = ldaModel.log_perplexity(tokenizedList_bow)
#     print(p)
#     perp.append(p)
    
# sns.set(font_scale=1.25)
# plt.figure(figsize=(12,8))
# ax = sns.lineplot(x=x,y=perp,palette='colorblind')
# ax.set_title("LDA Perplexity")
# ax.set(xlabel='Number of Topics in LDA Model', ylabel='Perplexity')
# ax.spines['bottom'].set_color('0.25')
# ax.spines['top'].set_color('0.25')
# ax.spines['right'].set_color('0.25')
# ax.spines['left'].set_color('0.25')
# plt.savefig("plots/LDAperplexity.png")
# plt.show()

In [190]:
# df = pd.DataFrame(columns=[
#     'Tag',
#     'Train Tagged',
#     'Train Not Tagged',
#     'Train diff',
#     'Train ttest',
#     'Train pvalue',
#     'Train Tagged Count',
#     'Train Not Tagged Count',
#     'Test Tagged',
#     'Test Not Tagged',
#     'Test diff',
#     'Test ttest',
#     'Test pvalue',
#     'Test Tagged Count',
#     'Test Not Tagged Count',
#     'vs tagEASIER',
#     'vs tagHARDER',
#     'Intended Label',
#     'Actual Label',
# ])

# cosinePerformance = []

# for i in tagVectors:
#     tag = str(i[0])
#     testList = []
#     testNOTList = []
#     trainList = []
#     trainNOTList = []
#     for k in trainDataFinal.loc[trainFinalRep.index,:].index:
#         if tag in trainDataFinal.loc[k,'manualTags']:
#             trainList.append(k)
#         else:
#             trainNOTList.append(k)
    
#     for k in testDataFinal.loc[testFinalRep.index,:].index:
#         if tag in testDataFinal.loc[k,'manualTags']:
#             testList.append(k)
#         else:
#             testNOTList.append(k)
    
#     reps = pd.concat([trainFinalRep,testFinalRep])
    
    
#     testListCos = []
#     testNOTListCos = []
#     trainListCos = []
#     trainNOTListCos = []    
    
    
#     for k in testList:
#         testListCos.append(cosine_similarity([i[1],reps.loc[k,:]])[0,1])
                           
#     for k in testNOTList:
#         testNOTListCos.append(cosine_similarity([i[1],reps.loc[k,:]])[0,1])
                              
#     for k in trainList:
#         trainListCos.append(cosine_similarity([i[1],reps.loc[k,:]])[0,1])
                            
#     for k in trainNOTList:
#         trainNOTListCos.append(cosine_similarity([i[1],reps.loc[k,:]])[0,1])

    
#     testTTEST = ttest_ind(a=testListCos,b=testNOTListCos,equal_var=False)
#     trainTTEST = ttest_ind(a=trainListCos,b=trainNOTListCos,equal_var=False)
    
#     vseasy = cosine_similarity([i[1],d2vModel['tagEASIER']])[0,1]
#     vshard = cosine_similarity([i[1],d2vModel['tagHARDER']])[0,1]
    
#     if i[0] in tags[:11]:
#         label = 'EASY'
#     else:
#         label = 'HARD'
        
#     if vseasy > vshard:
#         actual = 'EASY'
#     else:
#         actual = 'HARD'
    
    
#     sample = {
#         'Tag':i[0],
        
#         'Train Tagged':np.mean(trainListCos),
#         'Train Not Tagged':np.mean(trainNOTListCos),
#         'Train diff':np.abs(np.mean(trainListCos)-np.mean(trainNOTListCos)),
        
#         'Train ttest':trainTTEST[0],
#         'Train pvalue':trainTTEST[1],
#         'Train Tagged Count':len(trainList),
#         'Train Not Tagged Count':len(trainNOTList),
        
#         'Test Tagged':np.mean(testListCos),
#         'Test Not Tagged':np.mean(testNOTListCos),
#         'Test diff':np.abs(np.mean(testListCos)-np.mean(testNOTListCos)),
        
#         'Test ttest':testTTEST[0],
#         'Test pvalue':testTTEST[1],
#         'Test Tagged Count':len(testList),
#         'Test Not Tagged Count':len(testNOTList),
        
#         'vs tagEASIER':vseasy,
#         'vs tagHARDER':vshard,
#         'Intended Label':label,
#         'Actual Label':actual,
        
#     }
# #     print(sample)
#     cosinePerformance.append(sample)
# pd.DataFrame(cosinePerformance,columns=df.columns).to_csv('ttestOntology.csv')