In [None]:
import pandas as ps
import os
import numpy as np 
from transformers import BertTokenizer, TFBertForMaskedLM
import tensorflow as tf
import transformers
import glob

In [None]:
def mapTextExtract(tweet):
    """Return the tweet's text flattened onto a single line.

    Each newline inside ``tweet['text']`` is replaced by ". " and one
    trailing newline character is appended, so every tweet occupies exactly
    one line when the results are concatenated.
    """
    flattened = ". ".join(tweet['text'].split("\n"))
    return flattened + "\n"


In [None]:
def mapConvExtract(tweet):
    """Return the tweet's ``'conversation_id'``, or ``'nope'`` when it cannot be read.

    The ``'nope'`` placeholder is returned when the lookup fails because the
    key is missing (``LookupError``) or the record is not subscriptable
    (``TypeError``). Any other exception, including ``KeyboardInterrupt``,
    is allowed to propagate instead of being silently turned into a
    placeholder.
    """
    try:
        return tweet['conversation_id']
    except (LookupError, TypeError):
        return 'nope'

In [None]:
def mapIdExtract(tweet):
    """Return the value stored under the ``'id'`` key of ``tweet``."""
    tweetId = tweet['id']
    return tweetId

In [None]:
def processSingleYearSingleVariable(tuple):
    """Gather the text of every tweet file for one (year, variable) pair into one text file.

    ``tuple`` is read positionally: element 0 is the year and element 1 the
    variable name. If the output file is already present, a message is
    printed and nothing else happens.

    NOTE(review): a later cell defines another function with this same name
    and a different signature, so this definition is shadowed once that cell
    has run. ``writeToText`` is not defined in the visible part of the
    notebook - confirm it exists before relying on this function.
    """
    year, variable = tuple[0], tuple[1]
    yearName, variableName = str(year), str(variable)
    folder = "Z:/Aspire/TweetStore/" + yearName + "/" + variableName + "/"
    outputFile = "H:/Aspire/BERT/BERT_text/txt_" + yearName + "_" + variableName + ".txt"
    if os.path.exists(outputFile):
        print("%s already exists" %(outputFile))
        return
    tweetFiles = glob.glob(folder + "tw_*")
    print("found %i files for variable %s and year %i" %(len(tweetFiles),variable,year))
    text = []
    for dayFile in tweetFiles:
        text.extend(combineSingleDay(dayFile))
    writeToText(text,outputFile)

In [None]:
def loadSingleBatch(filename):
    tweetBatch = np.load(filename,allow_pickle=True)
    text,convId,tweetId = [],[],[]
    for subBatch in tweetBatch:
        text +=  list(map(mapTextExtract,subBatch))
        convId += list(map(mapConvExtract,subBatch))
        tweetId += list(map(mapIdExtract,subBatch))
    df = ps.DataFrame({
        'text':text,
        'convId':convId,
        'tweetId':tweetId
    })
    if(df.count()[0]>0):
        df = df[~df['convId'].str.contains('nope')]
    return(df)


In [None]:
def setupModelInputs(tweetData,debug=False):
    """Tokenise the ``text`` column of ``tweetData`` with the module-level ``TOKEN``.

    The tokenizer is called with ``max_length=100``, ``truncation=True``,
    ``padding='max_length'`` and ``return_tensors="tf"``. The ``input_ids``
    entry of the result is then passed through ``tf.convert_to_tensor``.

    ``debug`` is accepted for interface compatibility but is not used.

    Returns the tokenizer output object.
    """
    tokenizerOptions = dict(
        max_length=100,
        truncation=True,
        padding='max_length',
        return_tensors="tf",
    )
    encoded = TOKEN(list(tweetData['text']), **tokenizerOptions)
    encoded['input_ids'] = tf.convert_to_tensor(encoded['input_ids'])
    return encoded

In [None]:
def combineSingleDay(inFile):
    """Return the one-line text of every tweet stored in ``inFile``.

    The file is loaded with ``np.load(..., allow_pickle=True)`` and treated
    as a sequence of batches. Each tweet in each batch is converted with
    ``mapTextExtract``. The number of tweets found is printed before the
    list is returned.
    """
    dayBatches = np.load(inFile,allow_pickle=True)
    text = [mapTextExtract(tweet) for batch in dayBatches for tweet in batch]
    print("found %i tweets for day %s" %(len(text),inFile))
    return text

In [None]:
def processSingleYearSingleVariable(year,variable,model):
    """Run ``model`` over each tweet file of one year/variable that has no prediction CSV yet.

    Input files are the ``tw_*`` entries under the TweetStore folder for
    ``year``/``variable``. For each one, the matching output path is
    ``re_<stamp>.csv`` under the PredStore folder. Files whose output is
    already present are reported and skipped; the rest are handed to
    ``processSingleBatch``.

    The output folder is not created here; the caller must make sure it
    exists.
    """
    subPath = str(year) + "/" + str(variable) + "/"
    inFolder = "/mnt/z/Aspire/TweetStore/" + subPath
    outFolder = "/mnt/h/Aspire/PredStore/" + subPath
    for tweetFile in glob.glob(inFolder + "tw_*"):
        # The stamp is the text between the last "_" in the path and the
        # final four characters (the file extension).
        dateStamp = tweetFile[tweetFile.rfind('_')+1:-4]
        outputFile = outFolder + "re_" + dateStamp + ".csv"
        if not os.path.exists(outputFile):
            processSingleBatch(tweetFile,outputFile,model)
            continue
        print("%s already exists" %(outputFile))

In [None]:
# Load the saved Keras model. TFBertModel is registered as a custom object so
# the checkpoint can be deserialised.
modelFile = '/mnt/h/Aspire/BERT/health/health_model_checkpoint.h5'
relaxedBERT = tf.keras.models.load_model(
    modelFile,
    custom_objects={"TFBertModel": transformers.TFBertModel},
)
# Compile with binary cross-entropy, Adam (learning rate 1e-4) and accuracy.
relaxedBERT.compile(
    loss=[tf.keras.losses.BinaryCrossentropy()],
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    metrics=['accuracy'],
)
# Tokenizer used by setupModelInputs (looked up as the global TOKEN at call time).
TOKEN = BertTokenizer.from_pretrained('/mnt/h/Aspire/BERT/health/expandedTokenBase')

In [None]:
def processSingleConvFolder(folder,model):
    """Run ``model`` over each conversation file in ``folder`` that has no prediction CSV yet.

    Input files are the ``tw_*`` entries under the TweetStore conversations
    folder named ``folder``. Outputs are written as ``re_<id>.csv`` under the
    matching PredStore conversations folder. Files whose output is already
    present are skipped without any message.

    The output folder is not created here; the caller must make sure it
    exists.
    """
    inFolder = "/mnt/z/Aspire/TweetStore/conversations/" + folder + "/"
    outFolder = "/mnt/h/Aspire/PredStore/conversations/" + folder + "/"
    for tweetFile in glob.glob(inFolder + "tw_*"):
        # The id is the text between the last "_" in the path and the final
        # four characters (the file extension).
        convId = tweetFile[tweetFile.rfind('_')+1:-4]
        outputFile = outFolder + "re_" + convId + ".csv"
        if os.path.exists(outputFile):
            continue
        processSingleBatch(tweetFile,outputFile,model)

In [None]:
def processSingleBatch(inFile,outFile,model):
    """Score every tweet in ``inFile`` with ``model`` and write the scores to ``outFile`` as CSV.

    Parameters
    ----------
    inFile : path of the tweet file, loaded through ``loadSingleBatch``.
    outFile : path of the CSV to write (written without the index column).
    model : object whose ``predict`` is called with
        ``[input_ids, attention_mask]`` and whose result is indexed as a 2-D
        array with at least five columns.

    Nothing is written when the loaded frame has no rows. The CSV holds the
    columns of the loaded frame except ``text``, followed by ``isCognitive``,
    ``isEmotional``, ``isPhysical``, ``isPositive`` and ``isNegative``
    (prediction columns 0-4 after a sigmoid).
    """
    df = loadSingleBatch(inFile)
    if len(df) == 0:
        return
    modelInputs = setupModelInputs(df)
    preds = model.predict([modelInputs['input_ids'],modelInputs['attention_mask']])
    # Apply the sigmoid once to the whole prediction matrix and convert to
    # NumPy, so plain arrays (not TF tensors) are stored in the frame.
    # NOTE(review): this assumes the model outputs logits. If its final layer
    # already applies a sigmoid, the scores are squashed twice - confirm
    # against the model definition.
    scores = np.asarray(tf.nn.sigmoid(preds))
    labelColumns = ['isCognitive','isEmotional','isPhysical','isPositive','isNegative']
    # drop() returns a new frame, so the score columns are written to an
    # independent object rather than to the filtered frame that was loaded.
    results = df.drop(['text'],axis=1)
    for position, column in enumerate(labelColumns):
        results[column] = scores[:,position]
    results.to_csv(outFile,index=False)

In [None]:
# Diagnostic: print the running position of every tweet in one day's file
# whose 'conversation_id' cannot be read.
# SECURITY: allow_pickle=True can execute arbitrary code embedded in the file;
# only load files produced by a trusted pipeline.
a = np.load("/mnt/z/Aspire/TweetStore/2016/health/tw_20160204.npy",allow_pickle=True)
index=0
for b in a:
    for c in b:
        try:
            d = c['conversation_id']
        # Only lookup failures count as "missing"; a kernel interrupt or any
        # other error must stop the scan rather than be printed as an index.
        except (LookupError, TypeError):
            print(index)
        index+=1

In [None]:
# Score every conversation folder, creating the matching output folder first.
parentConvFolder = '/mnt/z/Aspire/TweetStore/conversations/'
outputFolder = '/mnt/h/Aspire/PredStore/conversations/'
conversationFolders = os.listdir(parentConvFolder)
index = 0
for folder in conversationFolders:
    curFolder = parentConvFolder + folder + "/"
    # makedirs(exist_ok=True) also creates a missing parent folder and does
    # not fail if the folder appears between a check and the creation.
    os.makedirs(outputFolder + folder, exist_ok=True)
    processSingleConvFolder(folder,relaxedBERT)
    index+=1
    # Progress marker every 100 folders.
    if(index%100==0):
        print(index)

In [14]:
# Score every category for each year from 2018 down to 2014, creating the
# output folders on demand.
for year in reversed(range(2014,2019)):
    yearFolder = '/mnt/h/Aspire/PredStore/' + str(year) + "/"
    # makedirs(exist_ok=True) replaces the check-then-mkdir pair and also
    # creates a missing PredStore parent folder.
    os.makedirs(yearFolder, exist_ok=True)
    for cat in ['age','place','env','health','health2']:
        print("calculating values for year %i and category %s" %(year,cat))
        # yearFolder already ends with "/", so no extra separator is added.
        catFolder = yearFolder + str(cat)
        os.makedirs(catFolder, exist_ok=True)
        processSingleYearSingleVariable(year,cat,relaxedBERT)

   2/1137 [..............................] - ETA: 1:21 

2023-10-31 15:50:15.842729: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:630] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


