In [9]:
import pandas as pd
import _pickle as cPickle
import os

# os.environ['KMP_DUPLICATE_LIB_OK']='True'
from sentence_transformers import SentenceTransformer

def _log_both(log_file, *values):
    """Print ``values`` to stdout and append the same line to ``log_file``."""
    print(*values)
    print(*values, file=log_file)


def load_data(file_name, col_name, log_path = 'output/tmp.txt'):
    """
    Read a CSV file and return one text column plus the cleaned 'sign' label.

    Progress summaries (shapes, head, column info, null counts, label
    distribution) are printed to stdout and appended to the log file.

    file_name: path of the CSV file to read; it must contain a 'sign'
               column and the column named by col_name
    col_name:  name of the text column to return as X
    log_path:  text file the summaries are appended to

    return: (X, y) -- X is the stripped text of col_name with missing
            values replaced by '', y is the first space-separated token of
            'sign'. Rows with a missing 'sign' are dropped and both Series
            share a fresh 0..n-1 index.
    """
    # A context manager guarantees the log is closed even if a step raises.
    with open(log_path, 'a', encoding='utf-8') as output_file:
        df = pd.read_csv(file_name)

        _log_both(output_file, "\n********** Data Summary **********\n")
        _log_both(output_file, df.shape, "\n")
        _log_both(output_file, df.head(3), "\n")
        # DataFrame.info() writes its report to a buffer and returns None, so
        # it has to be directed at each stream instead of being print()-ed.
        df.info()
        df.info(buf=output_file)
        _log_both(output_file, "\n")

        # NOTE: de-duplication is currently disabled (it used to drop
        # duplicate 'pmid' rows), so the shape reported here equals the raw
        # shape. The banner text is kept so existing logs stay comparable.
        _log_both(output_file, "\n********** Data Shape after Removing Duplicates **********\n")
        _log_both(output_file, df.shape, "\n")

        ## clean the sign column: keep only the first space-separated token
        df['sign'] = df['sign'].apply(lambda x: str(x).split(' ')[0])
        # Missing labels were stringified to 'nan' above; drop those rows.
        df = df[df['sign'] != 'nan']

        # Copy so the assignments below modify an independent frame rather
        # than a view of the filtered one (avoids chained-assignment warnings).
        df = df[['sign', col_name]].copy()
        df[col_name] = df[col_name].fillna('')

        ## Check if any columns contain null values
        _log_both(output_file, "\n********** Count of Null Values for Each Column **********\n")
        _log_both(output_file, df.isnull().sum(), "\n")

        ## Drop instances including null values
        df = df.dropna()

        _log_both(output_file, "\n********** Data Shape after Removing Null Values **********\n")
        _log_both(output_file, df.shape, "\n")

        _log_both(output_file, "\n********** Class Label Distribution **********\n")
        _log_both(output_file, df["sign"].value_counts())

    ## Trim unnecessary spaces for strings
    df[col_name] = df[col_name].apply(lambda x: str(x).strip())
    df = df.reset_index(drop=True)
    ## Split into X and y (target)
    X, y = df.loc[:, col_name], df.loc[:, 'sign']
    return X, y


# Encode every essay column with a pretrained sentence-transformer model and
# pickle one embedding matrix per column, plus the shared label vector.
transformer = SentenceTransformer('paraphrase-MiniLM-L6-v2')
data_path = '../data/okcupid_profiles.csv'
log_path = '../output/tmp.txt'
embedding_dir = '../embeddings/transformer'

# Create the output locations up front so a fresh checkout does not fail on
# the first write.
os.makedirs(os.path.dirname(log_path), exist_ok=True)
os.makedirs(embedding_dir, exist_ok=True)

for col_name in [f'essay{i}' for i in range(0, 10)]:
    print(f'embedding {col_name}')
    # load_data re-reads the CSV on each pass; its row filtering depends only
    # on 'sign', so y comes out the same for every column.
    X, y = load_data(data_path, col_name, log_path)
    # Hand encode() a plain list of strings rather than a pandas Series.
    X = transformer.encode(X.tolist())
    with open(f"{embedding_dir}/transformer_{col_name}.pickle", "wb") as pickle_file:
        cPickle.dump(X, pickle_file)

# The labels are written once, using y from the final loop iteration.
with open(f"{embedding_dir}/y.pickle", "wb") as pickle_file:
    cPickle.dump(y, pickle_file)
print(y.shape)

embedding essay0

********** Data Summary **********

(59946, 31) 

   age     status sex orientation       body_type               diet  \
0   22     single   m    straight  a little extra  strictly anything   
1   35     single   m    straight         average       mostly other   
2   38  available   m    straight            thin           anything   

     drinks      drugs                       education     ethnicity  ...  \
0  socially      never   working on college/university  asian, white  ...   
1     often  sometimes           working on space camp         white  ...   
2  socially        NaN  graduated from masters program           NaN  ...   

                                              essay0  \
0  about me:  i would love to think that i was so...   
1  i am a chef: this is what that means. 1. i am ...   
2  i'm not ashamed of much, but writing public te...   

                                              essay1  \
0  currently working as an international agent fo... 

In [11]:
# Load the pickled essay1 embedding matrix and show its shape.
# The original f-string nested double quotes inside a double-quoted literal,
# which only parses on Python 3.12+; the plain literal below is the same path.
# NOTE(review): this path has no '../' prefix, unlike the cell that wrote the
# file -- confirm the working directory this cell is meant to run from.
# Only unpickle files produced by this project; pickle is unsafe on untrusted data.
with open("embeddings/transformer/transformer_essay1.pickle", "rb") as input_file:
    e1 = cPickle.load(input_file)
e1.shape

(48890, 384)

In [ ]:
# Report the shape of each saved essay embedding matrix.
# NOTE(review): the loop starts at 1, so essay0 is not checked -- confirm
# whether that is intentional.
for i in range(1, 10):
    with open(f"../embeddings/transformer/transformer_essay{i}.pickle", "rb") as input_file:
        e = cPickle.load(input_file)
    # A bare `e.shape` expression inside a loop body is discarded, so print it.
    print(f"essay{i}", e.shape)