# Part 1: Environment Setup

In [20]:
import os
os.getcwd()
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from IPython.display import display
import pickle
import sys
import torch
import torch.nn as nn
sys.path.append("..")
def extract(path):
    print(os.getcwd())
    file = open(path, 'rb')
    object_file = pickle.load(file)
    file.close()
    return object_file

def save_dataset(item, dir, name):
    if not os.path.exists(dir):
        os.makedirs(dir)
    path = dir+"/"+name+".pickle"
    pickle.dump(item, open(path, 'wb'))


In [21]:
import re
def clear_text(data):
    cleaned_text=[]
    for sentence in data:
        sentence=sentence.lower()
        # removing links from text data
        sentence=re.sub('https?://[^\s<>"]+|www\.[^\s<>"]+',' ',sentence)
    
        # removing other symbols
        sentence=re.sub('[^0-9a-z]',' ',sentence)
        cleaned_text.append(sentence)
    return cleaned_text

In [22]:
def expand(np_data):
    temp=[]
    for i in range(len(np_data)):
        temp.append(np.array(np_data[i]))
    return temp

In [23]:
class Lemmatizer(object):
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
    def __call__(self, sentence):
        return [self.lemmatizer.lemmatize(word) for word in sentence.split() if len(word)>2]

vectorizer_mbti=extract('./bow/mbti_vectorizer.pickle')

/Users/jingjietan/Desktop/PRaware/model_aware/personality


In [26]:
# # Part 3 Model Training
class CustomNetwork(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 5)
        self.fc2 = nn.Linear(5, 5)
        self.fc3 = nn.Linear(5, 1)
        
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        return x
    
def personality_inference(network, series_data):
    predictions = []

    for item in series_data:
        # Convert the item to a tensor and perform necessary preprocessing
        tensor_item = torch.tensor(item).type(torch.FloatTensor)
        tensor_item = tensor_item.to("mps")

        # Forward pass through your network
        outs = network(tensor_item)
        outs = outs.view(-1)
        
        # Apply the threshold (0.5 in your case) to get binary predictions
        preds = outs > 0.5

        # Append the prediction to the list
        preds = preds.cpu().numpy()
        predictions.append(preds)
    
    return predictions


# sentiment

In [28]:
imdb_dataset = pd.read_csv('../../corpus/imdb.csv')
imdb_dataset['sentiment'] = imdb_dataset['sentiment'].map({'positive': 1, 'negative': 0})
imdb_dataset.review = clear_text(imdb_dataset.review)
# use vectorizer_mbti process the review and the content into another column('bow')
imdb_dataset['bow'] = expand(vectorizer_mbti.transform(imdb_dataset.review).toarray())
for dimension in ['O','C','E','A']:
    print(dimension)
    network = torch.load('./bow/'+dimension+'.pt')
    network.eval()
    imdb_dataset[dimension] = personality_inference(network, imdb_dataset["bow"])
# make True to 1 and False to 0
for dimension in ['O','C','E','A']:
    imdb_dataset[dimension] = imdb_dataset[dimension].apply(lambda x: 1 if x else 0)
imdb_dataset = imdb_dataset.drop(columns=['bow'])
save_dataset(imdb_dataset, "../../corpus/personality-aware-sentiment", "bow-imdb")

O
C
E
A


In [41]:
moviereview_dataset = pd.read_csv('../../corpus/movie-review.csv')
moviereview_dataset.content = clear_text(moviereview_dataset.content)
# use vectorizer_mbti process the review and the content into another column('bow')
moviereview_dataset['bow'] = expand(vectorizer_mbti.transform(moviereview_dataset.content).toarray())
for dimension in ['O','C','E','A']:
    print(dimension)
    network = torch.load('./bow/'+dimension+'.pt')
    network.eval()
    moviereview_dataset[dimension] = personality_inference(network, moviereview_dataset["bow"])

# make True to 1 and False to 0
for dimension in ['O','C','E','A']:
    moviereview_dataset[dimension] = moviereview_dataset[dimension].apply(lambda x: 1 if x else 0)
moviereview_dataset = moviereview_dataset.drop(columns=['bow'])

save_dataset(moviereview_dataset, "../../corpus/personality-aware-sentiment", "bow-movie-review")
moviereview_dataset.head()

O
C
E
A


Unnamed: 0,content,label,O,C,E,A
0,bad bad bad that one word seems to pre...,0,1,0,1,1
1,isn t it the ultimate sign of a movie s cinema...,0,1,1,0,1
2,gordy is not a movie it is a 90 minute ...,0,1,1,0,0
3,disconnect the phone line don t accept the ...,0,0,1,0,1
4,when robert forster found himself famous again...,0,0,0,0,1


# depression

In [42]:
sdcnl = pd.read_csv('../../corpus/sdcnl.csv')

# merge title and selftext column together, both have string value
sdcnl['text'] = sdcnl['title'].astype(str) + " | " + sdcnl['selftext'].astype(str)

# drop all column except text and is_suicide column
sdcnl_dataset = sdcnl[['text', 'is_suicide']]

sdcnl_dataset.text = clear_text(sdcnl_dataset.text)
# use vectorizer_mbti process the review and the content into another column('bow')
sdcnl_dataset['bow'] = expand(vectorizer_mbti.transform(sdcnl_dataset.text).toarray())
for dimension in ['O','C','E','A']:
    print(dimension)
    network = torch.load('./bow/'+dimension+'.pt')
    network.eval()
    sdcnl_dataset[dimension] = personality_inference(network, sdcnl_dataset["bow"])
# make True to 1 and False to 0
for dimension in ['O','C','E','A']:
    sdcnl_dataset[dimension] = sdcnl_dataset[dimension].apply(lambda x: 1 if x else 0)
sdcnl_dataset = sdcnl_dataset.drop(columns=['bow'])
save_dataset(sdcnl_dataset, "../../corpus/personality-aware-depression", "bow-sdcnl")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sdcnl_dataset.text = clear_text(sdcnl_dataset.text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sdcnl_dataset['bow'] = expand(vectorizer_mbti.transform(sdcnl_dataset.text).toarray())


O


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sdcnl_dataset[dimension] = personality_inference(network, sdcnl_dataset["bow"])


C


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sdcnl_dataset[dimension] = personality_inference(network, sdcnl_dataset["bow"])


E


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sdcnl_dataset[dimension] = personality_inference(network, sdcnl_dataset["bow"])


A


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sdcnl_dataset[dimension] = personality_inference(network, sdcnl_dataset["bow"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sdcnl_dataset[dimension] = sdcnl_dataset[dimension].apply(lambda x: 1 if x else 0)


In [49]:
twitter = pd.read_csv('../../corpus/Mental-Health-Twitter.csv')
# drop all column except text and is_suicide column
twitter_dataset = twitter[['post_text', 'label']]

twitter_dataset.post_text = clear_text(twitter_dataset.post_text)
# use vectorizer_mbti process the review and the content into another column('bow')
twitter_dataset['bow'] = expand(vectorizer_mbti.transform(twitter_dataset.post_text).toarray())
for dimension in ['O','C','E','A']:
    print(dimension)
    network = torch.load('./bow/'+dimension+'.pt')
    network.eval()
    twitter_dataset[dimension] = personality_inference(network, twitter_dataset["bow"])
# make True to 1 and False to 0
for dimension in ['O','C','E','A']:
    twitter_dataset[dimension] = twitter_dataset[dimension].apply(lambda x: 1 if x else 0)
twitter_dataset = twitter_dataset.drop(columns=['bow'])
save_dataset(twitter_dataset, "../../corpus/personality-aware-depression", "bow-twitter")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  twitter_dataset.post_text = clear_text(twitter_dataset.post_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  twitter_dataset['bow'] = expand(vectorizer_mbti.transform(twitter_dataset.post_text).toarray())


O


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  twitter_dataset[dimension] = personality_inference(network, twitter_dataset["bow"])


C


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  twitter_dataset[dimension] = personality_inference(network, twitter_dataset["bow"])


E


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  twitter_dataset[dimension] = personality_inference(network, twitter_dataset["bow"])


A


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  twitter_dataset[dimension] = personality_inference(network, twitter_dataset["bow"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  twitter_dataset[dimension] = twitter_dataset[dimension].apply(lambda x: 1 if x else 0)


In [50]:
twitter_dataset.head()

Unnamed: 0,post_text,label,O,C,E,A
0,it s just over 2 years since i was diagnosed w...,1,1,1,1,1
1,it s sunday i need a break so i m planning t...,1,0,0,1,1
2,awake but tired i need to sleep but my brain ...,1,1,0,1,0
3,rt sewhq retro bears make perfect gifts and...,1,0,1,1,1
4,it s hard to say whether packing lists are mak...,1,0,1,1,1
