In [2]:
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from matplotlib import pyplot as plt
from collections import Counter

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
%cd drive/MyDrive/EECS 448/Project

Mounted at /content/drive
/content/drive/MyDrive/EECS 448/Project


In [40]:
### PREPROCESS FUNCTIONS ###

def PreprocessSentence(sentence, nltk_stopwords):
    personality_types = ["intj", "entj", "intp", "entp", "infj", "enfj", "infp", "enfp", "istj", "estj", "istp", "estp", "isfj", "esfj", "isfp", "esfp"]
    sentence = sentence.lower()
    ps = PorterStemmer()
    nltk_tokens = nltk.word_tokenize(sentence)
    words = []
    for long_token in nltk_tokens:
        token = ps.stem(long_token)
        if token not in nltk_stopwords and token not in personality_types and token.isalnum():
            words.append(token)
    return ' '.join(words)
    

def PreprocessData(df, nltk_stopwords):
    # Input - the dataframe, with columns label and text
    # output - the dataframe with the text processed
    df["posts"] = df["posts"].map(lambda sentence : PreprocessSentence(sentence, nltk_stopwords))
    return df
    

In [None]:
### LOAD AND PREPROCESS DATASET ###

nltk_stopwords = set(stopwords.words('english'))

df = pd.read_csv('./raw_reddit_mbti.csv')
df = PreprocessData(df, nltk_stopwords)
df['posts'][342] # Look at a preprocessed post

In [None]:
### SAVE FULL LENGTH DATASET ###
df.to_csv(path_or_buf = "./pain_processed.csv", index=False)

In [None]:
### LOOK AT POST LENGTH DISTRIBUTION ###

# kaggle = pd.read_csv('./EECS_448_Project_Team/datasets/processed_kaggle.csv')

lengths = []
for post in df["posts"]: lengths.append(len(post))
lengths = sorted(lengths)

plt.hist(lengths[300:3500], 100)
plt.show()

In [None]:
### LOOK AT FREQUENCY OF WORDS ###

count = Counter()
for post in df["posts"]:
    tokens = word_tokenize(post)
    count.update(tokens)

sorted_tokens = sorted(count.items(), key=lambda x:x[1], reverse=True)

# Cut-off seems around 20 when we stop seeing usernames and start seeing words
for token in sorted_tokens:
  if token[1] == 16:
    print(token[0])
print(len(sorted_tokens))

In [15]:
### READ IN DATASET TO SKIP ALL EARLIER CELLS ###
full_reddit = pd.read_csv('./mbti_processed.csv')

In [14]:
### REMOVE WORDS USED LESS THAN <THRESHOLD> TIMES ###

# Get word counts 
word_count = {}
for post in full_reddit["posts"]:
  if isinstance(post, str):
    tokens = word_tokenize(post)
    for token in tokens:
      word_count[token] = word_count.get(token, 0) + 1

# Remove infrequent words
threshold = 20
for post in full_reddit["posts"]:
  if isinstance(post, str):
    tokens = word_tokenize(post)
    for token in tokens:
      if word_count[token] < threshold:
        tokens.remove(token)
    post = ' '.join(tokens)

In [16]:
### DROP USERS THAT HAVE TOO FEW OR MANY CHARACTERS ###

low = 300
high = 50000
drop_list = []
for idx, post in enumerate(full_reddit["posts"]):
  if not isinstance(post, str) or (len(post) < low) or (len(post) > high):
    drop_list.append(idx)
cleaned_reddit = full_reddit.drop(drop_list, 'index')

  cleaned_reddit = full_reddit.drop(drop_list, 'index')


In [17]:
print(len(cleaned_reddit))
cleaned_reddit.head()

3666


Unnamed: 0,posts
1,agre tint lip balm lipstick look realli good l...
2,pet shop got ta make care isnt ani vitamin e a...
3,combo jack tri desktop split work fine issu st...
4,ahh catch well doctor alway told even type oi ...
5,hope get aggress fast somehow find peopl docto...


In [18]:
### SAVE FINALIZED DATASET ###
cleaned_reddit.to_csv(path_or_buf = "./processed_reddit_pain.csv", index=False)

In [20]:
### FIGURE OUT CLASS BALANCES ###
reddit_mbti = pd.read_csv('./processed_reddit_mbti.csv')

types = {}
for mbti_type in reddit_mbti["type"]:
 for char in mbti_type:
   types[char] = types.get(char, 0) + 1
  
print(types)

{'e': 2451, 's': 1290, 'f': 3345, 'j': 3024, 'n': 6428, 'p': 4694, 'i': 5267, 't': 4373}


In [10]:
### Set types to uppercase ###
reddit_mbti = pd.read_csv('./processed_reddit_mbti.csv')
reddit_mbti["type"] = reddit_mbti["type"].str.upper()
reddit_mbti.to_csv(path_or_buf = "./processed_reddit_mbti_upper.csv", index=False)