In [1]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
import pandas as pd
from typing import List

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maxma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\maxma\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Reading the data

In [2]:
df = pd.read_csv('combined_mturk_mpersonality_fixed.csv')
df

Unnamed: 0,TEXT,TEXT_NL,cEXT,cNEU,cAGR,cCON,cOPN
0,"Well, right now I just woke up from a mid-day ...","Nou, op dit moment werd ik net wakker na een m...",n,y,y,n,y
1,"Well, here we go with the stream of consciousn...","Nou, hier gaan we met de stroom van bewustzijn...",n,n,y,n,n
2,An open keyboard and buttons to push. The thin...,Een open toetsenbord en knoppen om in te drukk...,n,y,n,y,y
3,I can't believe it! It's really happening! M...,Ik kan het niet geloven! Het gebeurt echt! Mij...,y,n,y,y,n
4,"Well, here I go with the good old stream of co...","Welnu, hier ga ik weer met de goede oude stroo...",y,n,y,n,y
...,...,...,...,...,...,...,...
2958,I am motivated on a day to day basis by the ne...,Ik word dagelijks gemotiveerd door de noodzaak...,y,n,n,y,y
2959,My son is the biggest part of my life and with...,Mijn zoon is het grootste deel van mijn leven ...,y,y,n,n,n
2960,My kids and grandkids are what keeps me motiva...,Mijn kinderen en kleinkinderen houden me elke ...,y,n,y,y,n
2961,My biggest drive is to earn money so I can ret...,Mijn grootste drijfveer is om geld te verdiene...,n,n,n,n,n


In [3]:
df['cEXT'] = df['cEXT'].map({'y': 1, 'n': 0})
df['cNEU'] = df['cNEU'].map({'y': 1, 'n': 0})
df['cAGR'] = df['cAGR'].map({'y': 1, 'n': 0})
df['cCON'] = df['cCON'].map({'y': 1, 'n': 0})
df['cOPN'] = df['cOPN'].map({'y': 1, 'n': 0})
df.head()

Unnamed: 0,TEXT,TEXT_NL,cEXT,cNEU,cAGR,cCON,cOPN
0,"Well, right now I just woke up from a mid-day ...","Nou, op dit moment werd ik net wakker na een m...",0,1,1,0,1
1,"Well, here we go with the stream of consciousn...","Nou, hier gaan we met de stroom van bewustzijn...",0,0,1,0,0
2,An open keyboard and buttons to push. The thin...,Een open toetsenbord en knoppen om in te drukk...,0,1,0,1,1
3,I can't believe it! It's really happening! M...,Ik kan het niet geloven! Het gebeurt echt! Mij...,1,0,1,1,0
4,"Well, here I go with the good old stream of co...","Welnu, hier ga ik weer met de goede oude stroo...",1,0,1,0,1


In [4]:
def preprocess_dataframe_content(df: pd.DataFrame, language: str) -> pd.DataFrame:
    """
    Preprocesses each entry in the 'content' column of the given DataFrame by:
    - Lowercasing
    - Keeping only alphabetic characters
    - Removing stopwords
    - Lemmatizing
    - Filtering out words with length less than 3
    Tokenizes the preprocessed entry into a list of words.
    Returns a list of lists where each inner list is a tokenized and preprocessed entry from the
    'content' column of the DataFrame.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the 'content' column to be processed.

    Returns:
    - df (pd.DataFrame): The same DataFrame, but with the column of the specified language processed according the above techniques.
    """
    # Prepare lemmatizer and stopwords list
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words(language))

    # Preprocess the content
    
    if language == 'english':
        column = 'TEXT'
    elif language == 'dutch':
        column = 'TEXT_NL'
        
    processed_content = []
    for content in df[column]:
        # Keep only alphabetic characters and lowercased
        tokens = re.sub('[^a-zA-Z\s]', '', content.strip()).split()
        # Remove stopwords and short words, and lemmatize
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and len(word) >= 3]
        processed_content.append(tokens)
        
    df[column] = [' '.join(words) for words in processed_content]

    return df

In [5]:
processed_e = preprocess_dataframe_content(df, 'english')
processed_nlc = preprocess_dataframe_content(processed_e, 'dutch')
processed_nlc.head()

Unnamed: 0,TEXT,TEXT_NL,cEXT,cNEU,cAGR,cCON,cOPN
0,Well right woke midday nap Its sort weird ever...,Nou moment net wakker middagdutje Het beetje r...,0,1,1,0,1
1,Well stream consciousness essay used thing lik...,Nou gaan stroom bewustzijn essay deed soort di...,0,0,1,0,0
2,open keyboard button push The thing finally wo...,Een open toetsenbord knoppen drukken Het ding ...,0,1,0,1,1
3,cant believe Its really happening pulse racing...,geloven Het gebeurt echt Mijn pol raast gek Du...,1,0,1,1,0
4,Well good old stream consciousness assignment ...,Welnu weer goede oude stroom bewustzijnstoewij...,1,0,1,0,1


In [7]:
english_df_nlc = processed_nlc[['TEXT', 'cEXT','cNEU','cAGR','cCON','cOPN']]
dutch_df_nlc = processed_nlc[['TEXT_NL', 'cEXT','cNEU','cAGR','cCON','cOPN']]
english_df_nlc.head()

Unnamed: 0,TEXT,cEXT,cNEU,cAGR,cCON,cOPN
0,Well right woke midday nap Its sort weird ever...,0,1,1,0,1
1,Well stream consciousness essay used thing lik...,0,0,1,0,0
2,open keyboard button push The thing finally wo...,0,1,0,1,1
3,cant believe Its really happening pulse racing...,1,0,1,1,0
4,Well good old stream consciousness assignment ...,1,0,1,0,1


In [8]:
processed_nlc.to_csv('processed_data_full_no_lowercasing.csv', index=False)
english_df_nlc.to_csv('processed_data_english_no_lowercasing.csv', index=False)
dutch_df_nlc.to_csv('processed_data_dutch_no_lowercasing.csv', index=False)