In [170]:
import numpy as np
import pandas as pd
import nltk
nltk.download('wordnet', quiet=True)
import re

In [171]:
! pip install contractions
import contractions




[notice] A new release of pip available: 22.2.2 -> 22.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





## Read in the datasets

In [172]:
# The string literal below is a disabled alternative loader kept for reference;
# it is evaluated as a bare expression and has no effect.
'''
# The following reads in the full_info.csv file

info_cols = ['dialogue_id', 'uid', 'role', 'donation', 'num_turns', 'extrovert', 'agreeable', \
    'conscientious', 'neurotic', 'open', 'care', 'fairness', 'loyalty', 'authority', 'purity', 'freedom', \
    'conform', 'tradition', 'benevolence', 'universalism', 'self_direction', 'stimulation', 'hedonism', \
    'achievement', 'power', 'security', 'rational', 'intuitive', 'age', 'sex', 'race', 'education', 'marital', \
    'employment', 'income', 'religion', 'ideology']
info_df = pd.read_csv('./data/FullData/full_info.csv', names=info_cols, header=None, skiprows=1)
'''

# Active loader: reads 300_info.csv. Column names are supplied explicitly and
# the first row of the file is skipped instead of being used as a header.
info_cols = [
    'dialogue_id',
    'uid',
    'role',
    'intended_donation',
    'actual_donation',
    'num_turns',
]
info_df = pd.read_csv(
    './data/AnnotatedData/300_info.csv',
    names=info_cols,
    header=None,
    skiprows=1,
)

In [173]:
# The string literal below is a disabled alternative loader kept for reference;
# it is evaluated as a bare expression and has no effect.
'''
# The following reads in the full_dialog.csv file

dialogue_cols = ['sentence_id', 'sentence', 'turn', 'role', 'dialogue_id']
dialogue_df = pd.read_csv('./data/FullData/full_dialog.csv', names=dialogue_cols, \
    header=None, skiprows=1, skipinitialspace=True)
'''

# Active loader: reads 300_dialog.csv. Column names are supplied explicitly,
# the first row of the file is skipped, and whitespace after each delimiter is
# ignored.
dialogue_cols = [
    'idx',
    'dialogue_id',
    'role',
    'turn',
    'sentence',
    'er_label_1',
    'ee_label_1',
    'er_label_2',
    'ee_label_2',
    'negative',
    'neutral',
    'positive',
]
# NOTE(review): the 'unicode_escape' codec also interprets backslash sequences
# found in the text - confirm this is the intended decoding for this file.
dialogue_df = pd.read_csv(
    './data/AnnotatedData/300_dialog.csv',
    names=dialogue_cols,
    header=None,
    skiprows=1,
    skipinitialspace=True,
    encoding='unicode_escape',
)

# Data cleaning

### 1. Reformat UIDs by removing the "user_" prefix and converting to np.int16

In [174]:
# Drop the first five characters of every uid (the "user_" marker) and store
# what remains as 16-bit integers.
uid_digits = info_df['uid'].str.slice(start=5)
info_df['uid'] = uid_digits.astype(np.int16)

### 2. Convert utterances to lowercase

In [175]:
# Lower-case every utterance through the vectorised string accessor.
lowercased_sentences = dialogue_df['sentence'].str.lower()
dialogue_df['sentence'] = lowercased_sentences

### 3. Remove any HTML or URLs from the utterances

In [176]:
# Both patterns are raw strings so that backslash escapes such as the escaped
# parentheses reach the regex engine verbatim instead of relying on Python
# passing an invalid string escape through unchanged.
URL_PATTERN = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
HTML_TAG_PATTERN = re.compile(r'<[^<]+?>')


def _strip_urls_and_html(text):
    """Return ``text`` with URLs and HTML tags removed.

    Parameters
    ----------
    text : object
        One cell of the ``sentence`` column. Missing values (NaN/None) are
        mapped to the empty string; anything else is converted with ``str``.

    Returns
    -------
    str
        The cleaned utterance, or ``''`` when the input was missing.
    """
    # str(NaN) would yield the literal text "nan", which no later step could
    # tell apart from a real utterance; an empty string is instead caught by
    # the final "replace '' with NaN and drop" step of this notebook.
    if pd.isna(text):
        return ''
    without_urls = URL_PATTERN.sub('', str(text))
    return HTML_TAG_PATTERN.sub('', without_urls)


dialogue_df["sentence"] = dialogue_df["sentence"].apply(_strip_urls_and_html)

### 4. Expand contractions

In [177]:
# Run contractions.fix over each utterance; the function is handed to apply
# directly instead of being wrapped in a lambda.
dialogue_df["sentence"] = dialogue_df["sentence"].apply(contractions.fix)

### 5. Tokenization

In [178]:
from nltk.tokenize import sent_tokenize, word_tokenize

# Recent NLTK releases (3.9 and later) load the 'punkt_tab' resource in
# sent_tokenize/word_tokenize, while older releases load 'punkt'. Fetch both so
# the tokenization cells work on either; 'punkt' stays last so the value shown
# by this cell is still the result of that download.
nltk.download('punkt_tab', quiet=True)
nltk.download('punkt', quiet=True)

True

### 5.1. Sentence tokenization

In [179]:
def _rejoin_sentences(text):
    """Split ``text`` with sent_tokenize and join the pieces with single spaces."""
    return ' '.join(sent_tokenize(text))


dialogue_df["sentence"] = dialogue_df["sentence"].apply(_rejoin_sentences)

### 5.2. Word tokenization

In [180]:
def _rejoin_words(text):
    """Split ``text`` with word_tokenize and join the tokens with single spaces."""
    return ' '.join(word_tokenize(text))


dialogue_df["sentence"] = dialogue_df["sentence"].apply(_rejoin_words)

### 6. Remove non-alphanumeric characters

In [181]:
# Every character outside a-z, A-Z and 0-9 becomes a single space; the pattern
# is compiled once and reused for each row.
NON_ALPHANUMERIC = re.compile('[^a-zA-Z0-9]')
dialogue_df["sentence"] = dialogue_df["sentence"].apply(
    lambda text: NON_ALPHANUMERIC.sub(' ', str(text))
)

# Pre-processing

In [182]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

### 1. Remove the stop words (currently disabled)

In [183]:
# Stop-word removal is disabled: both statements below are commented out, so
# this cell performs no work and stop words stay in the utterances.
# NOTE(review): stopwords.words('english') needs the NLTK 'stopwords' corpus,
# which none of the visible cells download - confirm before re-enabling.
# stop_words = stopwords.words('english')
# dialogue_df['sentence'] = dialogue_df['sentence'].apply(lambda x: ' '.join([w for w in x.split() if w not in stop_words]))

### 2. Perform stemming

In [184]:
stemmer = PorterStemmer()


def _stem_tokens(text):
    """Porter-stem each whitespace-separated token of ``text`` and rejoin with spaces."""
    return ' '.join(map(stemmer.stem, text.split()))


dialogue_df['sentence'] = dialogue_df['sentence'].apply(_stem_tokens)

### 3. Perform lemmatization

In [185]:
lemmatizer = nltk.WordNetLemmatizer()


def _lemmatize_tokens(text):
    """Lemmatize each whitespace-separated token of ``text`` and rejoin with spaces."""
    return ' '.join(map(lemmatizer.lemmatize, text.split()))


# NOTE(review): in notebook order this runs on tokens the Porter stemmer has
# already rewritten - confirm that stemming followed by lemmatization is intended.
dialogue_df['sentence'] = dialogue_df['sentence'].apply(_lemmatize_tokens)

### 4. Remove extra whitespace

In [186]:
def _collapse_whitespace(text):
    """Trim ``text`` and reduce every run of whitespace to a single space."""
    tokens = text.split()
    return ' '.join(tokens)


dialogue_df['sentence'] = dialogue_df['sentence'].apply(_collapse_whitespace)

### 5. Remove empty utterances (blank strings are converted to NaN and dropped)

In [187]:
# Turn blank utterances into NaN so dropna can remove them.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on dialogue_df['sentence'] is a chained in-place operation that, under pandas
# Copy-on-Write, modifies a temporary Series and leaves dialogue_df unchanged.
dialogue_df['sentence'] = dialogue_df['sentence'].replace('', np.nan)
# Drop every row whose utterance is missing after the conversion above.
dialogue_df.dropna(subset=['sentence'], inplace=True)

# Write the preprocessed dataframes to .csv files

In [188]:
# Persist both frames to the working directory without the index column.
for cleaned_frame, csv_name in (
    (dialogue_df, 'dialogue_df_cleaned.csv'),
    (info_df, 'info_df_cleaned.csv'),
):
    cleaned_frame.to_csv(csv_name, index=False)