In [182]:
import numpy as np
import pandas as pd
import nltk
nltk.download('wordnet', quiet=True)
import re

In [183]:
! pip install contractions
import contractions



## Read in the dataset and convert it to a pandas dataframe

In [184]:
# Load the raw transcript. The context manager closes the file handle as soon
# as the lines have been read, even if reading raises.
with open('../data/2_conversations.txt', 'r') as file:
    lines = file.readlines()

# Build one string per conversation. Conversations are separated by blank
# lines; within a conversation only lines starting with "A:" or "B:" are kept
# (any other line is ignored) and their text is concatenated in file order.
conversations = []
conversation = ""
for line in lines:
    if len(line.strip()) == 0:
        # Blank separator: flush what has been collected so far. Skipping an
        # empty accumulator keeps leading, trailing, or repeated blank lines
        # from producing empty rows.
        if conversation:
            conversations.append(conversation)
        conversation = ""
    elif line[:2] == "A:":
        conversation = conversation + line[2:]
    elif line[:2] == "B:":
        # Drop the speaker tag plus at most one following space. Slicing a
        # fixed three characters would eat the first character of the
        # utterance whenever the tag is not followed by a space ("B:hi").
        utterance = line[2:]
        if utterance[:1] == " ":
            utterance = utterance[1:]
        conversation = conversation + " " + utterance + " "

# Flush the last conversation when the file does not end with a blank line.
if conversation:
    conversations.append(conversation)

In [185]:
# One row per parsed conversation, held in a single "conversation" column.
df = pd.DataFrame({"conversation": conversations})

In [186]:
# Snapshot the conversations before any cleaning is applied; the row index is
# not written to the file.
raw_csv_path = 'our_df.csv'
df.to_csv(raw_csv_path, index=False)

## Data Cleaning

### 1. Convert utterances to lowercase

In [187]:
# Normalise case: replace every conversation with its lowercase form.
lowercased = df['conversation'].str.lower()
df['conversation'] = lowercased

### 2. Remove any HTML or URLs from the utterances

In [188]:
# The patterns are written as raw strings so that `\(` and `\)` reach the regex
# engine unchanged without relying on Python's handling of invalid string
# escapes (a DeprecationWarning, and a SyntaxWarning on Python 3.12+).
# They are compiled once here instead of being looked up for every row.
# NOTE(review): inside `[$-_@.&+]` the `$-_` part is a character *range*
# (from `$` to `_`), so the URL pattern matches more punctuation than the
# individual characters listed -- kept as-is to preserve existing behaviour.
url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
html_tag_pattern = re.compile(r'<[^<]+?>')

# Strip URLs first, then anything that looks like an HTML tag (`<...>`).
# `str(x)` guards against non-string cells.
df['conversation'] = df['conversation'].apply(lambda x: url_pattern.sub('', str(x)))
df['conversation'] = df['conversation'].apply(lambda x: html_tag_pattern.sub('', str(x)))

### 3. Expand contractions

In [189]:
# Run every conversation through `contractions.fix` and store the result back
# in the same column.
expanded = df['conversation'].apply(contractions.fix)
df['conversation'] = expanded

### 4. Tokenization

In [190]:
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt', quiet=True)

True

### 4.1. Sentence tokenization

In [191]:
def _rejoin_sentences(text):
    """Split `text` with `sent_tokenize` and join the pieces with single spaces."""
    return ' '.join(sent_tokenize(text))


df['conversation'] = df['conversation'].apply(_rejoin_sentences)

### 4.2. Word tokenization

In [192]:
def _rejoin_words(text):
    """Split `text` with `word_tokenize` and join the tokens with single spaces."""
    return ' '.join(word_tokenize(text))


df['conversation'] = df['conversation'].apply(_rejoin_words)

### 5. Remove non-alphanumeric characters

In [193]:
# Every character that is not an ASCII letter or digit becomes a single space
# (one space per replaced character; runs are not collapsed here).
non_alphanumeric = re.compile('[^a-zA-Z0-9]')
df['conversation'] = df['conversation'].apply(
    lambda text: non_alphanumeric.sub(' ', str(text))
)

# Pre-processing

In [194]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

### 1. Remove the stop words

In [195]:
# Make sure the stop-word corpus is available. The notebook already downloads
# `wordnet` and `punkt` this way, but `stopwords` was never fetched, so on a
# fresh environment `stopwords.words(...)` raises LookupError.
nltk.download('stopwords', quiet=True)

stop_words = stopwords.words('english')
# Membership is tested once per token, so use a set for the lookup instead of
# scanning the list each time. `stop_words` itself stays a list.
stop_word_set = set(stop_words)

# Keep only the whitespace-separated tokens that are not English stop words.
df['conversation'] = df['conversation'].apply(
    lambda x: ' '.join([w for w in x.split() if w not in stop_word_set])
)

### 2. Perform stemming

In [196]:
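# Stemming is currently disabled (commented out); only the lemmatization step
# below is applied. To enable it, uncomment the following lines:
# stemmer = PorterStemmer()
# df['conversation'] = df['conversation'].apply(lambda x: ' '.join([stemmer.stem(w) for w in x.split()]))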

### 3. Perform lemmatization

In [197]:
# Lemmatize each whitespace-separated token with the WordNet lemmatizer
# (default arguments) and rebuild the conversation with single spaces.
lemmatizer = nltk.WordNetLemmatizer()
df['conversation'] = df['conversation'].apply(
    lambda text: ' '.join(map(lemmatizer.lemmatize, text.split()))
)

### 4. Remove extra whitespace

In [198]:
def _collapse_whitespace(text):
    """Trim `text` and reduce every run of whitespace to a single space."""
    return ' '.join(text.split())


df['conversation'] = df['conversation'].apply(_collapse_whitespace)

# Write the preprocessed dataframe to a .csv file

In [199]:
# Write the fully preprocessed conversations to disk; the row index is not
# written to the file.
cleaned_csv_path = 'our_df_cleaned.csv'
df.to_csv(cleaned_csv_path, index=False)