In [1]:
#Libraries

import pandas as pd
import numpy as np
import re
import datetime


In [2]:
#Ingest LibraryH3lp chat transcripts

# Folder that holds the yearly "<year>-metadata-and-transcripts.csv" exports.
# Kept in one place so the notebook can be pointed at another copy of the data
# by editing a single line.
TRANSCRIPT_DIR = "C:\\Users\\kcalvert\\Documents\\Data files\\Chat\\2019-2020-2021-metadata-and-transcripts"


def _transcript_csv(year):
    """Return the full path of the transcript export for ``year``."""
    return TRANSCRIPT_DIR + "\\" + str(year) + "-metadata-and-transcripts.csv"


transcript_2019 = pd.read_csv(_transcript_csv(2019))
transcript_2020 = pd.read_csv(_transcript_csv(2020))
transcript_2021 = pd.read_csv(_transcript_csv(2021))
transcript_2022 = pd.read_csv(_transcript_csv(2022))
transcript_2023 = pd.read_csv(_transcript_csv(2023))

#Combine files
# sort=True sorts the column axis when the yearly files are not already aligned.
# ignore_index is left at its default, so each file keeps its own row labels and
# labels can repeat across years in the combined frame.
chats = pd.concat([transcript_2019,transcript_2020,transcript_2021,transcript_2022,transcript_2023],sort = True)

In [3]:
#Drop reference desk call button entries

# Keep every row whose "profile" value is anything other than "refdeskbutton".
is_chat_row = chats["profile"].ne("refdeskbutton")
chats = chats.loc[is_chat_row]

In [4]:
#Create dataframe without identifiers

# Columns this notebook treats as identifiers; they are removed as a group.
identifier_columns = ['guest','queue','profile','operator','ip','tags']
anon = chats.drop(columns=identifier_columns)

#Create functions to remove pii: email addresses, phone numbers, student ids, ip addresses
def drop_email(data, default ="email_address_removed"):
    """Replace every email address found in ``data`` with ``default``.

    Parameters
    ----------
    data : any
        Value to scrub. It is converted with ``str()`` first, so non-string
        input (numbers, ``None``, NaN) is returned as scrubbed text.
    default : str
        Replacement text substituted for each match.

    Returns
    -------
    str
        The text with each ``local@domain.tld`` match replaced.
    """
    # Raw string: the pattern relies on regex escapes (\w, \., \-) that are
    # invalid escape sequences in an ordinary string literal.
    data = re.sub(r'([\w\.\-]+)@([\w\-]+)((\.(\w){2,63}){1,3})', default, str(data))
    return data

def drop_phone(data, default = "phone_number"):
    """Replace North-American-style phone numbers in ``data`` with ``default``.

    The pattern accepts an optional leading ``1``/``+1``, an optional area code
    (with or without parentheses), a three-digit exchange, a four-digit line
    number, and an optional extension introduced by ``#``, ``x``, ``ext`` or
    ``extension``. Parts may be separated by whitespace, ``.`` or ``-``.

    Parameters
    ----------
    data : any
        Value to scrub; converted with ``str()`` before matching.
    default : str
        Replacement text substituted for each match.

    Returns
    -------
    str
        The text with each matched phone number replaced.
    """
    # Raw string: the pattern relies on regex escapes (\+, \s, \(, \d) that are
    # invalid escape sequences in an ordinary string literal.
    data = re.sub(r'(?:(?:\+?1\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?', default, str(data))
    return data

def drop_ip(data, default = "ip_address"):
    """Replace dotted-quad IPv4 addresses in ``data`` with ``default``.

    Parameters
    ----------
    data : any
        Value to scrub; converted with ``str()`` before matching.
    default : str
        Replacement text substituted for each match.

    Returns
    -------
    str
        The text with each IPv4 address (four 0-255 octets) replaced.
    """
    # The octet alternatives are listed longest-first. Regex alternation takes
    # the first branch that lets the match succeed, so putting the one-digit
    # branch first made the final octet stop after a single digit and left the
    # remaining digits in the output ("192.168.1.10" -> "ip_address0").
    data = re.sub(r'((25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])\.){3}(25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])', default, str(data))
    return data

def drop_920(data, default= 'student_id'):
    """Swap each student id in ``data`` for ``default``.

    A student id is matched as the digits ``920`` followed by six more digits.
    ``data`` is converted with ``str()`` first; the scrubbed text is returned.
    """
    student_id_pattern = re.compile('920[0-9]{6}')
    text = str(data)
    return student_id_pattern.sub(default, text)


#Functions to strip out the timestamp and user information from the text of the chat 
#e.g. Original 19:22PM email_address_removed/2ugkwfef2ar45t: Gotcha! Is there anything else I can help you with? 
#e.g. Original 7:26PM kgp25xjqxn8n6g@web.libraryh3lp.com: Can the public use the campus library?
#e.g. Edited Gotcha! Is there anything else I can help you with?

def strip_chat_time_user(data, default=""):
    """Remove the ``<time><AM/PM> <user>: `` header from chat lines.

    A header is a clock time (``H:MM`` or ``HH:MM``) followed directly by two
    letters, then everything up to and including the first colon that is
    followed by whitespace.

    Parameters
    ----------
    data : any
        Value to clean; converted with ``str()`` before matching.
    default : str
        Text substituted for each header (empty by default).

    Returns
    -------
    str
        The text with each header replaced by ``default``.
    """
    # ".*?" is non-greedy so the match ends at the first ": " after the time;
    # a greedy ".*" ran to the last ": " on the line and deleted message text
    # such as "Question: ...".
    # "[A-Za-z]" replaces "[A-z]", which also matched [ \ ] ^ _ and the backtick.
    data = re.sub(r'[0-9]{1,2}:[0-9]{2}[A-Za-z]{2}.*?:\s', default, str(data))
    return data

#Functions to replace URLs and collapse repeated characters (emoji and contraction handling are not implemented in this cell)
def drop_url(data,default="URL"):
    """Replace every ``http://`` or ``https://`` URL in ``data`` with ``default``.

    A URL runs from the scheme to the next whitespace character.

    Parameters
    ----------
    data : any
        Value to clean; converted with ``str()`` before matching.
    default : str
        Replacement text substituted for each URL.

    Returns
    -------
    str
        The text with each URL replaced.
    """
    # Raw string: "\S" (and the original "\/") are invalid escape sequences in
    # an ordinary string literal. "/" needs no escaping in a Python regex.
    data = re.sub(r'(http|https)://\S+', default, str(data))
    return data

def word_repetition(data):
    """Shorten any run of one repeated character to exactly two copies.

    ``data`` is converted with ``str()`` first. A run is a character (anything
    except a newline) followed by one or more copies of itself.
    """
    repeated_run = re.compile(r'(.)\1+')
    squeezed = repeated_run.sub(r'\1\1', str(data))
    return squeezed

def drop_pii(data):
    """Run the four PII scrubbers over ``data`` and return the result.

    The scrubbers run in a fixed order: email addresses, student ids, phone
    numbers, then IP addresses. Each one receives the previous one's output.
    """
    scrubbers = (drop_email, drop_920, drop_phone, drop_ip)
    for scrub in scrubbers:
        data = scrub(data)
    return data

def clean_chat(data):
    """Return a scrubbed, lower-cased, single-line version of a chat text.

    Steps, in order:
      1. remove PII (email, student id, phone, IP address);
      2. strip the timestamp/user header;
      3. remove PII again on the stripped text;
      4. replace URLs;
      5. lower-case the text;
      6. replace each CR-LF pair with a single space.
    """
    # drop_pii applies email -> student id -> phone -> IP, the same sequence
    # as calling the four scrubbers one after another.
    scrubbed = drop_pii(data)
    message = strip_chat_time_user(scrubbed)
    message = drop_pii(message)
    message = drop_url(message)
    lowered = str(message).lower()
    return lowered.replace('\r\n', ' ')

In [5]:
#Clean data
# Run clean_chat on every value of the "text" column and store the results in
# a new "clean_text" column; the "text" column itself is left as it was.
raw_messages = anon["text"]
anon["clean_text"] = raw_messages.apply(clean_chat)

In [6]:
# Write the frame (row labels included) to a CSV in the working directory.
# NOTE(review): "text" is not among the columns dropped when anon is created,
# so the un-scrubbed transcript is exported next to "clean_text" - confirm
# that this is intended before sharing the file.
cleaned_chats_csv = 'sedlc_cleaned_chats.csv'
anon.to_csv(cleaned_chats_csv)