In [1]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from bs4 import BeautifulSoup

import warnings
warnings.filterwarnings('ignore')

In [2]:
import multiprocessing
import pandarallel
from pandarallel import pandarallel 

In [3]:
num_processors = multiprocessing.cpu_count()
pandarallel.initialize(nb_workers=num_processors-1, use_memory_fs=False, progress_bar=True)

INFO: Pandarallel will run on 7 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


## Data Loading

In [4]:
%%time
case = pd.read_csv('case.csv')
comm = pd.read_csv('communication.csv')
template = pd.read_csv('template.csv')



CPU times: user 35.6 s, sys: 3.42 s, total: 39 s
Wall time: 40.7 s


In [5]:
#drop the index column
comm.drop(columns = ['Unnamed: 0'], inplace = True)
template.drop(columns = ['Unnamed: 0'], inplace = True)

In [6]:
#rename columns
comm.columns = ['CaseID', 'CaseCommunicationId', 'Message', 'TemplateId']

In [7]:
print("CaseDetails:", case.shape)
print("Communications:", comm.shape)
print("Templates:", template.shape)

CaseDetails: (2193581, 4)
Communications: (3540555, 4)
Templates: (6081, 4)


In [8]:
case.head()

Unnamed: 0,CorpNo,CaseID,Title,Description
0,11918,3202528,Phone Call From,
1,11918,3202529,Phone Call From,
2,11918,3202530,Phone Call From,
3,11918,3202532,Phone Call From,"Hello, this is the same Thing. Why, why? Why I..."
4,11918,3202536,to check workfow,where is north hemisphere


In [9]:
comm.head()

Unnamed: 0,CaseID,CaseCommunicationId,Message,TemplateId
0,3202497.0,3807147.0,We cannot email transcripts to anyone because ...,
1,3202499.0,3803760.0,"Good Afternoon,<br><br>If you are referring to...",
2,3202500.0,3802660.0,Thank you for taking part in the conversation ...,
3,3202501.0,3849461.0,,
4,3202501.0,3802666.0,All your comment will be forwarded to Mr. Fane...,


In [10]:
template.head()

Unnamed: 0,TemplateId,Name,MainCorpNo,MessageBody
0,1,title,11918,[[Click here]]Login into account <br />\nhello...
1,3,"ipsum pulvinar sit amet. Nulla tortor augue, ...",11918,"<div id=""lipsum"">Login in to account [[Click h..."
2,4,Generated 150 paragraphsffffffffffffffffffffff...,11918,"<div id=""lipsum""><br />\nndimentum turpis urna..."
3,5,aretra eros a tempor volutpat. Fusce et neque ...,11918,"<div id=""lipsum""><br />\nndimentum turpis urna..."
4,6,new1,11918,hello<br />\nHello2


## Cleaning

### CaseDetails

In [11]:
case.shape

(2193581, 4)

#### clean "Title"
- drop rows coming from Phone Call
- drop testing data

In [12]:
title = pd.DataFrame(case['Title'].value_counts())
title

Unnamed: 0_level_0,count
Title,Unnamed: 1_level_1
Text Message From,81963
Chatbot,40202
Phone Call From,35257
Technology Services Support,21442
Enrollment,12082
...,...
Opt out of survey,1
Kurt Burkhardt,1
equity & inclusion survey,1
Sub for Clerical Aid,1


##### remove rows containing 'Phone Call From' and description is NaN


In [13]:
### replace NaN in Title with 'NaN'
case['Title'] = case['Title'].replace(np.NaN, 'NaN')

In [14]:
phone_call = case[(case['Title'].str.contains('Phone Call From', case=False)) & case['Description'].isna()]
print(phone_call.shape)
phone_call 

(5590, 4)


Unnamed: 0,CorpNo,CaseID,Title,Description
0,11918,3202528,Phone Call From,
1,11918,3202529,Phone Call From,
2,11918,3202530,Phone Call From,
65,11918,3226095,Phone Call From,
115,11918,3379069,Phone Call From,
...,...,...,...,...
2132474,47898,1019879,Phone Call From +13109557550,
2132527,47898,1123436,Phone Call From +19514501043,
2132528,47898,1141263,Phone Call From +14158526877,
2173623,56235,1416246,Phone Call From +13073213305,


In [21]:
case = case[~((case['Title'].str.contains('Phone Call From', case=False)) & case['Description'].isna())].reset_index(drop=True)

In [23]:
case.shape

(2187991, 4)

##### remove rows containing 'test landing page'

In [27]:
case = case[~case['Title'].str.contains('test landing page', case=False)].reset_index(drop=True)

In [28]:
# after filtering
case.shape

(2187631, 4)

#### clean "Description"
- drop NaN
- remove html
- remove email address
- remove website
- remove meaningless terms or characters
- drop rows with less than 5 words


In [29]:
case = case.dropna(subset=['Description'])
# after filtering out NaN in 'Description'
case.shape

(1846577, 4)

In [30]:
#define function to clean the text
def clean_str(raw_string):
    # remove HTML tag 
    #with regex 
    #cleaned_string = re.sub('<(?:[^>]+)>', '', raw_string)

    # Create a BeautifulSoup object to parse the HTML
    soup = BeautifulSoup(raw_string, 'html.parser')
    # Use the .text attribute to get the text content without HTML tags
    cleaned_string = soup.get_text()
    cleaned_string = cleaned_string.replace("©", '')

    # remove email address
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    cleaned_string = re.sub(email_pattern, '', cleaned_string)

    # remove url
    url_pattern = 'https://[\w\./]+'
    cleaned_string = re.sub(url_pattern, '', cleaned_string)

    # remove specific terms/characters
    cleaned_string = re.sub('(\?\ )?&nbsp;', '', cleaned_string)
    cleaned_string = re.sub('\s+', ' ', cleaned_string)  # Remove extra whitespace

    cleaned_string = cleaned_string.split('<', 1)[0]
    cleaned_string = cleaned_string.replace("?",'')

    #remove Help Center and other terms such as 'Privacy Policy', 'Terms & Conditions', 'Unsubscribe', etc. Allrightsreserved, All rights reserved.
    #ending_words = ['Help Center','Privacy Policy','Terms & Conditions','Unsubscribe','Allrightsreserved','All rights reserved.']
    #cleaned_string = cleaned_string.split('Help Center', 1)[0] #split the string at the first occurance of 'Help Center'

    return cleaned_string

In [36]:
%%time
# clean the 'Description' column
case['filtered_description'] = case['Description'].parallel_apply(clean_str)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=263797), Label(value='0 / 263797')…

CPU times: user 3.18 s, sys: 7.51 s, total: 10.7 s
Wall time: 1min 46s


In [37]:
ending_words = ['Help Center','Privacy Policy','Terms & Conditions','Unsubscribe','Allrightsreserved','All rights reserved.']

In [38]:
def extract_text(text):
    for ending_word in ending_words:
        # Find the index of the ending word in the article
        idx = text.find(ending_word)
        # If the ending word is found, delete the content after it
        if idx == 1:
            text = text[len(ending_word)+1:]
            idx = text.find(ending_word)
        if idx != -1:
            return text[:idx-1]
    return text

In [39]:
%%time
case['filtered_description2'] = case['filtered_description'].parallel_apply(extract_text)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=263797), Label(value='0 / 263797')…

CPU times: user 1.1 s, sys: 2.38 s, total: 3.48 s
Wall time: 4.56 s


In [40]:
#remove repetitive element in  "lp diaglogue talk.lp diaglogue talk." to make it become just "lp diaglogue talk."
def find_repetitive_substring(input_string):
    length = len(input_string)
    for i in range(1, length // 2 + 1):
        substring = input_string[:i]

        # Construct a potential repeated string
        repetitions = length // i
        repeated_substring = substring * repetitions

        # Check if the input string matches the repeated substring
        if repeated_substring == input_string:
            return substring

    return input_string

In [41]:
%%time
case['cleaned_description'] = case['filtered_description2'].parallel_apply(find_repetitive_substring)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=263797), Label(value='0 / 263797')…

CPU times: user 1.52 s, sys: 2.56 s, total: 4.09 s
Wall time: 24.2 s


In [42]:
#remove repetitive element in  "normal test normal test" to make it become just "normal test"
def find_repetitive_substring2(input_string):
    if not input_string.endswith(" "):
        input_string += " "
    length = len(input_string)
    for i in range(1, length // 2 + 1):
        substring = input_string[:i]

        # Construct a potential repeated string
        repetitions = length // i
        repeated_substring = substring * repetitions

        # Check if the input string matches the repeated substring
        if repeated_substring == input_string:
            return substring

    return input_string

In [43]:
%%time
case['cleaned_description'] = case['cleaned_description'].parallel_apply(find_repetitive_substring2)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=263797), Label(value='0 / 263797')…

CPU times: user 1.57 s, sys: 2.82 s, total: 4.39 s
Wall time: 25.1 s


In [44]:
case['cleaned_description'].head(50)

0     Hello, this is the same Thing. Why, why Why Is...
1                            where is north hemisphere 
2     Declination will allow of your district still ...
3                                         The the The. 
4     We found some Pins we think might be right up ...
5                                    Testing the work. 
6                            Lp dialogue Testing work. 
7                                      Add Description 
8     The new status page will replace all current s...
9     We didn't see any writing activity last week. ...
10                                   mohit test 1 test 
11                                     ¿Cómo te llamas 
12    From: Megha Test 06 July 2022 20:25To: Megha A...
13                                 What's on your mind 
14                                 Who is the she hulk 
15                            Report Bullying cautious 
16    We found some Pins we think might be right up ...
17                                              

In [46]:
# filter out rows with less than 5 words
case = case[case['cleaned_description'].apply(lambda x: len(x.split(' ')) >= 5)].reset_index(drop=True)

In [45]:
%%time
# clean the case titles with the same method
case['cleaned_title'] = case['Title'].parallel_apply(clean_str)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=263797), Label(value='0 / 263797')…

CPU times: user 621 ms, sys: 406 ms, total: 1.03 s
Wall time: 8.29 s


In [47]:
case.shape

(1770310, 8)

In [48]:
case.head(20)

Unnamed: 0,CorpNo,CaseID,Title,Description,filtered_description,filtered_description2,cleaned_description,cleaned_title
0,11918,3202532,Phone Call From,"Hello, this is the same Thing. Why, why? Why I...","Hello, this is the same Thing. Why, why Why Is...","Hello, this is the same Thing. Why, why Why Is...","Hello, this is the same Thing. Why, why Why Is...",Phone Call From
1,11918,3202536,to check workfow,where is north hemisphere,where is north hemisphere,where is north hemisphere,where is north hemisphere,to check workfow
2,11918,3202537,Phone Call From,Declination will allow of your district still ...,Declination will allow of your district still ...,Declination will allow of your district still ...,Declination will allow of your district still ...,Phone Call From
3,11918,3202549,?? 17 Hairstyles Pins you might like,"<html xmlns="" xmlns:o=""urn:schemas-microsoft-c...",We found some Pins we think might be right up ...,We found some Pins we think might be right up ...,We found some Pins we think might be right up ...,17 Hairstyles Pins you might like
4,11918,3202685,Lp dialogue Testing work.,Lp dialogue Testing work.<br /><br /><br />Lp ...,Lp dialogue Testing work.Lp dialogue Testing w...,Lp dialogue Testing work.Lp dialogue Testing w...,Lp dialogue Testing work.,Lp dialogue Testing work.
5,11918,3206034,REMINDER: Important update from Vonage: API St...,"<html xmlns="" xmlns:v=""urn:schemas-microsoft-c...",The new status page will replace all current s...,The new status page will replace all current s...,The new status page will replace all current s...,REMINDER: Important update from Vonage: API St...
6,11918,3206178,We miss you! Make sure you're logged in.,"<html lang=""en"" xmlns:v=""urn:schemas-microsoft...",We didn't see any writing activity last week. ...,We didn't see any writing activity last week. ...,We didn't see any writing activity last week. ...,We miss you! Make sure you're logged in.
7,11918,3207195,mohit test 1 test,mohit test 1 test<br /><br />mohit test 1 test...,mohit test 1 testmohit test 1 testmohit test 1...,mohit test 1 testmohit test 1 testmohit test 1...,mohit test 1 test,mohit test 1 test
8,11918,3207818,chained email dialogue,"<html><body dir=""ltr""><div style=""font-family:...",From: Megha Test 06 July 2022 20:25To: Megha A...,From: Megha Test 06 July 2022 20:25To: Megha A...,From: Megha Test 06 July 2022 20:25To: Megha A...,chained email dialogue
9,11918,3209991,What's on your mind?,What's on your mind?<br /><br />What's on your...,What's on your mindWhat's on your mind,What's on your mindWhat's on your mind,What's on your mind,What's on your mind


In [49]:
case_cleaned = case[['CorpNo','CaseID','cleaned_title','cleaned_description']]

In [50]:
case_cleaned = case_cleaned[~case_cleaned['cleaned_description'].str.match(r'^[ .]*$')] #remove rows only has characters

In [51]:
def remove_emoji(text):
    emoji_pattern = re.compile(r'[\ud800-\udbff][\udc00-\udfff]|[\u2000-\u2BFF\u2600-\u26FF\u2700-\u27BF\ud83c][\udc00-\udfff]')

    # Remove emojis and the problematic character
    text_without_emojis = emoji_pattern.sub('', text)

    return text_without_emojis

In [52]:
%%time 
case_cleaned['cleaned_description'] = case_cleaned['cleaned_description'].parallel_apply(remove_emoji)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=252896), Label(value='0 / 252896')…

CPU times: user 1.32 s, sys: 3.52 s, total: 4.83 s
Wall time: 9.16 s


In [53]:
case_cleaned.tail(30)

Unnamed: 0,CorpNo,CaseID,cleaned_title,cleaned_description
1770280,58018,1466858,Masks,Are we legally able to be fined for not wearin...
1770281,58018,1468714,RFP 2018-03 Avid Contract,"Good Afternoon, Can you please send me the con..."
1770282,58018,1468907,Calendar,I know school just ended but I’m wondering abo...
1770283,58018,1469716,Kathy Miller,"Good Evening,The counseling assistant at the J..."
1770284,58018,1467238,Monday Diploma pick up,"Hello, I had messaged a couple weeks ago about..."
1770285,58018,1470751,Copy of my child's transcripts,"Hello,I need to get a copy of my daughters tra..."
1770286,58018,1472386,Strength and Conditioning,Good morning. My son will be an incoming 9th g...
1770287,58018,1473111,Chromebook,Where do i turn in my chromebook
1770288,58018,1473406,,My name is Toshiba Chapman and i paid for my s...
1770289,58018,1473743,2021 Senior yearbook pictures,"Hello, First time Senior mom here,I was just w..."


In [54]:
case_cleaned['cleaned_description'][830597]

'Can you tell me wo is doing the central office parking lot waiting list pleaseThank youElena Elena Runco, LCSWPittsburgh Public SchoolsDirector of Student Support Services-Social Workers341 South Bellefield Ave.Pittsburgh, PA 15213 From: Runco, Elena Sent: Monday, December 20, 2021 7:40 AMTo: Support parking waiting list Good morning,Can you please tell me who is doing the parking waiting list at central officeThank you!Elena Elena Runco, LCSWPittsburgh Public SchoolsDirector of Student Support Services-Social Workers341 South Bellefield Ave.Pittsburgh, PA 15213 '

In [None]:
district_w_template = cleaned_template['MainCorpNo'].unique()

In [None]:
# filter out cases/communications from districts that don't have any templates 
case_cleaned = case_cleaned[case_cleaned['CorpNo'].apply(lambda x: True if x in district_w_template else False)]
print("case_cleaned:", case_cleaned.shape)
case_cleaned.head()

In [55]:
case_cleaned.to_csv('case_clean.csv')

In [56]:
case_cleaned.shape

(1770268, 4)

### Communication ('Message' column)
- drop NaN
- remove html
- remove email address
- remove website
- drop rows with less than 5 words

In [57]:
# drop NaN
comm = comm.dropna(subset=['Message'])
# after filtering out NaN in 'Message'
comm.shape

(2624538, 4)

In [58]:
%%time
# clean the 'Message' column
comm['filtered_Message'] = comm['Message'].parallel_apply(clean_str)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=374934), Label(value='0 / 374934')…

CPU times: user 6.33 s, sys: 9.24 s, total: 15.6 s
Wall time: 3min 27s


In [59]:
comm['filtered_Message'].tail(30)

3540502    Good evening. I just want to follow up with my...
3540504    Good morning, Thank you for your patience as o...
3540505                                                   10
3540513    I hope I'm sending this correctly. It's hard d...
3540517    Good afternoon,The website allowed me to selec...
3540519                     I’m pleased my comment was read.
3540522    Thank you for reaching out. We sincerely apolo...
3540523    Thank you for reaching out. We sincerely apolo...
3540524    Good morning, Just following up on your reques...
3540525    Thank you for contacting us with your question...
3540526    Please send your proofs of residency to />Plea...
3540527    Hi Kristine, Thank you for attending the Board...
3540528    Thank you for reaching out to Alvin ISD. We ar...
3540529    Buenas tardes y le pedimos una disculpa por la...
3540530    Hi Richard,I will need your Tulane email in or...
3540531    Thank you for your dialogue about TVUSD’s reop...
3540532    Good afternoo

In [60]:
# filter out rows with less than 5 words
comm_cleaned = comm[comm['filtered_Message'].apply(lambda x: len(x.split(' ')) >= 5)].reset_index(drop=True)

In [66]:
comm_cleaned.head(20)

Unnamed: 0,CaseID,CaseCommunicationId,TemplateId,cleaned_message
0,3202497.0,3807147.0,,We cannot email transcripts to anyone because ...
1,3202499.0,3803760.0,,"Good Afternoon,If you are referring to the Ret..."
2,3202500.0,3802660.0,,Thank you for taking part in the conversation ...
3,3202501.0,3802666.0,,All your comment will be forwarded to Mr. Fane...
4,3202503.0,3802273.0,,"Good morning,Thank you for reaching out regard..."
5,3202505.0,3800378.0,,"Dear Nicholas Holstein,Thank you for your inte..."
6,3202510.0,3806611.0,,"9th grade 444742Sent from my iPhone> On Jul 5,..."
7,3202520.0,3800374.0,,Amazing feedback and fast response
8,3202522.0,3851644.0,,Took too long to respond.
9,3202531.0,3811002.0,,Good morning. Give me a call at so we can disc...


In [62]:
comm_cleaned['cleaned_message'] = comm_cleaned['filtered_Message'].parallel_apply(remove_emoji)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=354669), Label(value='0 / 354669')…

In [63]:
comm_cleaned = comm_cleaned[['CaseID', 'CaseCommunicationId','TemplateId','cleaned_message']]

In [64]:
comm_cleaned.shape

(2482683, 4)

In [65]:
comm_cleaned.to_csv('comm_clean.csv')

### Templates ('MessageBody' column)
- drop NaN
- remove html
- remove email address
- remove website
- drop rows with less than 5 words

In [67]:
# drop NaN
template = template.dropna(subset=['MessageBody'])
# after filtering out NaN in 'Description'
template.shape

(6081, 4)

In [68]:
# clean the 'MessageBody' column
template['cleaned_MessageBody'] = template['MessageBody'].apply(clean_str)

In [69]:
# filter out rows with less than 5 words
cleaned_template = template[template['cleaned_MessageBody'].apply(lambda x: len(x.split(' ')) >= 5)].reset_index(drop=True)

In [70]:
cleaned_template.shape

(5971, 5)

In [71]:
cleaned_template.head(30)

Unnamed: 0,TemplateId,Name,MainCorpNo,MessageBody,cleaned_MessageBody
0,1,title,11918,[[Click here]]Login into account <br />\nhello...,[[Click here]]Login into account hello whats u...
1,3,"ipsum pulvinar sit amet. Nulla tortor augue, ...",11918,"<div id=""lipsum"">Login in to account [[Click h...",Login in to account [[Click here]] Sed consect...
2,4,Generated 150 paragraphsffffffffffffffffffffff...,11918,"<div id=""lipsum""><br />\nndimentum turpis urna...","ndimentum turpis urna, eu facilisis leo rutru..."
3,5,aretra eros a tempor volutpat. Fusce et neque ...,11918,"<div id=""lipsum""><br />\nndimentum turpis urna...","ndimentum turpis urna, eu facilisis leo rutru..."
4,35,Waiting on information from customer,12313,"Hello,<br />\n<br />\nI am writing to follow u...","Hello, I am writing to follow up on your recen..."
5,36,Standard Opening and Close,12313,Thank you for contacting the Santa Ana Unified...,Thank you for contacting the Santa Ana Unified...
6,60,Inclement Weather Closure,13454,"Dear Parent,<br />\n<br />\nThe district will ...","Dear Parent, The district will be closed today..."
7,61,Giberson Start of School,13454,Hello! &nbsp;School starts on August 18th. &nb...,Hello! School starts on August 18th. We have o...
8,63,Thank You for your Feedback,13489,"Dear [[Customer’s Name]],<br />\n<br />\nThank...","Dear [[Customer’s Name]], Thank you for taking..."
9,2,Generated 5 paragraphs,11918,"<div id=""lipsum"">\n<p>[[Contact Date]][[Dialog...",[[Contact Date]][[Dialogue Number]][[Submissi...


In [72]:
cleaned_template = cleaned_template.drop(['MessageBody'],axis=1)
cleaned_template.head(50)

Unnamed: 0,TemplateId,Name,MainCorpNo,cleaned_MessageBody
0,1,title,11918,[[Click here]]Login into account hello whats u...
1,3,"ipsum pulvinar sit amet. Nulla tortor augue, ...",11918,Login in to account [[Click here]] Sed consect...
2,4,Generated 150 paragraphsffffffffffffffffffffff...,11918,"ndimentum turpis urna, eu facilisis leo rutru..."
3,5,aretra eros a tempor volutpat. Fusce et neque ...,11918,"ndimentum turpis urna, eu facilisis leo rutru..."
4,35,Waiting on information from customer,12313,"Hello, I am writing to follow up on your recen..."
5,36,Standard Opening and Close,12313,Thank you for contacting the Santa Ana Unified...
6,60,Inclement Weather Closure,13454,"Dear Parent, The district will be closed today..."
7,61,Giberson Start of School,13454,Hello! School starts on August 18th. We have o...
8,63,Thank You for your Feedback,13489,"Dear [[Customer’s Name]], Thank you for taking..."
9,2,Generated 5 paragraphs,11918,[[Contact Date]][[Dialogue Number]][[Submissi...


In [73]:
cleaned_template.to_csv('template_clean.csv')

## Merging

In [31]:
case_cleaned = pd.read_csv('case_clean.csv', index_col=['Unnamed: 0'])
comm_cleaned = pd.read_csv('comm_clean.csv', index_col=['Unnamed: 0'])
cleaned_template = pd.read_csv('template_clean.csv', index_col=['Unnamed: 0'])

In [30]:
case_cleaned.shape

(1699508, 4)

In [34]:
#merge dataset
full_df_clean = case_cleaned.merge(comm_cleaned, on = 'CaseID', how = 'inner').merge(cleaned_template, on = 'TemplateId', how = 'left')
print(full_df_clean.shape)
full_df_clean.head(3)

(2143009, 10)


Unnamed: 0,CorpNo,CaseID,cleaned_title,cleaned_description,CaseCommunicationId,TemplateId,cleaned_message,Name,MainCorpNo,cleaned_MessageBody
0,11918,3332419,New login to Twitter from Chrome on Windows,We noticed a login to your account New login ...,3950727.0,,TwitterWe noticed a login to your account @Pru...,,,
1,11918,3264126,Security alert for,This is a copy of a security alert sent to If ...,3899409.0,,This is a copy of a security alert sent to is ...,,,
2,11918,3264126,Security alert for,This is a copy of a security alert sent to If ...,3900941.0,,This is a copy of a security alert sent to is ...,,,


In [35]:
full_df_clean.to_csv('full_df_w_no_template.csv', index=False)

In [78]:
full_df_clean['cleaned_description'][0]

"Parker Wix's chromebook will not turn on without his charger being plugged in. It will not hold a charge. "

In [79]:
full_df_clean['cleaned_message'][0]

'Jaclyn Nicole Wix If you submit a work order to the technology help desk, that will be the quickest way to get your problem resolved. To do that now, click HERE. I hope you have a blessed day! Jamie Hubbell Wilson County Schools '

In [80]:
full_df_clean['cleaned_MessageBody'][0]

'[[Customer’s Name]] If you submit a work order to the technology help desk, that will be the quickest way to get your problem resolved. To do that now, click HERE'

In [82]:
full_df_clean[full_df_clean['CorpNo'] != full_df_clean['MainCorpNo']].head(10)

Unnamed: 0,CorpNo,CaseID,cleaned_title,cleaned_description,CaseCommunicationId,TemplateId,cleaned_message,Name,MainCorpNo,cleaned_MessageBody
7069,11918,3641375,Phone Call From +17064578206,Test for Phone Dialogue Test,4243966.0,695.0,"Hello Prutha test,This template is made by geo...",Bus Safety Response,32114,Thank you for taking the time to send us the ...
54650,65469,1994514,Phone Call From +14016485556,Good afternoon this is the guardian of Ariana'...,2489884.0,742.0,"Hello, We are sending this message because it ...",PreK Registration for 18'-19' calendar year,31460,[[Customer’s Name]] Thanks for reaching out to...
54651,65469,1993735,Phone Call From +14014299544,My name is Christine hopper I was calling beca...,2487129.0,742.0,"Hello, We are sending this message because it ...",PreK Registration for 18'-19' calendar year,31460,[[Customer’s Name]] Thanks for reaching out to...
54652,65469,1993737,Phone Call From +14014810164,I mean I'm of lovely Pereira's I have a son th...,2487128.0,742.0,"Hello, We are sending this message because it ...",PreK Registration for 18'-19' calendar year,31460,[[Customer’s Name]] Thanks for reaching out to...
54653,65469,1993738,Phone Call From +14013169333,Hi My name is Kenny and Richard's my phone num...,2487127.0,742.0,"Hello, We are sending this message because it ...",PreK Registration for 18'-19' calendar year,31460,[[Customer’s Name]] Thanks for reaching out to...
54654,65469,1993739,Phone Call From +18605188382,I calling about the survey 4 seconds after sch...,2487126.0,742.0,"Hello, We are sending this message because it ...",PreK Registration for 18'-19' calendar year,31460,[[Customer’s Name]] Thanks for reaching out to...
54655,65469,1993740,Phone Call From +14016444544,Yes hi Bill My name is Joshua hell Ariel. I ha...,2487125.0,742.0,"Hello, We are sending this message because it ...",PreK Registration for 18'-19' calendar year,31460,[[Customer’s Name]] Thanks for reaching out to...
54656,65469,1993741,Phone Call From +14019411372,My name is Francis burgles you can reach me at...,2487124.0,742.0,"Hello, We are sending this message because it ...",PreK Registration for 18'-19' calendar year,31460,[[Customer’s Name]] Thanks for reaching out to...
54657,65469,1993931,Phone Call From +14015277748,Hi My name is Kim McCall if you can call me up...,2489473.0,742.0,"Hello, We are sending this message because it ...",PreK Registration for 18'-19' calendar year,31460,[[Customer’s Name]] Thanks for reaching out to...
54658,65469,1993931,Phone Call From +14015277748,Hi My name is Kim McCall if you can call me up...,2489663.0,742.0,"Hello, We are sending this message because it ...",PreK Registration for 18'-19' calendar year,31460,[[Customer’s Name]] Thanks for reaching out to...


In [84]:
# filtering out rows where the users didn't make a response ('CaseCommunicationId' == NaN)
full_df_clean = full_df_clean.dropna(subset=['CaseCommunicationId'])
print(full_df_clean.shape)

(67018, 10)
