# Whatsapp Export

In [80]:
# import packages
import pandas as pd
import numpy as np

In [81]:
# CONFIGURATION
# Names list - CHANGE THESE to include any personal names to remove.
names = { 
            'old_name' : 'new_name',
            'Joe' : 'J',
            'Dom' : 'D', 
}

# Censor list - CHANGE THESE to include any words you want to hide, I'd suggest including the above names.
censor_list = [
    ['word_to_be_censored','censored_word'],
    ['best','good'],
    ['amazing','alright'],
]

In [82]:
# Read in CSV
df = pd.read_csv('parsed_export.csv',sep="|")
# convert to datetime
df['datetime']=pd.to_datetime(df['datetime'])
df.head()

Unnamed: 0,datetime,name,message
0,2019-02-21 09:09:00,Joe,Wow what an amazing chat!
1,2019-02-21 09:10:00,Dom,"I agree, it is amazing!"
2,2019-02-21 09:12:00,Joe,You're the best!
3,2019-02-21 09:12:00,Dom,"No, you are the best!"
4,2019-02-21 09:12:00,Dom,<Media omitted>


# Censoring
In the interest of privacy I would suggest removing any personal details. The below code allows you to censor names, locations, ideas and whatever else you like.

In [83]:
# Censor name and words

# Replace messages
def censor(message):
    for censor in censor_list:
        message = message.replace(censor[0],censor[1])
    
    return message

# remove any spaces
df['name'] = df['name'].str.strip()
# Replace names
df = df.replace(
    {
        'name' : names
    }
)

# censor all messages
df['message'] = df['message'].apply(censor)
df.head()

Unnamed: 0,datetime,name,message
0,2019-02-21 09:09:00,J,Wow what an alright chat!
1,2019-02-21 09:10:00,D,"I agree, it is alright!"
2,2019-02-21 09:12:00,J,You're the good!
3,2019-02-21 09:12:00,D,"No, you are the good!"
4,2019-02-21 09:12:00,D,<Media omitted>


# Gotchas
Every Media message has been replaced with "<Media Omitted>", we need to remove these or they might be our most popular words.

Capitalized words will show up twice too, so "Joe" and "joe" might both show up in our word cloud. To avoid this we can capitalize, or lowercase everything.

In [84]:
# Drop rows with media messages
df = df[df['message'] != '<Media omitted>']

# Capitalize everything
df['message'] = df['message'].apply(lambda x: x.title())
df.head()

Unnamed: 0,datetime,name,message
0,2019-02-21 09:09:00,J,Wow What An Alright Chat!
1,2019-02-21 09:10:00,D,"I Agree, It Is Alright!"
2,2019-02-21 09:12:00,J,You'Re The Good!
3,2019-02-21 09:12:00,D,"No, You Are The Good!"


# Stop words
If we now look at our most common words we have what are known as "stop words" in NLP. These are commonly used words that are considered unimportant to the content of a message. Think words like "and, to,I". These are considered important in a lot of Natural Language Processing now, but they make for rubbish word clouds. NLTK can help us here.

In [85]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')

def remove_stopwords(message):
  text_tokens = word_tokenize(message)
  # remove stopwrods and punctuation
  # all words need to be uppercased
  return ' '.join([word.capitalize() for word in text_tokens if not word in stopwords.words() if word.isalnum()])

remove_stopwords("You're can get now just like , one take back please joe")

[nltk_data] Downloading package punkt to /Users/joern/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/joern/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


'You Get Like Back Please Joe'

In [86]:
# Remove the stopwords from our dataframe
df['message_without_stopwords'] = df['message'].apply(remove_stopwords)
df.head()

Unnamed: 0,datetime,name,message,message_without_stopwords
0,2019-02-21 09:09:00,J,Wow What An Alright Chat!,Wow What An Alright Chat
1,2019-02-21 09:10:00,D,"I Agree, It Is Alright!",I Agree It Is Alright
2,2019-02-21 09:12:00,J,You'Re The Good!,The Good
3,2019-02-21 09:12:00,D,"No, You Are The Good!",No You Are The Good


In [87]:
# Check out this great word cloud site. This actually does a lot of the work for you in removing stop words, and doing the frequency count. In order to use this we need to get an output file of just our text messages. 
df['message_without_stopwords'].to_csv('output/Formatted_messages.csv',index=False)
# Maybe we also want to make individual word clouds per "friend"

In [79]:
# Or generate a file per user
for name in names.values():
    df[df.name == name]['message_without_stopwords'].to_csv('output/' + name +'_messages.csv', index=False,header=None)

# WordCloud time
This is everything you need for my better version of a wordcloud.