# Whatsapp Export

In [12]:
# import packages
import pandas as pd
import numpy as np

In [13]:
# CONFIGURATION
# Names list - CHANGE THESE to include any personal names to remove.
names = { 
            'old_name' : 'new_name',
            'Joe' : 'J',
            'Dom' : 'D', 
}

# Censor list - CHANGE THESE to include any words you want to hide, I'd suggest including the above names.
censor_list = [
    ['word_to_be_censored','censored_word'],
    ['Joe','J'],
    ['Dom','D'],
]

In [15]:
# Read in CSV
df = pd.read_csv('output.csv',sep="|")
# convert to datetime
df['datetime']=pd.to_datetime(df['datetime'])
df.head()

Unnamed: 0,datetime,name,message
0,2019-02-21 09:09:00,Joe,Hey guys this is it
1,2019-02-21 09:10:00,Dom,I’m there
2,2019-02-21 09:12:00,Joe,You're the content kid
3,2019-02-21 09:12:00,Dom,No you are a content kid
4,2019-02-21 09:12:00,Dom,Haha just joking but that’s the kind of banter...


# Censoring
In the interest of privacy I would suggest removing any personal details. The below code allows you to censor names, locations, ideas and whatever else you like.

In [16]:
# Censor name and words

# Replace messages
def censor(message):
    message = str(message)
    for censor in censor_list:
        message = message.replace(censor[0],censor[1])
    
    return message

# remove any spaces
df['name'] = df['name'].str.strip()
# Replace names
df = df.replace(
    {
        'name' : names
    }
)

# censor all messages
df['message'] = df['message'].apply(censor)
df.head()

Unnamed: 0,datetime,name,message
0,2019-02-21 09:09:00,J,Hey guys this is it
1,2019-02-21 09:10:00,D,I’m there
2,2019-02-21 09:12:00,J,You're the content kid
3,2019-02-21 09:12:00,D,No you are a content kid
4,2019-02-21 09:12:00,D,Haha just joking but that’s the kind of banter...


# Gotchas
Every Media message has been replaced with "<Media Omitted>", we need to remove these or they might be our most popular words.

Capitalized words will show up twice too, so "Joe" and "joe" might both show up in our word cloud. To avoid this we can capitalize, or lowercase everything.

In [17]:
# Drop rows with media messages
df = df[df['message'] != '<Media omitted>']

# Capitalize everything
df['message'] = df['message'].apply(lambda x: x.title())
df.head()

Unnamed: 0,datetime,name,message
0,2019-02-21 09:09:00,J,Hey Guys This Is It
1,2019-02-21 09:10:00,D,I’M There
2,2019-02-21 09:12:00,J,You'Re The Content Kid
3,2019-02-21 09:12:00,D,No You Are A Content Kid
4,2019-02-21 09:12:00,D,Haha Just Joking But That’S The Kind Of Banter...


# Stop words
If we now look at our most common words we have what are known as "stop words" in NLP. These are commonly used words that are considered unimportant to the content of a message. Think words like "and, to,I". These are considered important in a lot of Natural Language Processing now, but they make for rubbish word clouds. NLTK can help us here.

In [18]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')

def remove_stopwords(message):
  text_tokens = word_tokenize(message)
  # remove stopwrods and punctuation
  # all words need to be uppercased
  return ' '.join([word.capitalize() for word in text_tokens if not word in stopwords.words() if word.isalnum()])

remove_stopwords("You're can get now just like , one take back please joe")

[nltk_data] Downloading package punkt to /Users/joern/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/joern/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


'You Get Like Back Please Joe'

In [19]:
df.head()

Unnamed: 0,datetime,name,message
0,2019-02-21 09:09:00,J,Hey Guys This Is It
1,2019-02-21 09:10:00,D,I’M There
2,2019-02-21 09:12:00,J,You'Re The Content Kid
3,2019-02-21 09:12:00,D,No You Are A Content Kid
4,2019-02-21 09:12:00,D,Haha Just Joking But That’S The Kind Of Banter...


In [None]:
df[df.name=='Andy Campbell'].message.str.split(expand=True).stack().value_counts(100)

In [None]:
# Remove the stopwords from our dataframe
df['message_without_stopwords'] = df['message'].apply(remove_stopwords)
df.head()

In [14]:
# Check out this great word cloud site. This actually does a lot of the work for you in removing stop words, and doing the frequency count. In order to use this we need to get an output file of just our text messages. 
df['message_without_stopwords'].to_csv('output/Formatted_messages.csv',index=False)
# Maybe we also want to make individual word clouds per "friend"

  


In [15]:
# Or generate a file per user
for name in names.values():
    df[df.name == name]['message_without_stopwords'].to_csv('output/' + name +'_messages.csv', index=False,header=None)

  This is separate from the ipykernel package so we can avoid doing imports until


# WordCloud time
This is everything you need for my better version of a wordcloud.