In [8]:
import pandas as pd
import numpy as np
import zipfile
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import datetime


# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
# Default (width, height) for every figure produced in this notebook.
plt.rcParams['figure.figsize'] = (12, 9)

# Importing Data From JSON

In [9]:
import json
from pandas.io.json import json_normalize

# raw_data = pd.read_json("json/data.json", orient="records") ; works but fewer columns

# json_normalize flattens nested objects into dotted column names, which is how
# sub-fields such as 'share.link' and 'share.share_text' become regular columns.
with open('res/json/feb-16-2019.json') as data_file:
    parsed_export = json.load(data_file)
raw_data = json_normalize(parsed_export)

raw_data.columns

Index(['audio_files', 'call_duration', 'content', 'files', 'gifs', 'missed',
       'payment_info.amount', 'payment_info.completedTime',
       'payment_info.creationTime', 'payment_info.currency',
       'payment_info.receiverName', 'payment_info.senderName', 'photos',
       'plan.timestamp', 'plan.title', 'reactions', 'sender_name',
       'share.link', 'share.share_text', 'sticker.uri', 'timestamp_ms', 'type',
       'videos'],
      dtype='object')

In [10]:
# Number of messages per sender in the JSON export.
raw_data['sender_name'].value_counts()

Joyce Luong     19185
Andrew Huang    15920
Name: sender_name, dtype: int64

In [11]:
def decode_emoji(emoji):
    """Undo the export's mojibake for a single string.

    The text was UTF-8 that got read as Latin-1, so the original bytes are
    recovered by encoding back to Latin-1 and are then decoded as UTF-8.
    """
    original_bytes = emoji.encode('latin1')
    return original_bytes.decode('utf8')

def parse_reactions(r):
    r"""Parse the reactions on one message of a conversation between exactly two people.

    INPUT:  [{'reaction': 'ð\x9f\x91\x8e', 'actor': 'Name1'},
             {'reaction': 'ð\x9f\x98¢', 'actor': 'Name2'}]

    OUTPUT: ("Name1", 👎, "Name2", 😢)

    Parameters
    ----------
    r : list of dict, or any non-list value
        Each dict must provide the keys 'actor' and 'reaction'. Anything that
        is not a list is treated as "no reactions".

    Returns
    -------
    tuple
        ``(actor1, reaction1, actor2, reaction2)``. Slots without a reaction
        are ``None``; an empty list therefore yields four ``None`` values
        instead of raising IndexError. Only the first two reactions are used.
    """
    # Messages without reactions do not carry a list (presumably NaN after
    # normalisation -- verify against the export), so anything else means "none".
    if not isinstance(r, list):
        return None, None, None, None

    slots = []
    for entry in r[:2]:
        slots.append(entry['actor'])
        slots.append(decode_emoji(entry['reaction']))

    # Pad so that callers can always unpack exactly four values.
    slots.extend([None] * (4 - len(slots)))
    return tuple(slots)
    
# Split every message's reactions into four flat columns: two (actor, emoji) pairs.
parsed_reactions = raw_data['reactions'].map(parse_reactions)
first_actors, first_emojis, second_actors, second_emojis = zip(*parsed_reactions)
raw_data['actor1'] = first_actors
raw_data['reaction1'] = first_emojis
raw_data['actor2'] = second_actors
raw_data['reaction2'] = second_emojis
raw_data.head(3)

Unnamed: 0,audio_files,call_duration,content,files,gifs,missed,payment_info.amount,payment_info.completedTime,payment_info.creationTime,payment_info.currency,...,share.link,share.share_text,sticker.uri,timestamp_ms,type,videos,actor1,reaction1,actor2,reaction2
0,,,There are some techniques I wanna try that I f...,,,,,,,,...,,,,1550344792535,Generic,,Andrew Huang,😮,,
1,,,tbh I wanna have another go at fishy stickers,,,,,,,,...,,,,1550344764151,Generic,,,,,
2,,,:3 no worries,,,,,,,,...,,,,1550344755800,Generic,,,,,


# Importing Data from Cleaned HTML

In [14]:
# Load the messages stored in the CSV under res/html.
html_csv_path = 'res/html/dec-15-to-sep-18.csv'
from_html = pd.read_csv(html_csv_path)
from_html.columns

Index(['sender_name', 'timestamp', 'content', 'actor1', 'reaction1', 'actor2',
       'reaction2'],
      dtype='object')

In [15]:
# Number of messages per sender in the CSV data.
from_html['sender_name'].value_counts()

Joyce Luong     62943
Andrew Huang    51671
Name: sender_name, dtype: int64

In [16]:
# Preview the first three rows of the CSV data.
from_html.head(3)

Unnamed: 0,sender_name,timestamp,content,actor1,reaction1,actor2,reaction2
0,Andrew Huang,2018-09-19 18:01:00,Ok I guess I can download it on mine lol,Joyce Luong,😍,,
1,Joyce Luong,2018-09-19 18:00:00,It’s empty,,,,
2,Joyce Luong,2018-09-19 18:00:00,It won’t pull,,,,


# Combining Datasets

In [17]:
# Stack the CSV-derived and JSON-derived messages into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index. The previous
# `data.reindex()` call discarded its return value and so did nothing, which
# left the duplicate index labels of the two source frames in place.
data = pd.concat([from_html, raw_data], sort=False, ignore_index=True)
data.columns

Index(['sender_name', 'timestamp', 'content', 'actor1', 'reaction1', 'actor2',
       'reaction2', 'audio_files', 'call_duration', 'files', 'gifs', 'missed',
       'payment_info.amount', 'payment_info.completedTime',
       'payment_info.creationTime', 'payment_info.currency',
       'payment_info.receiverName', 'payment_info.senderName', 'photos',
       'plan.timestamp', 'plan.title', 'reactions', 'share.link',
       'share.share_text', 'sticker.uri', 'timestamp_ms', 'type', 'videos'],
      dtype='object')

In [18]:
# Number of messages per sender after combining both sources.
data['sender_name'].value_counts()

Joyce Luong     82128
Andrew Huang    67591
Name: sender_name, dtype: int64

## Creating Date Column

In [19]:
# 'timestamp_ms' holds epoch milliseconds: interpret them as UTC, convert to
# Pacific time, then replace the column with a human-readable string.
display_format = '%a, %b %d, %Y @ %r'
json_times_utc = pd.to_datetime(data['timestamp_ms'], unit='ms').dt.tz_localize('UTC')
json_times_pacific = json_times_utc.dt.tz_convert('America/Los_Angeles')
data['timestamp_ms'] = json_times_pacific.dt.strftime(display_format)

In [20]:
# Parse the 'timestamp' strings, then render them in the same display format
# that is used for the 'timestamp_ms' column.
html_times = pd.to_datetime(data['timestamp'])
data['timestamp'] = html_times.dt.strftime('%a, %b %d, %Y @ %r')

In [21]:
from dateutil import parser
# Each row has a formatted date in exactly one of the two columns. The parser
# below expects the missing side to be the literal text 'NaT'. Depending on the
# pandas version, `.dt.strftime` yields either the string 'NaT' or NaN for a
# missing date; NaN would turn the whole concatenation into NaN, so normalise
# missing values to 'NaT' before joining the strings.
temp = data['timestamp_ms'].fillna('NaT') + data['timestamp'].fillna('NaT')

def ugh(s):
    """Parse the date out of a string that is a formatted date glued to 'NaT'.

    The 'NaT' marker is expected either in front of the date or behind it;
    it is cut off and the remainder is parsed into a timestamp.
    """
    marker = 'NaT'
    if s[:3] == marker:
        date_text = s[len(marker):]
    else:
        # Marker assumed to be the trailing three characters.
        date_text = s[:-len(marker)]
    return pd.to_datetime(date_text, format='%a, %b %d, %Y @ %I:%M:%S %p')
    
# Parse every combined string in `temp` with `ugh`, one element at a time.
rip = temp.apply(ugh)

In [None]:
# Store the parsed timestamps as the single 'date' column.
data['date'] = rip
# Renumber the rows 0..n-1. drop=True discards the old labels instead of
# inserting them as an 'index' column, which keeps this cell safe to re-run
# (an in-place reset without drop fails once an 'index' column exists).
data = data.reset_index(drop=True)

In [None]:
# NOTE(review): leftover scratch output; nothing below depends on this cell.
print('kill me')

# Initial Impressions

In [None]:
# # only look at 'generic' messages
# data = combined[combined['type'] == 'Generic'].copy()

# print("NOTE: Lost these many non-generic rows: {} or {}%". \
#       format(len(raw_data) - len(data), round(100*(len(raw_data) - len(data))/len(raw_data), 3)))

# Keep only the columns used by the analysis below, newest message first.
columns_of_interest = ['sender_name', 'date', 'content', 'actor1', 'reaction1', 'actor2', 'reaction2']
data = data.loc[:, columns_of_interest].sort_values('date', ascending=False)

data.head(5)

## Number of Messages Sent Per Person

In [None]:
# Total number of messages sent by each person.
data['sender_name'].value_counts()

In [None]:
# Derive the calendar fields that the plots below group by.
message_dates = data['date'].dt
data['month'] = message_dates.month
data['hour'] = message_dates.hour
data['year_month'] = message_dates.strftime('%Y-%m')
data.head()

## Removing December 30, 2018 Fight

In [None]:
# data = data[~((data['date'].dt.year == 2018) & (data['date'].dt.month == 12) & (data['date'].dt.day == 30))]
# data['sender_name'].value_counts()

In [None]:
# Messages per calendar year. The years go on the x-axis and the counts on the
# y-axis, so the labels must match (they were swapped before). The values are
# passed as `x=` explicitly because newer seaborn releases treat the first
# positional argument as `data`.
ax = sns.countplot(x=data['date'].dt.year)
ax.set_xlabel("Year")
ax.set_ylabel("Number of Messages")
ax.set_title("Number of Messages By Year");

In [None]:
# Messages per calendar month (1-12), pooled over all years. The values are
# passed as `x=` explicitly because newer seaborn releases treat the first
# positional argument as `data`.
ax = sns.countplot(x=data['date'].dt.month)
ax.set_xlabel("Month")
ax.set_ylabel("Number of Messages")
ax.set_title("Number of Messages By Month");

In [None]:
# Messages per hour of the day, split by sender.
data['hour'] = data['date'].dt.hour
ax = sns.countplot(x='hour', hue='sender_name', data=data)
# Rotate the tick labels only after this cell's axes exist. Doing it first
# acted on whatever `ax` was left over from an earlier cell (or raised
# NameError on a fresh kernel).
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax.set_title("Number of Messages By Hour");

In [None]:
# Sorted 'YYYY-MM' labels, so the x-axis runs in chronological order.
year_month = data['year_month'].unique().tolist()
year_month.sort()
ax = sns.countplot(data=data, x='year_month', hue='sender_name', order=year_month)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax.set(title="Number of Messages By Month");

In [None]:
def count_chars(row):
    """Return the total number of characters over the non-missing values of ``row``.

    Parameters
    ----------
    row : iterable
        Values to measure; each non-missing value is converted with ``str()``.

    Returns
    -------
    int
        Sum of the string lengths. Missing values (None / NaN) contribute 0;
        previously each one was stringified to 'nan' / 'None' and silently
        added 3 or 4 characters to the total.
    """
    return sum(len(str(value)) for value in row if pd.notna(value))
# Work on just the three columns needed for the per-month character totals.
reduced = data.loc[:, ['year_month', 'sender_name', 'content']]

# Group by month and sender (the keys are passed as Series taken from `reduced`
# itself), aggregate each group with count_chars, then move the group keys back
# into ordinary columns.
char_len = reduced.groupby([reduced['year_month'], reduced['sender_name']]).agg(count_chars)
char_len.reset_index(inplace=True)

In [None]:
# One bar per (month, sender) showing the aggregated 'content' value from char_len.
ax = sns.barplot(data=char_len, x='year_month', y='content', hue='sender_name')
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax.set(title="Character Count by Month", ylabel="Characters", xlabel="Month");

In [None]:
# Messages whose text contains 'love' (case-sensitive substring match; rows
# with missing content never match because of na=False).
contains_love = data['content'].str.contains('love', na=False)
love = data.loc[contains_love, ['year_month', 'sender_name', 'content']]

# Count matching messages per month and sender. The grouping keys come from the
# filtered frame's own columns, so this cell does not depend on a helper frame
# built in another cell or on index alignment with it. The unused `count`
# lambda that used to be defined here has been removed.
love = love.groupby(['year_month', 'sender_name']).count()
love.reset_index(inplace=True)

ax = sns.barplot(x='year_month', y='content', hue='sender_name', data=love)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax.set_title("How Many Times 'Love' Was Used");
ax.set_ylabel("Usage")
ax.set_xlabel("Month");

In [None]:
# Messages whose text contains 'sex' (case-sensitive substring match; rows
# with missing content never match because of na=False).
contains_sex = data['content'].str.contains('sex', na=False)
sex = data.loc[contains_sex, ['year_month', 'sender_name', 'content']]

# Count matching messages per month and sender. The grouping keys come from the
# filtered frame's own columns, so this cell does not depend on a helper frame
# built in another cell or on index alignment with it. The unused `count`
# lambda that used to be defined here has been removed.
sex = sex.groupby(['year_month', 'sender_name']).count()
sex.reset_index(inplace=True)

ax = sns.barplot(x='year_month', y='content', hue='sender_name', data=sex)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax.set_title("How Many Times 'Sex' Was Used");
ax.set_ylabel("Usage")
ax.set_xlabel("Month");

In [None]:
# Preview the first rows of the final frame.
data.head()

In [None]:
# To save everything to .json
# data.to_json('output/feb-17-2019.json')