In [None]:
from IPython.core.debugger import set_trace
from bs4 import BeautifulSoup
import pandas as pd
import re
import lxml
from wordcloud import WordCloud
import matplotlib.pyplot as plt

#### Parse the Search activity file and save it to a pandas DataFrame.

In [None]:
%%time
f = 'data/google_history/My Activity/Search/My Activity.html'

contents = open(f, 'r')
soup = BeautifulSoup(contents,'lxml')
contents.close()

In [None]:
# A list-valued class filter makes Beautiful Soup match a <div> that carries
# any one of these CSS classes.
wanted_classes = ['content-cell', 'mdl-cell','mdl-cell--6-col','mdl-typography--body-1']
divs = soup.find_all('div', attrs={'class': wanted_classes})

In [None]:
# Flatten every matched <div> to its text content.
d = [div.text for div in divs]

In [None]:
# Preview only the first few entries: echoing the whole list would dump the
# entire personal search history into the notebook output.
d[:10]

In [None]:
# Keep every fifth entry, starting with the first.
# NOTE(review): presumably each activity record yields five matched <div>s — confirm.
# This rebinds `d`, so re-running the cell thins the list again.
d = d[0::5]

In [None]:
def _parse_activity_entry(entry):
    """Slice one flattened activity string into ``(type, action, text, date)``.

    The slicing assumes the entry looks like
    ``<6-char type><action>\\xa0<query><date>, ...``
    (NOTE(review): inferred from the offsets used below — confirm against the export).

    Parameters
    ----------
    entry : str
        Text of one activity record.

    Returns
    -------
    tuple of str
        ``(type, action, text, date)``. When the entry contains no non-breaking
        space there is nothing to slice, so the placeholders ``'No action'``,
        ``''`` and ``'No date'`` are returned for the last three fields.
    """
    entry_type = entry[:6] or 'No type'

    parts = entry.split('\xa0')
    if len(parts) < 2:
        # Without the separator, parts[1] does not exist; return placeholders so
        # every column still receives exactly one value for this entry.
        return entry_type, 'No action', '', 'No date'

    action = parts[0][6:]

    # Everything up to the first comma: the query immediately followed by the date.
    _full = parts[1].split(',')[0]
    # Drop the trailing 10 characters (the date part) to leave the query text.
    _text = _full[:-10]
    # A digit left at the end is treated as the first digit of a two-digit day
    # and removed as well.
    if re.search(r'\d+$', _text):
        _text = _text[:-1]

    # The last 8 characters are kept as the date (later parsed with '%b %Y').
    return entry_type, action, _text, _full[-8:]


data = {'type': [], 'action': [], 'text': [], 'date': []}

# Iterate over every entry; the previous range(len(d)-1) skipped the last one.
for entry in d:
    entry_type, entry_action, entry_text, entry_date = _parse_activity_entry(entry)
    data['type'].append(entry_type)
    data['action'].append(entry_action)
    data['text'].append(entry_text)
    data['date'].append(entry_date)

In [None]:
# Show a short preview of each column instead of echoing the full search history.
{column: values[:5] for column, values in data.items()}

In [None]:
# One column per key of `data`, one row per parsed entry.
df = pd.DataFrame(data)

In [None]:
# Do not write the positional index: it carries no information and would come
# back as a stray "Unnamed: 0" column when the file is read again.
df.to_csv('data/google_history/my_search_history.csv', index=False)

#### Load preprocessed data

In [None]:
# Reload the table written by the parsing section above.
csv_path = 'data/google_history/my_search_history.csv'
df = pd.read_csv(csv_path)

In [None]:
# Empty queries are read back from the CSV as NaN; blank them before the str
# conversion so they do not turn into the literal text "nan".
df["text"] = df["text"].fillna("").map(str)

In [None]:
def remove_URL(text):
    """Return ``text`` with ``http(s)://...`` links and ``www.``-prefixed addresses deleted."""
    return re.sub(r"https?://\S+|www\.\S+", "", text)

In [None]:
# Strip links from every query.
df["text"] = df["text"].map(remove_URL)

In [None]:
# Drop rows whose text is empty. Take an explicit copy so the column assignments
# in later cells modify an independent frame instead of a filtered slice
# (which pandas flags with SettingWithCopyWarning).
df = df[df.text != ''].copy()

In [None]:
import string

def remove_punct(text):
    """Return ``text`` without any character listed in ``string.punctuation``."""
    return "".join(ch for ch in text if ch not in string.punctuation)

In [None]:
# Strip ASCII punctuation from every query.
df["text"] = df["text"].map(remove_punct)

In [None]:
# Parse "<abbreviated month> <year>" strings; values that do not match become NaT.
parsed_dates = pd.to_datetime(df['date'], format='%b %Y', errors='coerce')
df['date'] = parsed_dates

In [None]:
df = df.copy()
# Number of whitespace-separated words in each query.
df["title_words_count"] = df["text"].str.split().map(len)

In [None]:
# Calendar components of the parsed date.
date_parts = df['date'].dt
df['month'] = date_parts.month
df['year'] = date_parts.year

In [None]:
# Peek at the first four cleaned rows.
df.head(n=4)

Generating n-grams from the whole corpus:

In [None]:
from nltk.corpus import stopwords

def generate_ngrams(text, n_gram=1, stop=True):
    """
    Build the list of n-grams found in ``text``.

    The text is lower-cased and split on single spaces; empty pieces are
    discarded, and English stopwords are discarded too when ``stop`` is true.
    Each n-gram is ``n_gram`` consecutive remaining words joined by a space.
    """
    excluded = set(stopwords.words("english")) if stop else set()

    words = [w for w in text.lower().split(" ") if w and w not in excluded]

    # Zipping the word list against its own shifted copies yields sliding windows.
    windows = zip(*(words[offset:] for offset in range(n_gram)))
    return [" ".join(window) for window in windows]

In [None]:
from collections import defaultdict

unigrams = defaultdict(int)
bigrams = defaultdict(int)
trigrams = defaultdict(int)
fourgrams = defaultdict(int)

# Count n-grams of size 1 to 4 over every query, one full pass per size.
for size, counts in enumerate((unigrams, bigrams, trigrams, fourgrams), start=1):
    for text in df.text:
        for gram in generate_ngrams(text, n_gram=size):
            counts[gram] += 1

In [None]:
# Turn each count dict into a two-column frame: 'index' holds the n-gram, 0 its count.
unigrams, bigrams, trigrams, fourgrams = (
    pd.DataFrame.from_dict(counts, orient='index').reset_index()
    for counts in (unigrams, bigrams, trigrams, fourgrams)
)

In [None]:
# Peek at the first four trigram rows.
trigrams.head(n=4)

In [None]:
# Pool the bi-, tri- and four-gram tables (unigrams are not included) and
# order them by count, highest first.
allgrams = pd.concat([bigrams, trigrams, fourgrams]).sort_values(0, ascending=False)

# Join the words of each n-gram with underscores.
allgrams['index'] = allgrams['index'].replace(' ', '_', regex=True)

corpus = allgrams['index'].to_list()

In [None]:
# Weight each of the 50 most frequent n-grams by its count. Joining the labels
# into one string would hand every n-gram to WordCloud exactly once, so the
# counts computed above would have no effect on the picture.
top_ngrams = allgrams.head(50)
ngram_weights = dict(zip(top_ngrams['index'], top_ngrams[0]))

word_cloud = WordCloud(
    background_color="white", max_font_size=50, min_word_length=3, scale=2
).generate_from_frequencies(ngram_weights)

In [None]:
# Render the word cloud image on a single full-size axes of figure 1.
plt.figure(1, figsize=(16, 8))
cloud_ax = plt.subplot(1, 1, 1)
cloud_ax.imshow(word_cloud, interpolation='bilinear')


In [None]:
# One row per year, with that year's queries gathered into a list.
agg_df = df.groupby('year').agg({'text': lambda queries: queries.tolist()}).reset_index()

# Replace spaces with underscores in each query, then keep only the queries
# that contain an underscore afterwards.
agg_df['text'] = agg_df.text.map(
    lambda queries: [
        joined
        for joined in (query.replace(' ', '_') for query in queries)
        if '_' in joined
    ]
)

In [None]:
# Map each year (as a string) to its 25 most frequent queries and their counts.
yearly_data = {}
for row in agg_df.itertuples(index=False):
    top_queries = pd.Series(row.text, dtype='object').value_counts().head(25)
    # `year` is a float whenever some date failed to parse (NaT makes the column
    # float); cast to int so the key reads "2020" rather than "2020.0".
    # groupby drops NaN keys by default, so the cast cannot meet a NaN here.
    yearly_data[str(int(row.year))] = top_queries.to_dict()

In [None]:
# Draw one word cloud per year, titled with the year.
for i, (year, query_counts) in enumerate(yearly_data.items()):
    if not query_counts:
        # WordCloud raises ValueError when it has nothing to draw; skip such years.
        continue

    # Size each query by how often it was searched. Joining the dict keys into
    # a string would present every query once and ignore the counts.
    word_cloud = WordCloud(
        background_color="white", max_font_size=40, min_word_length=3, scale=2
    ).generate_from_frequencies(query_counts)

    plt.figure(i, figsize=(16, 8))
    plt.suptitle(year)
    plt.subplot(1, 1, 1)
    plt.imshow(word_cloud, interpolation='bilinear')