In [1]:
from textblob import TextBlob
import pandas as pd
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer
import altair as alt
alt.data_transformers.disable_max_rows()
import numpy as np
from LOCO_TLPA import config

In [8]:
# kw_model = KeyBERT()
# kw_model.extract_keywords(docs=[txt], use_maxsum=True, vectorizer=KeyphraseCountVectorizer(), stop_words="english", top_n=20)

In [70]:
# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO, filename="preprocessing.log")

In [2]:
df = pd.read_csv(
    "data/enron/raw.csv",
    usecols=["Message-ID", "Date", "content"],
    parse_dates=["Date"],
).rename(columns={"Message-ID": "document", "Date": "date"})
df["document"] = df["document"].str[1:23].str.replace(".", "")


In [3]:
metadata = pd.read_csv("data/enron/metadata.csv", parse_dates=["date"])
metadata = metadata[(metadata["date"] < pd.to_datetime(config["start_date"] + pd.Timedelta("1W"))) & (metadata["date"] >= pd.to_datetime(config["start_date"]))]
df = pd.merge(metadata, df, on="document")


In [4]:
from nltk.tokenize import RegexpTokenizer

docs = df["content"].dropna().astype(str).str.lower().to_list()
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(docs)):
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

In [5]:

# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# # Remove words that are only one character.
docs = [[token for token in doc if len(token) > 1] for doc in docs]

In [6]:
# Lemmatize the documents.
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

In [7]:
from nltk.corpus import stopwords
sw = stopwords.words('english') + ["forwarded", "am", "pm", "from", "cc", "bcc", "subject", "forward", "mailreply", "com", "org", "gmt", "mail"] + ["ect"]
docs = [[token for token in doc if token  not in sw] for doc in docs]

In [8]:
# Compute bigrams.
from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

In [9]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
# dictionary.filter_extremes(no_below=20, no_above=0.5)

In [10]:
# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))


Number of unique tokens: 12667
Number of documents: 3601


In [11]:
from gensim.models import LdaModel

# Set training parameters.
num_topics = 10
chunksize = 5000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make an index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,  
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)


In [12]:
top_topics = model.top_topics(corpus)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

Average topic coherence: -2.4252.
[([(0.011342138, 'enron'),
   (0.0063620745, 'wa'),
   (0.005222481, 'one'),
   (0.0047698887, 'would'),
   (0.00473559, 'market'),
   (0.0039872197, 'get'),
   (0.0038127557, 'ha'),
   (0.0035334574, 'power'),
   (0.003469485, 'year'),
   (0.003440364, 'game'),
   (0.0033879702, 'time'),
   (0.0033303588, 'said'),
   (0.0032698067, 'going'),
   (0.0032264045, 'california'),
   (0.0031395338, 'corp'),
   (0.0031207353, 'take'),
   (0.0030830982, 'price'),
   (0.0029907145, 'new'),
   (0.0029593937, 'go'),
   (0.0028403064, 'week')],
  -1.6865139274739742),
 ([(0.04180374, 'hou'),
   (0.030349214, 'enron'),
   (0.010210962, 'tana'),
   (0.008373747, 'know'),
   (0.008209231, 'tana_jones'),
   (0.008055073, 'jones'),
   (0.006586608, 'corp'),
   (0.0053297137, 'john'),
   (0.005285224, 'ee'),
   (0.005111249, 'wa'),
   (0.0050600986, 'mark'),
   (0.0049245115, 'would'),
   (0.0045485166, 'let'),
   (0.004533672, 'group'),
   (0.004404223, 'deal'),
   (0.

In [49]:
df["year"] = df["date"].dt.strftime("%Y-%m")
disp = df.groupby("year", as_index=False)['content'].count()
alt.Chart(disp).mark_bar(color="purple").encode(x="year:O", y="content").properties(width=900)

: 

In [28]:
df = df[df["date"].dt.year.isin(range(1997, 2005))]
df[["category", "date"]].to_csv("data/enron/metadata.csv", index=False)
df = df.set_index(["category", "date"])
df["content"] = (
    df["content"]
    .replace("‘", "'")
    .replace("’", "'")
    .replace("“", '"')
    .replace("”", '"')
)

In [32]:
pd.read_csv("results/enron/dvr.csv").head(20)

Unnamed: 0,element_code,element,global_weight
0,621888,ect,0.024199
1,660150,enron,0.018691
2,1309014,pm,0.010263
3,1598346,subject,0.008128
4,1307315,please,0.006271
5,747168,forwarded,0.005793
6,1651950,thanks,0.005455
7,1395143,re,0.004406
8,1066043,mark,0.004364
9,543905,dc,0.0038


In [29]:
dates = [x for x in pd.date_range(end=pd.datetime.today(), periods=1800)]
counts = [x for x in np.random.randint(0, 10000, size=1800)]
df = pd.DataFrame({'dates': dates, 'counts': counts}).set_index('dates')

In [13]:
l = []
for i, it in enumerate(df.iterrows()):
    if i <= 500000:
        continue
    index, row = it
    npc = TextBlob(str(row["content"])).np_counts
    if npc == {}:
        continue
    adf = pd.DataFrame.from_records(list(npc.items()), columns=["element", "frequency_in_category"])
    adf["category"] = index[0]
    l.append(adf)
concated = pd.concat(l)
concated.to_csv(f"data/enron/np_freq/{i}.csv")
l = []