In [3]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Birgit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
articles = pd.read_json("data/articles.json.xz")
google_qa = pd.read_json("data/google_qa.json.xz")
jokes = pd.read_json("data/jokes.json.xz")

In [7]:
ajalehed = ["Washington Post", "The New York Times", "Reuters"]

### How many times are newspaper names mentioned in articles?

In [15]:
articles.head()

Unnamed: 0,text
0,The New York Times reported on Thursday that T...
1,"Entitled “Passions”, the autobiography documen..."
2,A so-called tiered deposit rate would mean ban...
3,Air Force Brigade General Miguel Sisco Mora wa...
4,Much of Hudson’s Bay’s value is locked up in i...


In [11]:
ajaleht_mainitud = 0

for i, row in articles.iterrows():
    #print(row['text'])
    #break
    
    text = row['text']
    
    for ajaleht in ajalehed:
        if ajaleht in text:
            ajaleht_mainitud += 1
    

In [12]:
ajaleht_mainitud

91021

In [16]:
ajaleht_mainitud / articles.shape[0]

0.08563885488692163

The next three parts are about finding the most popular words in articles, Google Q&As and jokes. This is a quick way by splitting sentences by spaces. This is obviously not ideal as punctuations will not be separated, but it's not meant to be ideal. Actual lemmatizing takes a long time and this is only to get a quick idea, as the models showed weird behaviour which led us to believe there's more preprocessing needed to be done.

### Most popular words in articles

In [19]:
words_article = []

for i, row in articles.iterrows():
    text = row['text']
    words = text.split(" ")
    words_article.extend([word.lower() for word in words])

In [20]:
from collections import Counter

In [26]:
stopwords = stopwords.words('english')

In [27]:
words_article_withoutstopwords = [word for word in words_article if word not in stopwords]

In [31]:
counter_article = Counter(words_article_withoutstopwords).most_common()[:100]

In [32]:
print(counter_article[:10])

[('said', 643277), ('would', 254174), ('new', 246051), ('', 227322), ('u.s.', 208649), ('last', 200556), ('percent', 199796), ('said.', 196089), ('also', 188543), ('one', 173930)]


In [30]:
len(words_article)

105846968

### Most popular words in Google Q&A-s

In [33]:
words_google_qa = []

for i, row in google_qa.iterrows():
    text = row['text']
    words = text.split(" ")
    words_google_qa.extend([word.lower() for word in words])

In [34]:
len(words_google_qa)

56874389

In [35]:
words_google_qa_withoutstopwords = [word for word in words_google_qa if word not in stopwords]

In [36]:
counter_google_qa = Counter(words_google_qa_withoutstopwords).most_common()[:100]

In [44]:
print(counter_google_qa[:10])

[('.', 1358537), ('-', 1039596), (',', 522924), ('/', 281690), ('°', 195374), ('\ufeff', 162223), ('♠', 139637), ('0000', 116224), (':', 114611), ('new', 112951)]


### Most popular words in jokes

In [37]:
words_jokes = []

for i, row in jokes.iterrows():
    text = row['text']
    words = text.split(" ")
    words_jokes.extend([word.lower() for word in words])

In [38]:
len(words_jokes)

34729322

In [40]:
words_jokes_withoutstopwords = [word for word in words_jokes if word not in stopwords]

In [41]:
counter_jokes = Counter(words_jokes_withoutstopwords).most_common()[:100]

In [42]:
print(counter_jokes[:10])

[('', 388064), ('man', 158329), ('one', 127041), ('says', 94922), ('get', 94822), ('like', 87498), ('said', 71113), ('got', 61645), ('says,', 58998), ('back', 58791)]


### Lemmatizing and removing stop-words

In [17]:
datas = {}
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
stop_words.extend(["...", "'s", "wo", "n't", "'m", "ca", "'ll", "'re", "'ve", "'d"])

all_lem_data = []
for name, data, n in zip(['articles', 'google_qa', 'jokes'], [articles, google_qa, jokes], [50000, 50000, 100000]):
    print(name, end="\n\n\n")
    lemmatized_data = []
    for i, row in data.iterrows():
        if i == n:
            break
        if i % 10000 == 0:
            print(i)
        text = row['text']
        words = word_tokenize(text)
        lemmas = []

        for word in words:
            lemma = lemmatizer.lemmatize(word)
            if len(lemma) > 1 and not lemma in stop_words:
                lemmas.append(lemma.lower())
        lemmatized_row = {'id': f"{name}_{i}",'text': " ".join(lemmas), 'joke': name == 'jokes'}
        lemmatized_data.append(lemmatized_row)
        all_lem_data.append(lemmatized_row)
    datas[name] = pd.DataFrame(lemmatized_data)


articles


0
10000
20000
30000
40000
google_qa


0
10000
20000
30000
40000
jokes


0
10000
20000
30000
40000
50000
60000
70000
80000
90000


In [18]:
for key in datas:
    data = datas[key]
    data.to_json(path_or_buf=f"{key}_lemmas.json")

In [19]:
all_lem_df = pd.DataFrame(all_lem_data)
all_lem_df.to_json(path_or_buf="all_lemmas.json")
