# Imports

In [1]:
import json
import os
import bz2
import io
from bz2 import BZ2File
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from datetime import timedelta

import seaborn as sns
# Global plot styling for the notebook: 'paper' context, 'white' style and a
# 12x9 default figure size.
sns.set_context('paper')
sns.set_style("white")
# NOTE(review): seaborn documents sns.set() as an alias of set_theme(), whose
# default `context`/`style` arguments may override the two calls above —
# confirm which look is actually intended.
sns.set(rc = {'figure.figsize':(12,9)})

import warnings
# Suppress every warning category for the remainder of the session.
warnings.filterwarnings("ignore")
# Disable pandas' chained-assignment check (its default setting is 'warn'),
# which fires when a value is assigned on a slice of a DataFrame.
pd.options.mode.chained_assignment = None  # default='warn', Mutes warnings when copying a slice from a DataFrame.


# Load dataset

In [None]:
# Read the bz2-compressed JSON file and store the 'journal' column as a
# pandas categorical, then preview the first rows.
df_cleaned = (
    pd.read_json('cleaned.json.bz2', compression='bz2')
    .astype({'journal': 'category'})
)
df_cleaned.head()

In [None]:
# Remove the 'id' column from the frame.
df_cleaned = df_cleaned.drop(columns=['id'])

In [None]:
# Replace the index with consecutive integers 0 .. n-1.
df_cleaned.index = np.arange(df_cleaned.shape[0])

## Data cleaning


In [None]:
def keywords_count(df, keywords):
    """ Count, for every article, the occurrences of each keyword

    Matching is case-insensitive: both the keywords and the article text are
    lowercased before searching. Each lowercased keyword is handed to ``re``
    as a pattern and is NOT escaped, so regex metacharacters in a keyword
    keep their special meaning. Occurrences are counted the way
    ``re.findall`` counts them (non-overlapping matches).

    Inputs:
        df(pandas DataFrame): contains all articles in the 'text' column
        keywords(list<str>): list of keywords to search

    Output:
        counts(list<list<int>>): one inner list per row of df (in row order),
            holding one count per keyword (in keyword order)
    """

    # Nothing to scan: return early without touching the 'text' column.
    if len(df) == 0:
        return []

    # Lowercase and compile each keyword once instead of once per article.
    patterns = [re.compile(keyword.lower()) for keyword in keywords]

    counts = []
    for text in df['text']:
        # Lowercase the article once, not once per keyword.
        lowered = text.lower()
        counts.append([len(pattern.findall(lowered)) for pattern in patterns])

    return counts

In [None]:
# Keywords (French terms and proper nouns) searched for in each article's
# text; every entry also names a 'keyword_<entry>' column added further down.
nuclear_keywords = [
    'nucléaire',
    'centrale nucléaire',
    'lucens',
    'uranium',
    'fission',
    'atome',
    'Beznau',
    'Gösgen',
    'Leibstadt',
    'Mühleberg',
    'réacteurs',
    'déchets nucléaires',
    'accident nucléaire',
]

In [None]:
# Count every keyword in every article, then transpose so that the first axis
# runs over keywords and the second over articles.
counts_garbage = np.transpose(
    np.asarray(keywords_count(df_cleaned, nuclear_keywords))
)

In [None]:
# Store each keyword's per-article counts in its own 'keyword_<keyword>' column.
for i, keyword in enumerate(nuclear_keywords):
    df_cleaned[f'keyword_{keyword}'] = counts_garbage[i]

In [None]:
# Keep only the rows whose count is zero for every keyword, filtering one
# keyword column at a time.
# NOTE(review): this discards every article that mentions any of the nuclear
# keywords, so the counts recomputed afterwards on the remaining rows can only
# be zero — confirm that this is the intended cleaning step.
for keyword in nuclear_keywords:
    keyword_absent = df_cleaned['keyword_' + keyword] == 0
    df_cleaned = df_cleaned[keyword_absent]

In [None]:
# Number of rows left after the keyword filter.
len(df_cleaned)

In [None]:
# Recount the keywords on the filtered frame; after transposing there is one
# row of per-article counts for each keyword.
counts = np.transpose(np.asarray(keywords_count(df_cleaned, nuclear_keywords)))

In [None]:
# Overwrite the 'keyword_<keyword>' columns with the recomputed counts.
for i, keyword in enumerate(nuclear_keywords):
    df_cleaned['keyword_' + keyword] = counts[i]

In [None]:
# df_corpus is a second name for the same DataFrame object as df_cleaned
# (plain assignment, no copy is made); then show its number of rows.
df_corpus = df_cleaned
len(df_corpus)

# Export for iramuteq

In [None]:
# Preview the first rows; formate_text() below reads columns by position, so
# the column order shown here determines what ends up in the export.
df_corpus.head()

In [None]:
def formate_text(df):
    """ Build the corpus text with one header line and one body line per row

    Columns are read by POSITION, not by name. For every row of df the
    output contains:

        **** *<col 0> *<first 10 chars of col 1> *<col 2> *<col 5>
        <col 4>

    Every field is converted with str(). Column 1 is truncated to its first
    10 characters (presumably to keep only the YYYY-MM-DD part of a
    timestamp — verify against the data).

    Inputs:
        df(pandas DataFrame): articles, with at least 6 columns

    Output:
        text(str): concatenation of all entries, each line ending with a
            newline; the empty string if df has no rows
    """

    entries = []
    for _, row in df.iterrows():
        # Use .iloc for positional access: integer keys passed to [] on a
        # label-indexed Series rely on a deprecated positional fallback.
        header = '**** *{} *{} *{} *{}'.format(
            row.iloc[0],
            str(row.iloc[1])[:10],
            row.iloc[2],
            row.iloc[5],
        )
        entries.append(header + '\n' + str(row.iloc[4]) + '\n')

    # Join once at the end instead of growing a string inside the loop.
    return ''.join(entries)

In [None]:
def iramuteq_export(df, filename, encoding='utf-8'):
    """ Write the corpus text built by formate_text(df) to a file

    The file is opened in 'w' mode, so an existing file with the same name is
    overwritten.

    Inputs:
        df(pandas DataFrame): articles to export (see formate_text)
        filename(str): path of the text file to write
        encoding(str): text encoding of the output file. Defaults to UTF-8 so
            that accented characters are written the same way on every
            platform instead of depending on the locale's default encoding.

    Output:
        True once the file has been written; errors from opening or writing
        the file propagate as exceptions
    """

    text = formate_text(df)
    # The context manager closes the file even if write() raises.
    with open(filename, 'w', encoding=encoding) as file:
        file.write(text)

    return True

In [None]:
# Write the whole corpus to 'corpus.txt' in the current working directory.
iramuteq_export(df_corpus, 'corpus.txt')