# Setup

In [None]:
import pandas as pd

from os import listdir
from os.path import isfile, join

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import string
import spacy
import re

# INSTALL FIRST (noted again when needed below):
# nltk.download('stopwords')
# python -m spacy download en_core_web_sm


In [None]:
# Raw data in, out data goes to main data folder.
# Both paths are relative and both end with a trailing '/'.
DATA_PATH = '../data/raw/'
OUTPUT_PATH = '../data/'

In [None]:
# Check files in data folder: keep regular files only (sub-directories are skipped).
# os.listdir returns entries in arbitrary order, so the list is sorted to keep
# the printed indices -- and any datafiles[i] lookup -- stable between runs.
datafiles = sorted(f for f in listdir(DATA_PATH) if isfile(join(DATA_PATH, f)))

print('Index, Filename')
print(list(enumerate(datafiles)))

In [None]:
# Load df
# Choose the input file by its position in `datafiles`.
load_file = datafiles[0]

# The file is read as line-delimited JSON (lines=True), one record per line.
# join() builds a valid path whether or not DATA_PATH ends with a separator.
df = pd.read_json(join(DATA_PATH, load_file), convert_dates=True, lines=True, orient='records')

In [None]:
# Preview the first three rows of the loaded frame.
df.head(3)

In [None]:
# Number of rows loaded.
len(df)

In [None]:
# Column names available in the loaded frame.
df.columns

# Processing

In [None]:
# INSTALL FIRST: python -m spacy download en_core_web_sm
# Load spaCy's small English model; `nlp` is called by lemmatization().
# For speed, pipeline components can be switched off at load time, e.g.
# spacy.load("en_core_web_sm", disable=['parser', 'ner']).
nlp = spacy.load("en_core_web_sm") # disable=['parser', 'ner']) ## if you need efficiency


In [None]:
# Name of the DataFrame column whose text is cleaned and tokenized.
action_col = 'summary'

### Initial cleaning

In [None]:
def cleaning(text):
    """Normalise the case and whitespace of a text string.

    The text is lower-cased, newlines and tabs are turned into spaces,
    carriage returns are removed, runs of two or more whitespace characters
    are collapsed into a single space and trailing whitespace is stripped.
    Leading whitespace is collapsed but not removed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    cleaned_text = text.lower()
    cleaned_text = cleaned_text.replace("\n", " ")
    cleaned_text = cleaned_text.replace("\t", " ")
    cleaned_text = cleaned_text.replace('\r', '')

    # Raw string: in a normal literal '\S' is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    # The class matches any whitespace character other than CR/LF.
    cleaned_text = re.sub(r'[^\S\r\n]{2,}', ' ', cleaned_text)
    return cleaned_text.rstrip()

In [None]:
# Run the initial cleaning over every non-null entry of the selected column
# and store the result in a new 'cleaning' column.
df['cleaning'] = df[action_col].dropna().apply(cleaning)
df['cleaning']

### Remove punctuation

In [None]:
# Create punctuation list

# Extra characters to strip in addition to string.punctuation.
# NOTE(review): '一' looks like the CJK character for "one" rather than a dash --
# confirm it is meant to be removed. ',', ':' and ']' are already part of
# string.punctuation, and '“' appears twice; the duplicates are harmless.
special_punctuation = '：，,《。》“„:一・«»”“]'

# Every character in this string is dropped by remove_punctuation().
final_punctuation = string.punctuation + special_punctuation
final_punctuation

In [None]:
def remove_punctuation(txt):
    """Return *txt* without any character found in ``final_punctuation``."""
    kept_chars = (char for char in txt if char not in final_punctuation)
    return ''.join(kept_chars)

In [None]:
# Strip punctuation from every non-null cleaned text, overwriting the column.
df['cleaning'] = df['cleaning'].dropna().apply(remove_punctuation)
df['cleaning']

### Tokenize, lemmatize, drop POSs

In [None]:
# Words whose lemma is kept even when their part-of-speech tag is not in the
# allowed list (checked against the token text in lemmatization()).
to_keep = ['disinformation'] #deepfake

In [None]:
# Lemmatization function

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize text with spaCy, keeping only selected parts of speech.

    See https://spacy.io/api/annotation for the part-of-speech tag set.

    Args:
        texts: The text to process. A string is used as-is; any other
            iterable of strings is joined with single spaces so that its
            items are not glued together into one word.
        allowed_postags: Part-of-speech tags whose tokens are kept. The
            default is a tuple so that it is not a shared mutable object.

    Returns:
        A list with the lemma of every token whose tag is in
        ``allowed_postags`` or whose text is listed in ``to_keep``.
    """
    text = texts if isinstance(texts, str) else " ".join(texts)
    doc = nlp(text)
    return [
        token.lemma_
        for token in doc
        if token.pos_ in allowed_postags or token.text in to_keep
    ]

In [None]:
# Lemmatize every non-null cleaned text; Series.apply forwards the keyword
# argument to lemmatization().
df['lemma_text'] = df['cleaning'].dropna().apply(
    lemmatization,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)


In [None]:
# Inspect the lemmatized output column.
df['lemma_text']

In [None]:
# Not needed unless lemmatization is skipped
#df['tokens'] = df.dropna().apply(lambda row: nltk.word_tokenize(row['tokens']), axis=1)

### Remove stopwords

In [None]:
# Create stopword list

# RUN
# nltk.download('stopwords')

lang = 'english'

# Extra terms to remove on top of the NLTK list, e.g. for some analyses any
# search tags would be dropped from the text.
new_stop_words = []

# NLTK stopwords for the chosen language.
stop_words = list(stopwords.words(lang))

final_stop_words = [*stop_words, *new_stop_words]

In [None]:
def remove_stopwords(txt):
    """Return the items of *txt* that are not in ``final_stop_words``, as a list.

    To get a single string instead, join the returned list with spaces.
    """
    kept_words = []
    for word in txt:
        if word in final_stop_words:
            continue
        kept_words.append(word)
    return kept_words

In [None]:
# Filter stopwords out of every non-null lemma list and store the final tokens.
df['tokens'] = df['lemma_text'].dropna().apply(remove_stopwords)
df['tokens']

In [None]:
# Optional functions

def remove_numbers(txt):
    """Return *txt* with every digit character (per ``str.isdigit``) removed."""
    non_digits = []
    for char in txt:
        if char.isdigit():
            continue
        non_digits.append(char)
    return ''.join(non_digits)

def get_numbers(txt):
    """Count the runs of consecutive digits found in *txt*."""
    return sum(1 for _ in re.finditer(r'\d+', txt))

def pos_count(pos, txt):
    """Count the items of *txt* that end with the suffix *pos*.

    To get the matching items themselves rather than a count, collect
    ``token.split('/')[0]`` for each item that ends with *pos*.
    """
    return sum(1 for token in txt if token.endswith(pos))


# Get Tags

In [None]:
def get_tag(x):
    """Return the ``'term'`` value of the first tag in *x*.

    Args:
        x: A sequence of tag mappings, each with a ``'term'`` key.

    Returns:
        The first tag's term, or ``None`` when *x* is empty. An empty list
        is not removed by ``dropna()``, so it would otherwise raise
        ``IndexError``.
    """
    if not x:
        return None
    # TO DO - scrape https://arxiv.org/category_taxonomy to translate codes to plain english
    return x[0]['term']

In [None]:
# Use the first tag of each non-null 'tags' entry as the row's category.
df['category'] = df['tags'].dropna().apply(get_tag)


# Get time periods

In [None]:
def get_period(x, period):
    """Return the first *period* items of *x*.

    A one-item result is unwrapped to the item itself; any other length is
    returned as the slice. For comma-separated strings, ``x.split(',')``
    can be indexed instead of slicing.
    """
    head = x[:period]
    return head[0] if len(head) == 1 else head

In [None]:
# Derive the period columns from the first one / first two items of each
# non-null 'published_parsed' entry.
df['year'] = df['published_parsed'].dropna().apply(get_period, period=1)
df['month_year'] = df['published_parsed'].dropna().apply(get_period, period=2)



# Check output

In [None]:
# Preview the first two rows after processing.
df.head(2)

In [None]:
# Columns present after processing.
df.columns

In [None]:
# Remove the intermediate lemma column before export.
df.drop(columns='lemma_text', inplace=True)
# The 'cleaning' column can be dropped the same way if it is not needed:
#df.drop(columns='cleaning', inplace=True)

In [None]:
# Output name is the input filename without its final extension.
# rsplit with maxsplit=1 keeps dots inside the name ('a.b.json' -> 'a.b');
# split('.')[0] would truncate it to 'a'.
out_file = load_file.rsplit('.', 1)[0]

In [None]:
# Show the base name that will be used for the output file.
out_file

In [None]:
# Write the processed frame as CSV without the index column.
# join() builds a valid path whether or not OUTPUT_PATH ends with a separator.
df.to_csv(join(OUTPUT_PATH, out_file + '.csv'), index=False)