# Import Libraries, Import Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
import re

# Load the 'en_core_web_lg' spaCy pipeline once; spacy_clean() reuses it as `nlp`
nlp = spacy.load('en_core_web_lg')

# let pandas show up to 500 characters per cell before truncating
pd.set_option('display.max_colwidth', 500)

In [8]:
# Read the raw posts into the `tales` DataFrame
tales = pd.read_csv(filepath_or_buffer='../data/tales.csv')

---
# Clean Data

In [9]:
## columns that are not needed for the text analysis ##
drop_cols = ['created_utc', 'author', 'score', 'is_self']

# Build the cleaned frame as one chain and rebind `tales`, so the cell can be
# re-run without raising (the in-place column drop failed on a second run).
tales = (
    tales
    ## drop missing values in the text columns ##
    # Only 'title' and 'selftext' are required as strings further down;
    # a bare dropna() also discarded rows for NaNs in unrelated columns.
    .dropna(subset=['title', 'selftext'])
    ## drop columns ##
    # errors='ignore' keeps the step a no-op when the columns are already gone
    .drop(columns=drop_cols, errors='ignore')
    ## drop ['deleted'] from 'selftext' ##
    .loc[lambda df: df['selftext'] != '[deleted]']
    ## convert timestamp to datetime ##
    .assign(timestamp=lambda df: pd.to_datetime(df['timestamp']))
)

---
# Functions

In [4]:
# number of whitespace-delimited words in a document
def word_count(string):
    """Count the pieces obtained by splitting ``string`` on whitespace."""
    return len(string.split())

# average word length in doc
def avg_word_length(string):
    """Return the mean length of the whitespace-separated words in ``string``.

    Parameters
    ----------
    string : str
        Text to measure.

    Returns
    -------
    float
        Total characters across all words divided by the number of words,
        or 0.0 when ``string`` contains no words (empty or whitespace-only).
    """
    words = string.split()
    # A document with no words would otherwise divide by zero.
    if not words:
        return 0.0
    return sum(len(word) for word in words) / len(words)

# lemmatize with spaCy, keeping only alphabetic lemmas that are not stopwords
stopwords = spacy.lang.en.stop_words.STOP_WORDS
def spacy_clean(string):
    """Return the space-joined lemmas of ``string`` after filtering.

    The text is run through the module-level ``nlp`` pipeline; a lemma is
    kept only if it is purely alphabetic and not a member of ``stopwords``.
    """
    kept = []
    for token in nlp(string):
        lemma = token.lemma_
        if lemma.isalpha() and lemma not in stopwords:
            kept.append(lemma)
    return ' '.join(kept)

---
# Text Processing

In [10]:
## TITLE_TEXT: minimally processed text data ##

# lowercase the post body
tales['title_text'] = tales['selftext'].str.lower()

# replace youtube links with 'youtube' and collapse runs of whitespace
tales['title_text'] = tales['title_text'].apply(
    lambda s: ' '.join(re.sub(r'\S+youtube\S+', 'youtube', s).split())
)

# remove links and references to subreddit name in text
# Raw strings are required: '\S' inside a plain literal is an invalid escape
# sequence. The three substitutions stay sequential on purpose; merging them
# into one alternation would change which spans get removed.
tales['title_text'] = (
    tales['title_text']
    .replace(r'http\S+', '', regex=True)
    .replace(r'www\S+', '', regex=True)
    .replace(r'tales\S+', '', regex=True)
)

# combine title with text (the title is prepended as-is, not lowercased)
tales['title_text'] = tales['title'] + " \n" + tales['title_text']

## TITLE_TEXT_LEMMA: lemmatized text data ##
# spacy_clean lemmatizes and keeps only alphabetic, non-stopword lemmas
tales['title_text_lemma'] = tales['title_text'].apply(spacy_clean)

---
# Features

In [11]:
# Length features for every version of the text.
# Each entry is (source column, suffix used in the new column names):
#   selftext         -> raw text data
#   title_text       -> minimally processed text data
#   title_text_lemma -> lemmatized text data
feature_sources = [
    ('selftext', 'selftext'),
    ('title_text', 'title_text'),
    ('title_text_lemma', 'lemma'),
]

for source_col, suffix in feature_sources:
    text = tales[source_col]

    # document length, character count
    tales['char_length_' + suffix] = text.str.len()

    # document word count
    tales['word_count_' + suffix] = text.apply(word_count)

    # average word length in the document
    tales['avg_word_length_' + suffix] = text.apply(avg_word_length)

---
# Save data file

In [12]:
# Write the cleaned frame with its engineered features; the index is not saved
tales.to_csv(path_or_buf='../data/tales_clean.csv', index=False)