#Data Cleaning: Comment Level

##What this bit-o-code does:
Take the .json comment (story) data scraped from /r/WritingPrompts via Perl, clean out the absurd data errors, and output a usable .csv for analysis.

Note: this file is a Python version (translation?) of the data cleaning process originally implemented in R. The R version of this file can be found here.

In [109]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import re
import os
import codecs
import os.path
import seaborn as sns
import textstat as textstat
from curses.ascii import isdigit
from sklearn import feature_extraction
import statsmodels.formula.api as smf  #because ols
import statsmodels.api as sm

In [110]:
# IPython magic: pulls numpy and matplotlib into the interactive namespace
# (see the "Populating the interactive namespace" message it prints).
%pylab inline
# Silence pandas' chained-assignment warning for the recodes further down.
# NOTE(review): with the warning off, chained writes that fail to reach the
# original frame go unnoticed -- verify results of any `df[col].iloc[i] = ...`.
pd.options.mode.chained_assignment = None  # default='warn'
                                           # because recodes

Populating the interactive namespace from numpy and matplotlib


In [111]:
# Where is my data?
# NOTE: _pd indicates the python version of these files
# Every location hangs off the project root. The root and the two input
# folders keep their trailing slash because later code appends file names to
# them with plain string concatenation.
dir_project = '/home/marvin/Desktop/Insight/WritingPrompts/'

# Input folders
dir_in_prompt = os.path.join(dir_project, 'PromptCSV/')
dir_in_story = os.path.join(dir_project, 'ResponseCSV/')

# Prompt-level tables (raw / edited)
data_prompt_main = os.path.join(dir_project, 'allPrompt_pd.csv')
data_prompt_clean = os.path.join(dir_project, 'allPrompt_edited_pd.csv')

# Story-level (comment) tables (raw / edited)
data_story_main = os.path.join(dir_project, 'commentFile_reduce_pd.csv')
data_story_clean = os.path.join(dir_project, 'commentFile_reduce_edited_pd.csv')

# Echo the combined-story path as the cell output
data_story_main

'/home/marvin/Desktop/Insight/WritingPrompts/commentFile_reduce_pd.csv'

## For github, select one month to work on
(Otherwise time / computation constraints kick in)

In [112]:
# Load the edited prompt-level table (data_prompt_clean); the one-month
# sample of prompts used for the comment import is selected from it below.
allPrompt   = pd.read_csv(data_prompt_clean)

In [113]:
# Display the dtype pandas inferred for each prompt column
allPrompt.dtypes

Unnamed: 0               int64
p_selftext              object
p_id                    object
p_gilded               float64
p_archived              object
p_author                object
p_score                float64
p_over_18               object
p_edited                object
p_is_self               object
p_name                  object
p_url                   object
p_author_flair_text     object
p_title                 object
p_created_utc          float64
p_ups                  float64
p_num_comments         float64
p_genre                 object
p_editRec                int64
p_editTime             float64
p_isAmod                 int64
p_distinguished        float64
p_year                   int64
p_month                  int64
p_day                    int64
p_wDay                   int64
p_prompt_length          int64
p_all_length             int64
dtype: object

In [114]:
# Restrict the prompts to a single month: p_year == 2014 and p_month == 7.
in_sample_year = allPrompt['p_year'] == 2014
in_sample_month = allPrompt['p_month'] == 7
sampleSet = allPrompt[in_sample_year & in_sample_month]

In [115]:
# Display (rows, columns) of the one-month prompt sample
sampleSet.shape

(7097, 28)

## Another Import Loop!
Stories-in-prompts were pulled by prompt, excluding any prompt that had no story-level responses. This resulted in approximately 70k files. 

To combine:
Loop over these files to create a single large dataset for analysis. Because this is time-consuming, save the output to a file. When re-running, check whether this file already exists to save some time.

In [116]:
# check to make sure that the files exist
#  will want to skip comment files that were not pulled
#   --- for example: threads that were deleted (null entries)
#
# The expected story file for a prompt is dir_in_story + p_id + '.txt'.
# The flag is assigned as one whole column instead of row-by-row through
# `sampleSet['p_exists'].iloc[i] = ...`: that chained write is not guaranteed
# to reach sampleSet (and the warning about it is switched off above).
# astype(str) keeps a missing p_id from raising during the concatenation; such
# a row simply resolves to a path that does not exist.
story_paths = dir_in_story + sampleSet['p_id'].astype(str) + '.txt'
sampleSet['p_exists'] = [os.path.isfile(path) for path in story_paths]

In [117]:
# Record the story file name expected for each prompt (p_id + '.txt')
sampleSet['file'] = sampleSet['p_id'] + '.txt'
# Tally how many sampled prompts do / do not have a story file on disk
sampleSet.groupby('p_exists').size()

p_exists
False    1903
True     5194
dtype: int64

In [118]:
# Import loop!
# Combine the per-prompt story files into one DataFrame and cache it in
# data_story_main, so later runs can skip the slow per-file reads.
forceImport = 1  # 0 -> always rebuild; otherwise rebuild only if the cache file is missing
if (forceImport == 0) or not os.path.isfile(data_story_main):

    # Read only prompts whose story file exists on disk. This also covers the
    # first prompt, which used to be read unconditionally before the loop.
    pulled_ids = sampleSet.loc[sampleSet['p_exists'].astype(bool), 'p_id']
    story_frames = [pd.read_csv(dir_in_story + prompt_id + '.txt')
                    for prompt_id in pulled_ids]
    if not story_frames:
        raise ValueError('No story files found under ' + dir_in_story)

    # One concat instead of DataFrame.append inside the loop: append re-copied
    # the accumulated frame on every iteration and no longer exists in
    # pandas >= 2.0. Row labels are kept as-is, exactly as append kept them.
    commentFile = pd.concat(story_frames)
    commentFile.to_csv(data_story_main)


# Import the combined csv file
commentFile = pd.read_csv(data_story_main)
commentFile.shape

(35085, 26)

## Talking Body
Clean the stories (stored in 'body'), and then calculate some features

In [119]:
# Strip unwanted fragments from the story text, one pattern at a time and in
# this order. Each pattern is a regular expression:
#   \u201c / \u201d       - the two curly double-quote code points
#   \u2019d / \u2019t / \u2019s - code point 2019 together with the letter that
#                           follows it, so the whole suffix is removed
#   \[ / \]               - literal square brackets
# NOTE(review): on Python 3 the regex escape \uXXXX matches the character
# itself; confirm that is the form the scraped data actually contains.
# regex=True is passed explicitly because pandas >= 2.0 defaults to literal
# matching, under which the bracket patterns would no longer match '[' / ']'.
body_junk_patterns = [
    '\\u201c',
    '\\u201d',
    '\\u2019d',
    '\\u2019t',
    '\\u2019s',
    '\\[',
    '\\]',
]
for junk_pattern in body_junk_patterns:
    commentFile['body'] = commentFile['body'].str.replace(junk_pattern, '', regex=True)

In [120]:
# syllable counter helper dictionary
# Loaded from NLTK's cmudict corpus; the feature loop looks up each
# lower-cased story token in it via d.get(...).
# NOTE(review): presumably maps word -> list of pronunciations -- confirm
# against the NLTK cmudict documentation.
d = nltk.corpus.cmudict.dict()

In [137]:
# Per-story text features, computed once for every row of commentFile:
#   sentence_count  - sentences found by the punkt sentence tokenizer
#   word_count      - tokens returned by nltk's word tokenizer
#   syllable_count  - syllables of the tokens found in the CMU dictionary `d`
#   exclaimMark     - number of '!' characters
#   questionMark    - number of '?' characters
#   i_statement     - occurrences of ' I '
#   you_statement   - occurrences of ' you ' plus ' You '
#   uc_and / lc_and - occurrences of ' And ' / ' and '
#   paragraph_count - number of newline characters
#
# Fixes relative to the previous row loop:
#   * it iterated over sampleSet.shape[0] (the number of prompts), so only
#     that many comments were processed;
#   * every feature from syllable_count onwards was assigned without a row
#     index, overwriting the whole column with the last story's value;
#   * "syllables" were counted as the number of dictionary pronunciations;
#   * a missing body raised on .strip().
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

feature_columns = ['sentence_count', 'word_count', 'syllable_count',
                   'exclaimMark', 'questionMark', 'i_statement',
                   'you_statement', 'uc_and', 'lc_and', 'paragraph_count']


def _count_syllables(tokens):
    """Return the summed syllable count of `tokens` using the CMU dictionary.

    Tokens missing from the dictionary (punctuation, unknown words)
    contribute 0. When a word has several pronunciations the first is used.
    """
    total = 0
    for token in tokens:
        pronunciations = d.get(token.lower())
        if not pronunciations:
            continue
        # CMUdict marks each vowel phoneme with a trailing stress digit
        # (e.g. 'AH0'), so counting those phonemes counts syllables.
        total += sum(1 for phoneme in pronunciations[0] if phoneme[-1].isdigit())
    return total


def _story_features(body):
    """Return a dict of the text features listed above for one story body."""
    # A missing body is treated as empty text, giving all-zero counts.
    text = '' if pd.isnull(body) else body
    tokens = nltk.tokenize.word_tokenize(text)
    return {
        'sentence_count': len(sent_detector.tokenize(text.strip())),
        'word_count': len(tokens),
        'syllable_count': _count_syllables(tokens),
        'exclaimMark': text.count('!'),
        'questionMark': text.count('?'),
        'i_statement': text.count(' I '),
        'you_statement': text.count(' you ') + text.count(' You '),
        'uc_and': text.count(' And '),
        'lc_and': text.count(' and '),
        'paragraph_count': text.count('\n'),
    }


# Build all features in one pass, aligned on commentFile's own index, then
# attach them column by column in the documented order.
story_features = pd.DataFrame(
    [_story_features(body) for body in commentFile['body']],
    columns=feature_columns,
    index=commentFile.index,
)
for feature_name in feature_columns:
    commentFile[feature_name] = story_features[feature_name]
    

In [122]:
# convert UC into a percentage
# Share of "and" occurrences that are capitalised: uc_and / (uc_and + lc_and).
# A lower-case count of 0 is recoded to 1 first, as before, so the denominator
# is never 0. The recode is now written explicitly to commentFile['lc_and'];
# previously it reached commentFile only through in-place mutation of a
# column view, and the recoded series was also stored in allPrompt, a
# different table whose rows do not line up with the comments.
commentFile['lc_and'] = commentFile['lc_and'].replace(0, 1)
commentFile['perStartAnd'] = commentFile['uc_and'] / (commentFile['uc_and'] + commentFile['lc_and'])

In [123]:
# convert questions, etc into percentages (rough)
# Each ratio column is the matching raw count divided by the story's
# sentence count.
ratio_from_count = [
    ('perQ', 'questionMark'),
    ('perE', 'exclaimMark'),
    ('perI', 'i_statement'),
    ('perU', 'you_statement'),
]
sentences_per_story = commentFile['sentence_count']
for ratio_column, count_column in ratio_from_count:
    commentFile[ratio_column] = commentFile[count_column] / sentences_per_story

In [124]:
# recode toplevel
# The first two characters of parent_id are mapped 't1' -> 0 and 't3' -> 1;
# any other prefix is left unchanged.
# NOTE(review): presumably Reddit type prefixes (t1 = comment, t3 = post), so
# 1 marks a direct reply to the prompt -- confirm.
recoder = {'t1': 0,
           't3': 1}
# Assign the recoded series back instead of calling replace(inplace=True) on
# commentFile['topLevel']: that chained in-place call may act on a temporary
# object and leave the frame unchanged.
commentFile['topLevel'] = commentFile['parent_id'].str[0:2].replace(recoder)


In [132]:
# grade level
# 0.39 * (words per sentence) + 11.80 * (syllables per word) - 15.59
words_per_sentence = commentFile['word_count'] / commentFile['sentence_count']
syllables_per_word = commentFile['syllable_count'] / commentFile['word_count']
commentFile['fleschKincaid'] = 0.39 * words_per_sentence + 11.80 * syllables_per_word - 15.59