In [1]:
import os
import pandas as pd

In [2]:
def load_data(path):
    full_path = os.path.join(os.path.realpath('..'), path)
    df = pd.read_csv(full_path, header=0, index_col=0)
    print("Dataset has {} rows, {} columns.".format(*df.shape))
    return df

In [28]:
df_train = load_data('data/raw/train.csv')
df_test = load_data('data/raw/test.csv')

Dataset has 159571 rows, 7 columns.
Dataset has 153164 rows, 1 columns.


## Data cleaning

In [29]:
# fill NaN with string "unknown"
df_train.fillna('unknown',inplace=True)
df_test.fillna('unknown',inplace=True)

## Create features

In [36]:
def create_features(df):
    "Create features as seen in EDA"
    print("Dataframe as {} rows and {} columns.".format(*df.shape))
    # Uppercase count
    df['processed'] = df['comment_text'].str.split()
    print("Counting uppercases...")
    df['uppercase_count'] = df['processed'].apply(lambda x: sum(1 for t in x if t.isupper() and len(t)>2))
    print("Dataframe as {} rows and {} columns.".format(*df.shape))
    
    # Bad words
    print("Counting bad words...")
    path = 'data/external/badwords.txt'
    bad_words = []
    f = open(os.path.join(os.path.realpath('..'), path), mode='rt', encoding='utf-8')
    for line in f:
        words = line.split(', ')
        for word in words:
            word = word.replace('\n', '')
            bad_words.append(word)
    f.close()

    df['bad_words'] = df['processed'].apply(lambda x: sum(1 for t in x if t in bad_words))
    print("Dataframe as {} rows and {} columns.".format(*df.shape))
    
    # Count of typos
    from enchant.checker import SpellChecker

    def typo_count(corpus):
        "Count the number of errors found by pyenchant"
        count = []
        for row in corpus:
            chkr = SpellChecker("en_US")
            chkr.set_text(row)
            i = 0
            for err in chkr:
                i += 1
            count.append(i)
        return count
    
    print("Counting typos...")
    df['typos'] = typo_count(df.comment_text)
    print("Dataframe as {} rows and {} columns.".format(*df.shape))
    
    # Doc length
    print("Counting length of each comment...")
    df['length'] = [len(t) for t in df['processed']]
    print("Dataframe as {} rows and {} columns.".format(*df.shape))
    
    # Drop processed (helper column)
    df = df.drop(['processed'], axis=1)
    print("Dataframe as {} rows and {} columns.".format(*df.shape))
    return df

In [35]:
df_train = create_features(df_train)
df_test = create_features(df_test)

## Spell check - TBC

In [8]:
import enchant
from enchant.checker import SpellChecker

In [22]:
from enchant.checker import SpellChecker

def spellcheck(corpus):
    "Spellcheck using pyenchant"
    for row in corpus:
        chkr = SpellChecker("en_US")
        chkr.set_text(row)
        for err in chkr:
            sug = err.suggest()[0]
            err.replace(sug)
            print(err.word, sug)
        row = chkr.get_text()
    return corpus

In [24]:
spellcheck(df_train.comment_text[:5])

username user name
Metallica Metallic
GAs Gas
FAC AC
D'aww D'art
colour color
UTC CUT
ie IE
eg g
Wikipedia Pediatric


id
0000997932d777bf    Explanation\nWhy the edits made under my usern...
000103f0d9cfb60f    D'aww! He matches this background colour I'm s...
000113f07ec002fd    Hey man, I'm really not trying to edit war. It...
0001b41b1c6bb37e    "\nMore\nI can't make any real suggestions on ...
0001d958c54c6e35    You, sir, are my hero. Any chance you remember...
Name: comment_text, dtype: object

## Output

In [13]:
# save list to file
def save_list(lines, filename):
    # convert lines to a single blob of text data = '\n'.join(lines)
    data = '\n'.join(lines)
    # open file
    file = open(filename, 'w')
    # write text
    file.write(data)
    # close file
    file.close()

In [40]:
def save_df(df, path):
    full_path = os.path.join(os.path.realpath('..'), path)
    df.to_csv(full_path, header=True, index=True)
    print('Dataframe ({}, {}) saved as csv.'.format(*df.shape))

In [41]:
save_df(df_train, 'data/processed/train.csv')
save_df(df_test, 'data/processed/test.csv')

Dataframe (159571, 11) saved as csv.
Dataframe (153164, 5) saved as csv.
