In [2]:
import numpy as np
import pandas as pd
import string
import re

#sklearn imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

#scipy imports
from scipy.sparse import hstack

#Visualization imports
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec 
import bokeh
#! pip install bokeh

# target classes
target_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']



In [3]:
# read frames localy through csv
train_df = pd.read_csv("../data/train.csv")
test_df = pd.read_csv("../data/test.csv")

# Random index generator for splitting training data
# Note: Each rerun of cell will create new splits.
randIndexCut = np.random.rand(len(train_df)) < 0.7

#S plit up data
test_data = test_df["comment_text"]
dev_data, dev_labels = train_df[~randIndexCut]["comment_text"], train_df[~randIndexCut][target_names]
train_data, train_labels = train_df[randIndexCut]["comment_text"], train_df[randIndexCut][target_names]


print('total training observations:', train_df.shape[0])
print('training data shape:', train_data.shape)
print('training label shape:', train_labels.shape)
print('dev label shape:', dev_labels.shape)
print ('labels names:', target_names)

total training observations: 159571
training data shape: (112098,)
training label shape: (112098, 6)
dev label shape: (47473, 6)
labels names: ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


In [4]:
from nltk.corpus import brown

word_corpus = '../data/words.txt'
word_file = open(word_corpus, 'rt')
large_word_corpus = word_file.read()
word_file.close
large_word_corpus = large_word_corpus.split()
large_word_corpus = [ word.lower() for word in large_word_corpus]
large_word_corpus = set(large_word_corpus)

good_words_list = brown.words()
good_word_set = set([word.lower() for word in good_words_list])
#punctuation = re.sub("[\'\-]",'',string.punctuation)
punctuation = "[\!\?\"\#\$\%\&\(\)\*\+\,\.\/\:\;\<\=\>\?\@\[\]\^\_\`\{\|\}\~\']"
print('Size of corpus ' + str(len(large_word_corpus)))

Size of corpus 462986


In [5]:
# from http://norvig.com/spell-correct.html
import re
from collections import Counter

def words(text): return re.findall(r'\w+', text.lower())

WORDS = Counter(words(open('../data/big.txt').read()))

def norvig_P(word, N=sum(WORDS.values())): 
    "Probability of `word`."
    return WORDS[word] / N

def norvig_correction(word): 
    "Most probable spelling correction for word."
    return max(norvig_candidates(word), key=norvig_P)

def norvig_candidates(word): 
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

def known(words): 
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)

def edits1(word):
    "All edits that are one edit away from `word`."
    letters    = 'abcdefghijklmnopqrstuvwxyz'
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word): 
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

In [31]:
# Functions to support finding and correcting spellings
# using pyenchant for spell checking
from enchant import DictWithPWL
from enchant.checker import SpellChecker
import difflib
# import splitter # not useful, does a worse job than my implementation

my_dict=DictWithPWL('en_US', "../data/mywords.txt")
my_checker = SpellChecker(my_dict)

def clean_slang_abbreviations(textblock):
    # u -> you
    # c -> see
    # k -> okay
    return_words = textblock
    #return_words = re.sub(r"[^A-Za-z0-9,\!\?\*\.\;\’\´\'\\\/]", " ", return_words)
    return_words = return_words.lower()
    return_words = re.sub(",","",return_words)
    return_words = re.sub("\.","",return_words)
    return_words = re.sub(r"\("," ", return_words)
    return_words = re.sub(r"\)"," ", return_words)
    return_words = re.sub(r"´", "'", return_words)
    return_words = re.sub(r"`", "'", return_words)
    return_words = re.sub(r" '", " ", return_words)
    return_words = re.sub(r"' ", " ", return_words)
    return_words = re.sub(r"\!\!*", "\!Many", return_words)
    return_words = re.sub(r"\?\?*", "\?Many", return_words)
    return_words = re.sub(r"\!", " !", return_words)
    return_words = re.sub(r"\?", " \?", return_words)
    return_words = re.sub(r"n't", " not", return_words)
    return return_words

def trysplit(word):
    split_candidates = []
    max_proba = 0.0
    for i in range(1,len(word)):
        # I will only allow single letters of 'a' and 'i', all others ignored.  Pyenchant allows for
        # any single letter to be a legitimate word, and so too does norvig.  The dictionary defines
        # them as nouns that represent the letter, however even though several can be used in slang
        # (e.g. k->okay, c->see, u->you) using them in conjoined words would make the splitting far
        # too difficult and also human understanding much more difficult #howucthisk, u c?
        if (len(word[:i]) != 1 or (word[:i].lower() == 'a' or word[:i].lower() == 'i')) and (
            len(word[i:]) != 1 or (word[i:].lower() == 'a' or word[i:].lower() == 'i')):
            if my_checker.check(word[:i]) and my_checker.check(word[i:]):
                norvig_score = norvig_P(word[:i]) + norvig_P(word[i:])
                if norvig_score > max_proba:
                    max_proba = norvig_score
                    split_candidates = [word[:i],word[i:]]
    for i in range(1,len(word)):
        for j in range(i+1,len(word)):        
            if (len(word[:i]) != 1 or (word[:i].lower() == 'a' or word[:i].lower() == 'i')) and (
                len(word[i:j]) != 1 or (word[i:j].lower() == 'a' or word[i:j].lower() == 'i')) and (
                len(word[i:]) != 1 or (word[i:].lower() == 'a' or word[i:].lower() == 'i')):
                
                if my_checker.check(word[:i]) and my_checker.check(word[i:j]) and my_checker.check(word[j:]):
                    norvig_score = norvig_P(word[:i]) + norvig_P(word[i:j]) + norvig_P(word[j:])
                    if norvig_score > max_proba:
                        max_proba = norvig_score
                        split_candidates = [word[:i],word[i:j],word[j:]]
    for i in range(1,len(word)):
        for j in range(i+1,len(word)):
            for k in range(j+1,len(word)):
                if (len(word[:i]) != 1 or (word[:i].lower() == 'a' or word[:i].lower() == 'i')) and (
                    len(word[i:j]) != 1 or (word[i:j].lower() == 'a' or word[i:j].lower() == 'i')) and (
                    len(word[j:k]) != 1 or (word[j:k].lower() == 'a' or word[j:k].lower() == 'i')) and (
                    len(word[k:]) != 1 or (word[k:].lower() == 'a' or word[k:].lower() == 'i')):
#                     print("making it here with i=%s j=%s k=%s %s  max_proba=%d" %(word[:i],word[i:j],word[j:k],word[k:], max_proba))
#                     print("lengths are %d %d %d %d" % (len(word[:i]), len(word[i:j]),len(word[j:k]),len(word[k:])))
                    if my_checker.check(word[:i]) and my_checker.check(word[i:j]) and my_checker.check(word[j:k]) and my_checker.check(word[k:]):
#                         print('found words ' + word[i:] + ' ' + word[k:])
                        norvig_score = norvig_P(word[:i]) + norvig_P(word[i:j]) + norvig_P(word[j:k]) + norvig_P(word[k:])
                        if norvig_score > max_proba:
#                             print("found higher probability %d with %s %s %s %s" % (norvig_score, word[:i], word[i:j], word[j:k], word[k:]))
                            max_proba = norvig_score
                            split_candidates = [word[:i],word[i:j],word[j:k],word[k:]]
    return split_candidates

def fix_spelling_errors(textdoc):
    words = textdoc.split()
    return_list = []
    for word in words:
        if large_word_corpus.intersection(set([re.sub(punctuation,'',word.lower())])):
            return_list.append(word)
        else:
            # word is not found in the dictionary, try to correct the spelling
            if word == spell(word): # no changes made by the spell checker
                return_list.extend(trysplit(word))
            else:
                return_list.append(spell(word))
    return return_list

def get_best_candidates(word):
    best_words = []
    best_ratio = 0
    a = set(my_checker.suggest(word))
    for b in a:
        tmp = difflib.SequenceMatcher(None, word, b).ratio()
        if tmp > best_ratio:
            best_words=[b]
            best_ratio=tmp
        elif tmp == best_ratio:
            best_words.append(b)
    return best_words
    
def fix_spellings(textinput):
    words = textinput.split()
    return_list = []
    for word in words:
        if my_checker.check(word):
            return_list.append(word)
            # continue
        else:
            if len(norvig_candidates(word)) == 1 and norvig_candidates(word).pop() == word:
                # this is suspicious, pyenchants' "suggest" method always returns something, however if
                # norvigs method cannot find a suitable match within a short distance then it simply
                # returns the orignal word.  This section is for potentially conjoined words
                tmp_list=trysplit(word)
                
                # If we get back a list of split words then use these
                if len(tmp_list) != 0:
                    return_list.extend(tmp_list)
                    continue
            
            candidates = get_best_candidates(word)
            if len(candidates) == 1:
                    return_list.append(candidates)
            elif len(candidates) > 1:
                # try another spell checker
                candidates2 = norvig_candidates(word)
                tmp_len = len(candidates2.intersection(candidates))
                if tmp_len == 1:
                    # only 1 overlap, should be correct
                    print(candidates2.intersection(candidates))
                    return_list.append(candidates2.intersection(candidates).pop())
                else:
                    # arbitrary now, just going to use the first one found
                    return_list.append(candidates[0])
    return return_list

# myword='In a long discussion about thisismessedup what should I do askd'
# print(fix_spellings(myword))

In [23]:
myword2='findthiscara'
# trysplit(myword)
#norvig_P('is')
#fix_spellings('alit')
# help(enchant)
print(get_best_candidates("I'm"))

["I'm"]


In [32]:
index=8
print(train_data[index])
print(fix_spellings(clean_slang_abbreviations(train_data[index])))

Sorry if the word 'nonsense' was offensive to you. Anyway, I'm not intending to write anything in the article(wow they would jump on me for vandalism), I'm merely requesting that it be more encyclopedic so one can use it for school as a reference. I have been to the selective breeding page but it's almost a stub. It points to 'animal breeding' which is a short messy article that gives you no info. There must be someone around with expertise in eugenics? 93.161.107.169
{'ism'}
{'ism'}


AttributeError: 'list' object has no attribute 'intersection'