In [1]:
with open('../datasets/RC_2018-02-28') as infile, open('../datasets/CHUNK_RC_2018-02-28', 'w') as outfile:
    for _ in range(10000):
        outfile.write(infile.readline())

In [2]:
import pandas as pd
from collections import defaultdict
from IPython.display import display, HTML

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [3]:
DATASET_PATH = '../datasets/CHUNK_RC_2018-02-28'

In [4]:
df = pd.read_json(DATASET_PATH, lines=True, chunksize=1e4).read()
df = df[(df.body != '[deleted]') & (df.body != '[removed]')]

Possible sentiment features: SentiStrength, Vader-Sentiment, LIWC. Both SentiStrength and LIWC are proprietary. LIWC also can be used to give many different psychological and language dimensions (if only it weren't proprietary).

TODO: learned Naive Bayes classifier

In [5]:
def vader_features(analyzer, body):
    vs = analyzer.polarity_scores(body)
    return {'vad_'+k: v for k, v in vs.items()}

Readability features: word count, avg sentence length, avg word length, Gunning Fog, SMOG, Flesch-Kincaid. LIWC can give language features, but again is proprietary.

TODO: COCA fluency

In [6]:
import string, re
from collections import Counter

exclude = list(string.punctuation)

def char_count(text):
    text = text.replace(' ', '')
    return len(text)

def sentence_count(text):
    ignoreCount = 0
    sentences = re.split(r' *[\.\?!][\'"\)\]]* *', text)
    for sentence in sentences:
        if not sentence:
            ignoreCount += 1
    return max(1, len(sentences) - ignoreCount)

def read_features(body):
    d = defaultdict(float)
    cc = char_count(body)
    sc = sentence_count(body)
    words = ''.join(ch for ch in body if ch not in exclude).split()
    d['WC'] = len(words)
    d['SL'] = d['WC'] / sc
    d['WL'] = cc / d['WC'] if d['WC'] > 0 else 0
    type_count = Counter(words)
    d['ttr'] = len(type_count) / float(sum(type_count.values())) if d['WC'] > 0 else 0
#     d['GI'] = textstat.gunning_fog(body)
#     d['SMOG'] = textstat.smog_index(body)
#     d['FK'] = textstat.flesch_kincaid_grade(body)
    return d

In [7]:
def all_features(comment):
    analyzer = SentimentIntensityAnalyzer()
    body = comment.body
    
    features = {
        'score': float(comment.score),
    }
    
    features.update(vader_features(analyzer, body))
    features.update(read_features(body))
    
    return pd.Series(features)

In [8]:
featurized = df.iloc[:20].apply(all_features, axis=1)

with pd.option_context('display.max_colwidth', 500, 'display.max_columns', 100):
    display(featurized)

Unnamed: 0,SL,WC,WL,score,ttr,vad_compound,vad_neg,vad_neu,vad_pos
0,13.833333,249.0,4.578313,3.0,0.690763,0.6769,0.127,0.733,0.139
1,20.0,20.0,3.85,2.0,0.95,0.25,0.098,0.714,0.188
2,9.0,45.0,4.444444,1.0,0.888889,0.5171,0.035,0.819,0.147
3,10.0,10.0,3.6,85.0,1.0,0.0,0.0,1.0,0.0
4,4.588235,78.0,7.884615,1.0,0.923077,0.9274,0.0,0.843,0.157
5,12.666667,38.0,4.605263,5.0,0.973684,0.5574,0.074,0.766,0.16
6,6.666667,40.0,4.95,1.0,0.875,-0.9175,0.292,0.614,0.094
7,10.0,30.0,4.633333,1.0,0.9,0.765,0.07,0.698,0.233
8,12.428571,87.0,5.0,8.0,0.873563,0.7954,0.037,0.848,0.115
9,17.0,34.0,4.735294,1.0,0.852941,0.8126,0.0,0.749,0.251


In [None]:
featurized = df.apply(all_features, axis=1).to_dict('index')

TODO: featurize each comment, make list of dicts, use sklearn.feature_extraction.DictVectorizer

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction import DictVectorizer

v = DictVectorizer(sparse=False)

v.fit(featurized.values())
X = v.transform(featurized.values())
y = df['controversiality']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

gnb = GaussianNB()
model = gnb.fit(X_train, y_train)
print('Score:', model.score(X_test, y_test))

Score: 0.94438559322


In [18]:
from imblearn.over_sampling import SMOTE

X_resample, y_resample = SMOTE().fit_sample(X, y)
X_retrain, X_retest, y_retrain, y_retest = train_test_split(X_resample, y_resample, test_size=0.2)
gnb_re = GaussianNB()
model_re = gnb_re.fit(X_retrain, y_retrain)
print('Oversample Score:', model_re.score(X_retest, y_retest))

Oversample Score: 0.546316359697
