In [2]:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import random
import re

In [3]:
# Load the sarcasm corpus; the cells below read its 'subreddit', 'label',
# 'ups', 'downs' and 'comment' columns.
comments = pd.read_csv(filepath_or_buffer='data/train-balanced-sarcasm.csv')

In [4]:
# Only subreddits with more than this many comments are kept.
MIN_SUBREDDIT_COMMENTS = 8000

# Number of comments per subreddit, indexed by subreddit name.
subreddit_counts = comments['subreddit'].value_counts()

# Attach each row's subreddit size as the column 'subreddit_count'.
# The Series is renamed explicitly because its default name depends on the
# pandas version ('subreddit' before 2.0, 'count' from 2.0 on); relying on
# rsuffix='_count' only produced 'subreddit_count' on the older versions.
comments_counts = comments.join(subreddit_counts.rename('subreddit_count'), on='subreddit')
comments_counts = comments_counts[comments_counts['subreddit_count'] > MIN_SUBREDDIT_COMMENTS]

# One-hot encode the surviving subreddits as sparse 'r_<subreddit>' columns.
subreddit_data = pd.get_dummies(comments_counts['subreddit'], prefix='r', sparse=True)

# Class balance after filtering. The recorded run showed:
# 1    183069
# 0    169832
# close enough!
display(comments_counts['label'].value_counts())

1    183069
0    169832
Name: label, dtype: int64

In [None]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Drop everything except for the subreddit dummy values and the up/down scores and the labels
comments_test = comments_counts.drop(
    ['comment', 'subreddit', 'author', 'date', 'created_utc',
     'parent_comment', 'score', 'subreddit_count'],
    axis=1,
)
dummy_comments = pd.concat([comments_test, subreddit_data], axis=1)

# Take the absolute value of each score, since the multinomial naive Bayes
# classifier does not accept negative feature values.
dummy_comments['ups'] = dummy_comments['ups'].abs()
dummy_comments['downs'] = dummy_comments['downs'].abs()

X = dummy_comments.drop('label', axis=1).values
y = dummy_comments['label'].values

# Per-class F-scores collected over repeated random train/test splits.
f_scores_0 = []
f_scores_1 = []

# Each of the 20 splits uses its own fixed seed, so the splits differ from one
# another but the averaged scores are identical on every re-run.
for seed in range(20):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)

    clf = MultinomialNB()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Index 2 of the result is the per-class F-score array (label 0, label 1).
    f = precision_recall_fscore_support(y_test, y_pred)[2]

    f_scores_0.append(f[0])
    f_scores_1.append(f[1])

# Mean F-score for label 0, then for label 1.
display(np.mean(f_scores_0))
display(np.mean(f_scores_1))

In [None]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Drop everything except for the subreddit dummy values and the labels
comments_test_no_scores = comments_counts.drop(
    ['comment', 'subreddit', 'author', 'date', 'created_utc',
     'parent_comment', 'score', 'subreddit_count', 'ups', 'downs'],
    axis=1,
)
dummy_comments_no_scores = pd.concat([comments_test_no_scores, subreddit_data], axis=1)

# Features are the subreddit dummy columns only; the target is the label.
X = dummy_comments_no_scores.drop('label', axis=1).values
y = dummy_comments_no_scores['label'].values

# Per-class F-scores collected over repeated random train/test splits.
f_scores_0_no_scores = []
f_scores_1_no_scores = []

# Each of the 20 splits uses its own fixed seed, so the splits differ from one
# another but the averaged scores are identical on every re-run.
for seed in range(20):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)

    clf = MultinomialNB()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Index 2 of the result is the per-class F-score array (label 0, label 1).
    f = precision_recall_fscore_support(y_test, y_pred)[2]

    f_scores_0_no_scores.append(f[0])
    f_scores_1_no_scores.append(f[1])

# Mean F-score for label 0, then for label 1.
display(np.mean(f_scores_0_no_scores))
display(np.mean(f_scores_1_no_scores))

In [7]:
from sklearn import preprocessing

# Characters whose per-comment occurrence counts are used as features.
alphabet = 'abcdefghijklmnopqrstuvwxyz1234567890'
uppercaseAlphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
# Backslashes are spelled '\\' so the literal contains no invalid escape
# sequences; the resulting string is unchanged.
# NOTE(review): the value holds a backslash four times (before '(', ')', '{'
# and '}') and ends with the backspace character \x08 - presumably leftovers
# from regex-style escaping; confirm whether these are intended as features.
punctuation = '*"\\(\\)~:/&\'.-#[];_^$\\{\\}=!+?@%`,|\x08'
# Literal substrings counted in each comment.
patterns = ['...']
# Punctuation characters are counted alongside the letters and digits.
alphabet += punctuation
# Label encoder fitted on the individual characters of the feature alphabet.
le = preprocessing.LabelEncoder().fit(list(alphabet))

def comment_features_word(comment):
    """Build the numeric feature list for a single comment.

    The features are emitted in this order:

    1. the count of every character of ``alphabet`` in the lower-cased comment;
    2. the length of the comment in characters;
    3. whether any character of ``punctuation`` occurs in the comment (bool);
    4. the mean word length, where words are the pieces left after replacing
       every non-word character with a space (0 when there are no words);
    5. the substring count of every entry of ``topWords`` in the lower-cased
       comment;
    6. the count of every entry of ``patterns`` in the comment as written;
    7. the total number of ``uppercaseAlphabet`` characters in the comment.

    Reads the module-level names ``alphabet``, ``punctuation``, ``topWords``,
    ``patterns`` and ``uppercaseAlphabet``.
    NOTE(review): ``topWords`` is not assigned in the cells visible here -
    confirm it is defined before this function is called.

    Args:
        comment: The comment text. ``None`` and NaN (how pandas represents a
            missing value) are treated as an empty comment.

    Returns:
        A list of feature values in the order described above.
    """
    # Missing comments come out of pandas as NaN (a float); NaN is the only
    # float that is not equal to itself.
    if comment is None or (isinstance(comment, float) and comment != comment):
        comment = ''

    # Lower-case once instead of once per counted character / word.
    lowered = comment.lower()

    # Letter counts
    features = [lowered.count(letter) for letter in alphabet]

    # Length
    features.append(len(comment))

    # Presence of punctuation
    features.append(any(mark in comment for mark in punctuation))

    # Average word length
    words = re.sub(r"[^\w]", " ", comment).split()
    features.append(sum(len(word) for word in words) / len(words) if words else 0)

    # Words used
    features.extend(lowered.count(word) for word in topWords)

    # Checking for predefined patterns
    features.extend(comment.count(pattern) for pattern in patterns)

    # Checking for number of uppercase letters
    features.append(sum(comment.count(letter) for letter in uppercaseAlphabet))

    return features

In [None]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Drop everything except for the comment text, the subreddit dummy values and the labels
comments_test_full = comments_counts.drop(
    ['subreddit', 'author', 'date', 'created_utc', 'parent_comment',
     'score', 'subreddit_count', 'ups', 'downs'],
    axis=1,
)
comments_test_full = pd.concat([comments_test_full, subreddit_data], axis=1)

# The classifier needs numbers, not raw text: turn every comment into its
# feature row. Missing comments are replaced by '' so they can be processed
# like any other string.
comment_texts = comments_test_full['comment'].fillna('')
comment_features = np.array(
    [comment_features_word(text) for text in comment_texts], dtype=float
)

# Subreddit dummy columns, as a dense float array with one row per comment.
subreddit_features = comments_test_full.drop(['label', 'comment'], axis=1).values.astype(float)

# Final design matrix: text-derived features followed by the subreddit dummies.
X = np.hstack([comment_features, subreddit_features])
y = comments_test_full['label'].values

# Per-class F-scores collected over repeated random train/test splits.
# NOTE(review): these two names are also assigned by the subreddit-only cell,
# so running this cell replaces that cell's results.
f_scores_0_no_scores = []
f_scores_1_no_scores = []

# Each of the 20 splits uses its own fixed seed, so the splits differ from one
# another but the averaged scores are identical on every re-run.
for seed in range(20):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)

    clf = MultinomialNB()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Index 2 of the result is the per-class F-score array (label 0, label 1).
    f = precision_recall_fscore_support(y_test, y_pred)[2]

    f_scores_0_no_scores.append(f[0])
    f_scores_1_no_scores.append(f[1])

# Mean F-score for label 0, then for label 1.
display(np.mean(f_scores_0_no_scores))
display(np.mean(f_scores_1_no_scores))