In [12]:
import numpy as np
import pandas as pd
import re 
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from nltk.corpus import stopwords
from scipy import optimize
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import PorterStemmer
import string
from scipy.io import loadmat
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split
from utils import data_splitter,text_process,build_freqs

In [21]:
# Manual hold-out split: the first `n_test_per_class` rows of each class form
# the test set, every remaining row goes to the training set.
n_test_per_class = 20000

df = pd.read_csv('dataset.csv')
humor_df = df[df['humor'] == True]
non_humor_df = df[df['humor'] == False]

test_pos = humor_df[:n_test_per_class]
train_pos = humor_df[n_test_per_class:]
test_neg = non_humor_df[:n_test_per_class]
train_neg = non_humor_df[n_test_per_class:]

test = pd.concat([test_pos, test_neg])
train = pd.concat([train_pos, train_neg])

X_train = list(train['text'])
# Fix: this line used to rebind X_train, which discarded the training texts
# and left X_test undefined.
X_test = list(test['text'])

# Encode the boolean `humor` column as 1 (humor) / 0 (non-humor).
y_train = list(train['humor'].map({True: 1, False: 0}))
y_test = list(test['humor'].map({True: 1, False: 0}))

In [2]:

# Split the corpus with the project helper from utils. This rebinds X_train,
# X_test, y_train and y_test, replacing the values from the manual split cell.
# NOTE(review): train_lab / test_lab look like label sequences aligned with
# the texts (train_lab is what build_freqs receives) - confirm in utils.
X_train, X_test, y_train, y_test,train_lab,test_lab = data_splitter('dataset.csv')


In [3]:
# Frequency table used for feature extraction; it is later queried with
# (word, label) tuple keys via freqs.get(...).
# NOTE(review): presumably maps (word, label) -> occurrence count - confirm in utils.
freqs = build_freqs(X_train, train_lab)

In [4]:
def sigmoid(z):
    """Apply the logistic function 1 / (1 + e^-z) element-wise.

    Args:
        z: A scalar or NumPy array of logits.

    Returns:
        The logistic of ``z``, with the same shape as the input.
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator

def gradientDescent(x, y, theta, alpha, num_iters):
    """Fit logistic-regression weights with batch gradient descent.

    Args:
        x: Feature matrix of shape (m, n), one row per example.
        y: Labels, one per row of ``x``. Any array-like of length m is
            accepted (list, 1-D array or (m, 1) column).
        theta: Initial weights with n entries, as a vector or (n, 1) column.
        alpha: Learning rate.
        num_iters: Number of update steps to perform.

    Returns:
        A tuple ``(J, theta)``. ``J`` is the cross-entropy cost as a Python
        float; ``theta`` holds the updated weights in the same shape that was
        passed in.

    Note:
        The cost is evaluated before each weight update, so the returned ``J``
        belongs to the weights that entered the final iteration rather than to
        the returned ``theta``. With ``num_iters == 0`` the cost of the
        initial weights is returned and ``theta`` is left unchanged.
    """
    x = np.asarray(x, dtype=float)
    # Work with column vectors throughout: a 1-D ``y`` combined with an
    # (n, 1) ``theta`` would otherwise broadcast ``h - y`` to an (m, m) matrix.
    y = np.asarray(y, dtype=float).reshape(-1, 1)
    theta_shape = np.shape(theta)
    theta = np.asarray(theta, dtype=float).reshape(-1, 1)
    m = x.shape[0]

    def _cost(h):
        # Mean binary cross-entropy; the result is a (1, 1) array.
        return -1. / m * (np.dot(y.transpose(), np.log(h))
                          + np.dot((1 - y).transpose(), np.log(1 - h)))

    J = None
    for _ in range(num_iters):
        h = sigmoid(np.dot(x, theta))
        J = _cost(h)
        theta = theta - (alpha / m) * np.dot(x.transpose(), (h - y))

    if J is None:
        # No update step ran; report the cost of the starting weights instead
        # of failing on an unbound variable.
        J = _cost(sigmoid(np.dot(x, theta)))

    # ``.item()`` extracts the scalar explicitly; ``float()`` on an array with
    # ndim > 0 is deprecated in recent NumPy releases.
    return J.item(), theta.reshape(theta_shape)


def extract_features(text, freqs):
    """Turn one text into a (1, 3) feature row for the classifier.

    Args:
        text: The raw text to featurize; it is tokenized by ``text_process``.
        freqs: Mapping queried with ``(word, 1.0)`` and ``(word, 0.0)`` keys;
            missing keys count as 0.

    Returns:
        A NumPy array of shape (1, 3): ``[1, pos_sum, neg_sum]``, where the
        leading 1 is the bias term and the two sums add up the ``freqs``
        entries of every token for label 1.0 and label 0.0 respectively.
        A token that occurs several times contributes each time.
    """
    features = np.array([1.0, 0.0, 0.0])  # bias, positive sum, negative sum
    for token in text_process(text):
        features[1] += freqs.get((token, 1.0), 0)
        features[2] += freqs.get((token, 0.0), 0)
    return features.reshape(1, 3)

## Training 

In [5]:
# Build the design matrix: one 3-column feature row per training text.
n_samples = len(X_train)
X = np.zeros((n_samples, 3))
for idx in range(n_samples):
    X[idx, :] = extract_features(X_train[idx], freqs)

# Fit the weights starting from all zeros.
learning_rate = 1e-9
n_iterations = 1500
initial_theta = np.zeros((3, 1))
J, theta = gradientDescent(X, y_train, initial_theta, learning_rate, n_iterations)

print(f"The cost after training is {J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")

The cost after training is 0.49099771.
The resulting vector of weights is [-6e-08, 0.00031954, -0.00038018]


In [6]:
def predict_text(text, freqs, theta):
    """Score a single text with the trained logistic-regression weights.

    Args:
        text: The raw text to score.
        freqs: Frequency mapping forwarded to ``extract_features``.
        theta: Weight vector applied to the extracted features.

    Returns:
        ``sigmoid(features . theta)``; the caller thresholds this value at 0.5
        to decide between the two classes.
    """
    return sigmoid(np.dot(extract_features(text, freqs), theta))

In [7]:
# Spot-check the trained weights on one joke; the displayed value is the
# sigmoid output of predict_text for this sentence.
predict_text("What do you call a bee that can’t make up its mind? A maybe.", freqs, theta)

array([[0.89242283]])

In [8]:
def test_logistic_regression(test_x, test_y, freqs, theta):
    y_hat = []
    for text in test_x:
        y_pred = predict_text(text, freqs, theta)
        if y_pred > 0.5:
            y_hat.append(1)
        else:
            y_hat.append(0)
            
    accuracy = (y_hat==np.squeeze(test_y)).sum()/len(test_x)
    return accuracy

In [9]:
# Accuracy of the hand-written logistic regression on the held-out texts,
# i.e. the fraction of X_test whose thresholded prediction matches y_test.
tmp_accuracy = test_logistic_regression(X_test, y_test, freqs, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

Logistic regression model's accuracy = 0.8238


In [10]:
# Classify one example sentence: print the raw score, then the label obtained
# by thresholding it at 0.5.
my_text = "What do you call a bee that can’t make up its mind? A maybe."
y_hat = predict_text(my_text, freqs, theta)
print(y_hat)
print('Humor text' if y_hat > 0.5 else 'Non_Humor text')

[[0.89242283]]
Humor text


In [11]:
# Classify a short plain sentence: print the raw score, then the label
# obtained by thresholding it at 0.5.
my_text = "I miss you"
y_hat = predict_text(my_text, freqs, theta)
print(y_hat)
print('Humor text' if y_hat > 0.5 else 'Non_Humor text')

[[0.4919507]]
Non_Humor text


## Using scikit-learn

In [None]:
# Hyper-parameter grid for the SVM: 5 values of C x 5 values of gamma, all
# with the sigmoid kernel.
svc_param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['sigmoid'],
}

# The grid search is the final pipeline step, so it tunes the SVC on the
# TF-IDF features produced by the two preceding steps.
svc_grid_search = GridSearchCV(SVC(), param_grid=svc_param_grid, refit=True, verbose=3)

pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # tokens come from the project's text_process
    ('tfidf', TfidfTransformer()),
    ('classifier', svc_grid_search),
])