The following code is an excerpt from a project that develops a multisensory humor detection model. The model below is a semantic/content-based support vector machine (SVM) that takes phrases as input and classifies each one as either humorous or non-humorous. The complete project and the full set of models can be found here: https://github.com/imehrlich/humor_detect/blob/master/Contextual_Humor_Detection.ipynb

In [None]:
def corpus_arrange(humor, nonhumor):
    """Build a labelled, lemmatized corpus from humor and non-humor phrases.

    Args:
        humor: Indexable collection whose i-th item holds the phrase text at
            position 0 (read as ``humor[i][0]``). These rows are labelled
            ``"humor"``.
        nonhumor: Same layout as ``humor``. These rows are labelled
            ``"nonhumor"``.

    Returns:
        A ``pandas.DataFrame`` with three columns:
        ``text`` (list of lower-cased tokens per phrase),
        ``label`` (``"humor"`` or ``"nonhumor"``) and
        ``text_final`` (``str()`` of the list of lemmatized tokens that are
        alphabetic and not English stop words).
    """
    # Index-based access is kept on purpose: the inputs are only known to
    # support ``len()`` and ``obj[i][0]``.
    phrase = [humor[i][0] for i in range(len(humor))]
    phrase.extend(nonhumor[i][0] for i in range(len(nonhumor)))

    # One label per phrase, in the same order as ``phrase``.
    classes = np.array(["humor", "nonhumor"])
    label = np.repeat(classes, [len(humor), len(nonhumor)], axis=0)

    corpus = pd.DataFrame({"text": phrase, "label": label})

    # Drop rows with missing text from the frame itself (calling
    # ``dropna(inplace=True)`` on the extracted column leaves the frame
    # untouched), then renumber so positions and index labels agree.
    corpus = corpus.dropna(subset=["text"]).reset_index(drop=True)

    # Lower-case and tokenize every phrase.
    corpus["text"] = [word_tokenize(entry.lower()) for entry in corpus["text"]]

    # Map the first letter of the POS tag to a WordNet POS; anything that is
    # not an adjective, verb or adverb is lemmatized as a noun.
    tag_map = defaultdict(lambda: wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV

    # Loop invariants: load the stop-word list once (as a set for O(1)
    # membership tests) and reuse a single lemmatizer.
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    text_final = []
    for tokens in corpus["text"]:
        lemmas = [
            lemmatizer.lemmatize(word, tag_map[tag[0]])
            for word, tag in pos_tag(tokens)
            if word not in stop_words and word.isalpha()
        ]
        # Stored as the string form of the list, which is what the
        # downstream vectorizer receives as a document.
        text_final.append(str(lemmas))

    # Single column assignment instead of growing the frame cell by cell.
    corpus["text_final"] = text_final

    return corpus


def content_svm(corpus):
    """Train a linear SVM on TF-IDF features and return held-out accuracy.

    Args:
        corpus: DataFrame with a ``text_final`` column (one document string
            per row) and a ``label`` column (class name per row).

    Returns:
        Accuracy of the fitted classifier on a random 20% test split. No
        ``random_state`` is passed to the split, so reproducibility depends
        on the caller seeding NumPy's global random generator.
    """
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        corpus['text_final'], corpus['label'], test_size=0.2)

    # Learn the label -> integer mapping on the training labels only and
    # reuse it for the test labels, so both splits share one encoding.
    encoder = LabelEncoder()
    y_train = encoder.fit_transform(y_train)
    y_test = encoder.transform(y_test)

    # Fit the vocabulary and IDF weights on the training documents only;
    # fitting on the full corpus would leak test-set statistics into the
    # features and bias the reported accuracy.
    tfidf_vect = TfidfVectorizer(max_features=5000)
    x_train_tfidf = tfidf_vect.fit_transform(x_train)
    x_test_tfidf = tfidf_vect.transform(x_test)

    # Linear-kernel SVM; ``degree`` and ``gamma`` are not passed because a
    # linear kernel ignores them.
    classifier = svm.SVC(C=1.0, kernel='linear')
    classifier.fit(x_train_tfidf, y_train)

    predictions = classifier.predict(x_test_tfidf)

    # scikit-learn's documented argument order is (y_true, y_pred).
    return accuracy_score(y_test, predictions)
    
    
def content_svm_model(humor, nonhumor):
    """Run the content SVM pipeline on one humor / non-humor dataset pair.

    The two datasets are first passed through ``len_match`` (defined
    elsewhere; presumably it equalizes their sizes -- verify against its
    definition), then arranged into a corpus with ``corpus_arrange`` and
    scored with ``content_svm``.

    Args:
        humor: Humor dataset, forwarded to ``len_match``.
        nonhumor: Non-humor dataset, forwarded to ``len_match``.

    Returns:
        The accuracy value returned by ``content_svm``.
    """
    matched_humor, matched_nonhumor = len_match(humor, nonhumor)
    return content_svm(corpus_arrange(matched_humor, matched_nonhumor))

# Evaluate the content SVM on every humor / non-humor dataset pairing.
# The global NumPy seed is fixed once, before the first run.
np.random.seed(403)
content_svm_model_acc = [
    content_svm_model(humor_set, nonhumor_set)
    for humor_set, nonhumor_set in (
        (oneliners, abc_news),
        (oneliners, proverbs),
        (nycc, abc_news),
        (nycc, proverbs),
    )
]
                                                