In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler, LabelEncoder, PowerTransformer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc, f1_score
from sklearn.metrics import roc_auc_score, precision_recall_curve, average_precision_score, plot_precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn import (datasets, model_selection, feature_extraction, linear_model)
import nltk  
import random  
import string
import bs4 as bs  
import urllib.request  
import re 
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
np.random.seed(2018)
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\Iris
[nltk_data]     Limani\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [35]:
train_path = '/Users/Iris Limani/Documents/Asigmo Program/Hackathon/train.csv'
df = pd.read_csv(train_path, sep=',', encoding="UTF-8")

In [36]:
df = df[df['humor_controversy'].notna()]
df

Unnamed: 0,id,text,is_humor,humor_rating,humor_controversy,offense_rating
0,1,TENNESSEE: We're the best state. Nobody even c...,1,2.42,1.0,0.20
1,2,A man inserted an advertisement in the classif...,1,2.50,1.0,1.10
2,3,How many men does it take to open a can of bee...,1,1.95,0.0,2.40
3,4,Told my mom I hit 1200 Twitter followers. She ...,1,2.11,1.0,0.00
4,5,Roses are dead. Love is fake. Weddings are bas...,1,2.78,0.0,0.10
...,...,...,...,...,...,...
7991,7992,"Sins are like viruses, it's better you keep th...",1,2.13,1.0,0.10
7993,7994,My daughter wanted a Cinderella themed birthda...,1,3.70,0.0,0.00
7996,7997,Why are aspirins white? Because they work sorry,1,1.33,0.0,3.85
7997,7998,"Today, we Americans celebrate our independence...",1,2.55,0.0,0.00


In [37]:
print(len(df))

4932


In [38]:
#perform lemmatize and stem preprocessing steps on the data set.

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [39]:
doc_sample = df[df['id'] == 1].values[0][1]
doc_sample

"TENNESSEE: We're the best state. Nobody even comes close. *Elevennessee walks into the room* TENNESSEE: Oh shit..."

In [40]:
# Select a document to preview after preprocessing.
stemmer = SnowballStemmer('english')
doc_sample = df[df['id'] == 1].values[0][1]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['TENNESSEE:', "We're", 'the', 'best', 'state.', 'Nobody', 'even', 'comes', 'close.', '*Elevennessee', 'walks', 'into', 'the', 'room*', 'TENNESSEE:', 'Oh', 'shit...']


 tokenized and lemmatized document: 
['tennesse', 'best', 'state', 'come', 'close', 'elevennesse', 'walk', 'room', 'tennesse', 'shit']


In [41]:
#Preprocess the text, saving the results as ‘processed_docs’
processed_docs = df['text'].map(preprocess)
processed_docs[:20]

0     [tennesse, best, state, come, close, elevennes...
1     [insert, advertis, classifi, wife, want, recei...
2                [open, beer, open, time, bring, couch]
3     [tell, twitter, follow, point, brother, own, h...
4     [rose, dead, love, fake, wed, basic, funer, cake]
7     [origin, truli, kind, hold, glass, white, wine...
11    [differ, mormon, muslim, mormon, get, virgin, ...
12    [stop, call, toilet, paper, run, toilet, paper...
13    [march, street, shout, peopl, civil, disobedi,...
17    [sentenc, creepier, add, consent, consent, adu...
19    [shrink, make, space, atom, smaller, subatom, ...
20    [say, pube, get, long, erect, look, like, pino...
23           [girlfriend, like, squar, root, imaginari]
25                [wife, leav, say, meal, intend, word]
26    [chines, select, babi, name, chuck, stair, pin...
27    [relief, go, doctor, look, larg, mole, chest, ...
28                     [father, trust, fact, say, tell]
29                 [specimen, mama, italian, ast

In [54]:
#Bag of Words on the Data set
#Bag of Words on the Data set
#Create a dictionary from ‘processed_docs’ containing the number of times a word appears in the training set.

dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    #if count > 10:
        #break

0 best
1 close
2 come
3 elevennesse
4 room
5 shit
6 state
7 tennesse
8 walk
9 advertis
10 avail
11 classifi
12 deliveri
13 door
14 free
15 insert
16 read
17 receiv
18 repli
19 step
20 want
21 wife
22 beer
23 bring
24 couch
25 open
26 time
27 agenc
28 brother
29 collect
30 follow
31 hous
32 own
33 point
34 tell
35 twitter
36 basic
37 cake
38 dead
39 fake
40 funer
41 love
42 rose
43 wed
44 glass
45 hold
46 kind
47 laugh
48 origin
49 truli
50 white
51 wine
52 differ
53 get
54 kill
55 mormon
56 muslim
57 virgin
58 call
59 colbert
60 dial
61 number
62 paper
63 run
64 stephen
65 stop
66 toilet
67 civil
68 disobedi
69 drink
70 hour
71 involuntari
72 march
73 peopl
74 shout
75 street
76 add
77 adult
78 consent
79 creepier
80 kayak
81 rid
82 sentenc
83 tandem
84 atom
85 fuck
86 make
87 shrink
88 shut
89 smaller
90 space
91 stall
92 subatom
93 erect
94 go
95 join
96 like
97 long
98 look
99 pinocchio
100 pube
101 say
102 taliban
103 girlfriend
104 imaginari
105 root
106 squar
107 intend
108 leav


In [55]:
#Filter out tokens that appear in
#less than 15 documents (absolute number) or
#more than 0.5 documents (fraction of total corpus size, not absolute number).
#after the above two steps, keep only the first 100000 most frequent tokens.

dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [56]:
#Gensim doc2bow
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4310]

[(45, 1), (86, 1), (357, 1), (456, 1)]

In [57]:
#Preview Bag Of Words for our sample preprocessed document.

bow_doc_4310 = bow_corpus[4310]
for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                               dictionary[bow_doc_4310[i][0]], 
bow_doc_4310[i][1]))

Word 45 ("peopl") appears 1 time.
Word 86 ("think") appears 1 time.
Word 357 ("straight") appears 1 time.
Word 456 ("drive") appears 1 time.


In [58]:
#TF-IDF

from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.338544985798022),
 (1, 0.45524153615456625),
 (2, 0.28825041494690795),
 (3, 0.3942779611201561),
 (4, 0.37438265976636786),
 (5, 0.45524153615456625),
 (6, 0.30363101263607206)]


In [59]:
#Running LDA using Bag of Words
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [60]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.042*"peopl" + 0.025*"say" + 0.024*"get" + 0.023*"mean" + 0.020*"hear" + 0.018*"know" + 0.018*"tell" + 0.017*"black" + 0.017*"women" + 0.015*"want"
Topic: 1 
Words: 0.065*"say" + 0.037*"ask" + 0.030*"peopl" + 0.029*"girlfriend" + 0.026*"want" + 0.026*"talk" + 0.022*"know" + 0.020*"friend" + 0.017*"tell" + 0.016*"wife"
Topic: 2 
Words: 0.061*"like" + 0.024*"go" + 0.023*"know" + 0.016*"phone" + 0.014*"guy" + 0.014*"look" + 0.014*"good" + 0.014*"wear" + 0.013*"say" + 0.013*"time"
Topic: 3 
Words: 0.059*"like" + 0.051*"black" + 0.031*"peopl" + 0.028*"year" + 0.014*"friend" + 0.013*"woman" + 0.013*"girl" + 0.013*"night" + 0.012*"wife" + 0.011*"look"
Topic: 4 
Words: 0.035*"like" + 0.032*"wife" + 0.016*"hard" + 0.016*"today" + 0.013*"love" + 0.013*"jewish" + 0.013*"guess" + 0.011*"peopl" + 0.011*"call" + 0.011*"know"
Topic: 5 
Words: 0.036*"say" + 0.030*"tell" + 0.026*"time" + 0.019*"hand" + 0.017*"right" + 0.016*"watch" + 0.016*"know" + 0.013*"live" + 0.013*"like" + 0.012*

In [61]:
#Running LDA using TF-IDF

lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.022*"walk" + 0.022*"women" + 0.018*"look" + 0.017*"say" + 0.016*"doctor" + 0.014*"like" + 0.014*"feel" + 0.012*"bodi" + 0.012*"give" + 0.011*"wife"
Topic: 1 Word: 0.024*"think" + 0.017*"feminist" + 0.017*"muslim" + 0.016*"common" + 0.014*"babi" + 0.014*"leav" + 0.013*"dead" + 0.012*"wife" + 0.011*"mean" + 0.011*"like"
Topic: 2 Word: 0.035*"like" + 0.019*"know" + 0.018*"differ" + 0.016*"kid" + 0.014*"black" + 0.013*"drink" + 0.013*"kill" + 0.012*"think" + 0.012*"want" + 0.011*"wife"
Topic: 3 Word: 0.021*"go" + 0.018*"say" + 0.016*"like" + 0.015*"wife" + 0.014*"santa" + 0.013*"tri" + 0.012*"need" + 0.010*"fuck" + 0.010*"look" + 0.010*"glass"
Topic: 4 Word: 0.017*"corni" + 0.017*"wife" + 0.016*"black" + 0.015*"tell" + 0.012*"money" + 0.012*"like" + 0.011*"night" + 0.011*"come" + 0.011*"lose" + 0.011*"live"
Topic: 5 Word: 0.027*"say" + 0.017*"ask" + 0.017*"know" + 0.016*"peopl" + 0.015*"call" + 0.014*"girlfriend" + 0.013*"friend" + 0.013*"like" + 0.012*"wife" + 0.011*"blac

In [62]:
processed_docs[:4310]

0       [tennesse, best, state, come, close, elevennes...
1       [insert, advertis, classifi, wife, want, recei...
2                  [open, beer, open, time, bring, couch]
3       [tell, twitter, follow, point, brother, own, h...
4       [rose, dead, love, fake, wed, basic, funer, cake]
                              ...                        
6970    [vietnam, open, restaur, call, viet, nom, ask,...
6971      [romant, power, go, listen, grind, beef, throw]
6972    [american, border, patrol, guard, xanax, stop,...
6973             [hear, watermelon, bandit, littl, seedi]
6974                                [sexual, activ, kill]
Name: text, Length: 4310, dtype: object

In [63]:
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.8199759721755981	 
Topic: 0.032*"think" + 0.029*"peopl" + 0.023*"walk" + 0.021*"need" + 0.016*"turn" + 0.013*"look" + 0.013*"blond" + 0.012*"drive" + 0.012*"differ" + 0.011*"life"

Score: 0.020005855709314346	 
Topic: 0.042*"peopl" + 0.025*"say" + 0.024*"get" + 0.023*"mean" + 0.020*"hear" + 0.018*"know" + 0.018*"tell" + 0.017*"black" + 0.017*"women" + 0.015*"want"

Score: 0.020004572346806526	 
Topic: 0.065*"say" + 0.037*"ask" + 0.030*"peopl" + 0.029*"girlfriend" + 0.026*"want" + 0.026*"talk" + 0.022*"know" + 0.020*"friend" + 0.017*"tell" + 0.016*"wife"

Score: 0.020003626123070717	 
Topic: 0.059*"like" + 0.051*"black" + 0.031*"peopl" + 0.028*"year" + 0.014*"friend" + 0.013*"woman" + 0.013*"girl" + 0.013*"night" + 0.012*"wife" + 0.011*"look"

Score: 0.02000228501856327	 
Topic: 0.061*"like" + 0.024*"go" + 0.023*"know" + 0.016*"phone" + 0.014*"guy" + 0.014*"look" + 0.014*"good" + 0.014*"wear" + 0.013*"say" + 0.013*"time"

Score: 0.020001839846372604	 
Topic: 0.036*"say" + 0.03

In [64]:
#Performance evaluation by classifying sample document using LDA TF-IDF model.

for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.8199639916419983	 
Topic: 0.035*"like" + 0.019*"know" + 0.018*"differ" + 0.016*"kid" + 0.014*"black" + 0.013*"drink" + 0.013*"kill" + 0.012*"think" + 0.012*"want" + 0.011*"wife"

Score: 0.020008286461234093	 
Topic: 0.024*"think" + 0.017*"feminist" + 0.017*"muslim" + 0.016*"common" + 0.014*"babi" + 0.014*"leav" + 0.013*"dead" + 0.012*"wife" + 0.011*"mean" + 0.011*"like"

Score: 0.0200059674680233	 
Topic: 0.029*"peopl" + 0.021*"joke" + 0.014*"tell" + 0.014*"german" + 0.014*"thing" + 0.014*"make" + 0.013*"come" + 0.013*"say" + 0.012*"black" + 0.011*"like"

Score: 0.020004520192742348	 
Topic: 0.019*"know" + 0.019*"hear" + 0.016*"head" + 0.013*"girlfriend" + 0.012*"break" + 0.011*"give" + 0.010*"take" + 0.010*"blond" + 0.009*"bring" + 0.009*"deaf"

Score: 0.02000393159687519	 
Topic: 0.027*"say" + 0.017*"ask" + 0.017*"know" + 0.016*"peopl" + 0.015*"call" + 0.014*"girlfriend" + 0.013*"friend" + 0.013*"like" + 0.012*"wife" + 0.011*"black"

Score: 0.02000373788177967	 
Topic: 0.02

In [65]:
unseen_document = 'how to keep the flies off the bride at an italian wedding keep a bucket of shit next to her'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.8199540376663208	 Topic: 0.047*"tell" + 0.030*"say" + 0.020*"wife" + 0.019*"like" + 0.018*"come"
Score: 0.020009970292448997	 Topic: 0.042*"peopl" + 0.025*"say" + 0.024*"get" + 0.023*"mean" + 0.020*"hear"
Score: 0.020007777959108353	 Topic: 0.044*"say" + 0.025*"doctor" + 0.021*"wife" + 0.018*"think" + 0.017*"go"
Score: 0.02000679261982441	 Topic: 0.060*"wife" + 0.020*"differ" + 0.015*"want" + 0.015*"leav" + 0.014*"person"
Score: 0.02000550739467144	 Topic: 0.036*"say" + 0.030*"tell" + 0.026*"time" + 0.019*"hand" + 0.017*"right"
Score: 0.020004762336611748	 Topic: 0.035*"like" + 0.032*"wife" + 0.016*"hard" + 0.016*"today" + 0.013*"love"
Score: 0.020003441721200943	 Topic: 0.059*"like" + 0.051*"black" + 0.031*"peopl" + 0.028*"year" + 0.014*"friend"
Score: 0.02000303752720356	 Topic: 0.061*"like" + 0.024*"go" + 0.023*"know" + 0.016*"phone" + 0.014*"guy"
Score: 0.020002475008368492	 Topic: 0.065*"say" + 0.037*"ask" + 0.030*"peopl" + 0.029*"girlfriend" + 0.026*"want"
Score: 0.02000