In [114]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from scipy.sparse import hstack
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
import textblob

#Importing NB Classifier
from sklearn.naive_bayes import MultinomialNB

#Importing logistic regression Classifier
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier


from sklearn import metrics
from sklearn.metrics import accuracy_score,precision_score,recall_score

In [115]:
data = pd.read_csv("train_sarcasm.csv")

In [116]:
data.shape

(91298, 3)

In [117]:
data.columns

Index([u'ID', u'tweet', u'label'], dtype='object')

In [118]:
data.head()

Unnamed: 0,ID,tweet,label
0,T000452358,b'oh yea that makes sense ',sarcastic
1,T000452359,Estas enfermedad a un cargo poltico tu como pb...,sarcastic
2,T000452360,@alleygirl2409 until i\'m and all the old men ...,sarcastic
3,T000452361,"b""@sarinas it had been chanted peacefully you ...",sarcastic
4,T000452362,"b""there's nothing like being on vacation and h...",sarcastic


In [119]:
data['label1'] = np.where(data['label']=="sarcastic",1,0)

In [120]:
data.head()

Unnamed: 0,ID,tweet,label,label1
0,T000452358,b'oh yea that makes sense ',sarcastic,1
1,T000452359,Estas enfermedad a un cargo poltico tu como pb...,sarcastic,1
2,T000452360,@alleygirl2409 until i\'m and all the old men ...,sarcastic,1
3,T000452361,"b""@sarinas it had been chanted peacefully you ...",sarcastic,1
4,T000452362,"b""there's nothing like being on vacation and h...",sarcastic,1


In [121]:
data.label1.value_counts()

1    51300
0    39998
Name: label1, dtype: int64

In [122]:
# Removing @name, b', b", ist, mar, https, &amp, \r, \n

regex1 = re.compile("@[\w_\d]*|b'|b\"|ist|https[\w]*|&amp|\r|\n", re.IGNORECASE)
data['tweet'] = [re.sub(regex1," ",doc) for doc in data['tweet']]

regex2 = re.compile("#sarcasm[\w]*",re.IGNORECASE)
data['tweet'] = [re.sub(regex2,"#sarcasm",doc) for doc in data['tweet']]

regex3 = re.compile("#hypocrisy[\w]*",re.IGNORECASE)
data['tweet'] = [re.sub(regex3,"#hypocrisy",doc) for doc in data['tweet']]

# Lemmatizing

lemmatizer = WordNetLemmatizer()

tweets = [[lemmatizer.lemmatize(word) for word in doc.split()] for doc in data['tweet']]

data['tweet'] = [' '.join(doc) for doc in tweets]

data[:5]

Unnamed: 0,ID,tweet,label,label1
0,T000452358,oh yea that make sense ',sarcastic,1
1,T000452359,Estas enfermedad a un cargo poltico tu como pb...,sarcastic,1
2,T000452360,until i\'m and all the old men will finally da...,sarcastic,1
3,T000452361,it had been chanted peacefully you can't deny ...,sarcastic,1
4,T000452362,there's nothing like being on vacation and hav...,sarcastic,1


In [123]:
#Creating character count, word count, word density, punctuation count as features

data['char_count'] = data['tweet'].apply(len)
data['word_count'] = data['tweet'].apply(lambda x: len(x.split()))
data['word_density'] = data['char_count'] / (data['word_count']+1)
data['punctuation_count'] = data['tweet'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 

data[:3]

Unnamed: 0,ID,tweet,label,label1,char_count,word_count,word_density,punctuation_count
0,T000452358,oh yea that make sense ',sarcastic,1,24,6,3.428571,1
1,T000452359,Estas enfermedad a un cargo poltico tu como pb...,sarcastic,1,70,10,6.363636,2
2,T000452360,until i\'m and all the old men will finally da...,sarcastic,1,71,14,4.733333,3


In [124]:
# Creating certain flags based on the word as features

data['sarcasm_flag'] = np.where(data['tweet'].str.contains('#sarcasm'),1,0)
data['hypocrisy_flag'] = np.where(data['tweet'].str.contains('#hypocrisy'),1,0)
data['seriously_flag'] = np.where(data['tweet'].str.contains('#seriously'),1,0)
data['not_flag'] = np.where(data['tweet'].str.contains('#not'),1,0)
data['sar_flag'] = np.where(data['tweet'].str.contains('$$SAR$$'),1,0)
data['haha_flag'] = np.where(data['tweet'].str.contains('haha'),1,0)
data[:3]

Unnamed: 0,ID,tweet,label,label1,char_count,word_count,word_density,punctuation_count,sarcasm_flag,hypocrisy_flag,seriously_flag,not_flag,sar_flag,haha_flag
0,T000452358,oh yea that make sense ',sarcastic,1,24,6,3.428571,1,0,0,0,0,0,0
1,T000452359,Estas enfermedad a un cargo poltico tu como pb...,sarcastic,1,70,10,6.363636,2,0,0,0,0,0,0
2,T000452360,until i\'m and all the old men will finally da...,sarcastic,1,71,14,4.733333,3,1,0,0,0,0,0


In [None]:
# Adding Frequency distribution of Part of Speech Tags. Takes a lot of time to run. And did not increase accuracy

pos_family = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}

# function to check and get the part of speech tag count of a words in a given sentence
def check_pos_tag(x, flag):
    cnt = 0
   
    wiki = textblob.TextBlob(x)

    for tup in wiki.tags:
        ppo = list(tup)[1]
        if ppo in pos_family[flag]:
            cnt += 1
   
    return cnt

data['noun_count'] = data['tweet'].apply(lambda x: check_pos_tag(x, 'noun'))
data['verb_count'] = data['tweet'].apply(lambda x: check_pos_tag(x, 'verb'))
data['adj_count'] = data['tweet'].apply(lambda x: check_pos_tag(x, 'adj'))
data['adv_count'] = data['tweet'].apply(lambda x: check_pos_tag(x, 'adv'))
data['pron_count'] = data['tweet'].apply(lambda x: check_pos_tag(x, 'pron'))

In [125]:
# Keeping only words, spaces

data['tweet'] = [re.sub("[^\w\s]", '', doc).lower() for doc in data['tweet']]
data['tweet'] = [re.sub(" \d+", " ", doc) for doc in data['tweet']]

# Defining set of stopwords

stopwords=['is','am','are','was','has','had','have','be','do','does','did','in','the','a','an','and','by','for',
              'of','i','im','he','she','me','you','your','they','them','this','that','these','those']

data['tweet'] = data['tweet'].map(lambda x :" ".join(word for word in word_tokenize(x) if word not in stopwords and len(word)>=3))



In [50]:
# Adding sentiment as a feature. Did not improve the performance after adding, hence not considering as a feature

sid = SentimentIntensityAnalyzer()

def vaderSentiment(sentence):
    sent_score = 0
   
    pos = neg =neu = sent_score= 0

    if sentence=='':
        return 0
    else:
        sent_score = sid.polarity_scores(sentence)
        neg = sent_score['neg']
        pos = sent_score['pos']
        neu = sent_score['neu']

        if pos>=neg and pos>=neu:
            sent_score = 1
        elif neu>=pos and neu>= neg:
            sent_score = 0
        else:
            sent_score = -1
        
    return sent_score

#data['sentiment'] = [vaderSentiment(doc) for doc in data['tweet']]

In [126]:
#Initializing the vectorizer

#count_vect = CountVectorizer(ngram_range=(1,2),max_features=25000)

#count_vect.fit(data['tweet'])

# Tfidf vectorizer performed better 

tf_idf_vect = TfidfVectorizer(analyzer='word',ngram_range=(1,2),max_features=25000)

tf_idf_vect.fit(data['tweet'])

# Characters and their ngrams as features

tfidf_vect_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_chars.fit(data['tweet'])

TfidfVectorizer(analyzer='char', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=5000, min_df=1,
        ngram_range=(2, 3), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='\\w{1,}', tokenizer=None, use_idf=True,
        vocabulary=None)

In [127]:
X = data['tweet']
Y = data['label1']

In [128]:
# Splitting the data
trainX, testX, trainY, testY =  train_test_split(X, Y, test_size = .3, random_state = 150)

In [129]:
#Transforming the words to vectors
X_train_tfidf_feature = tf_idf_vect.transform(trainX)

X_test_tfidf_feature = tf_idf_vect.transform(testX)

X_train_tfidf_char_feature = tfidf_vect_chars.transform(trainX)

X_test_tfidf_char_feature = tfidf_vect_chars.transform(testX)

In [130]:
full_train_feats = hstack([X_train_tfidf_feature, np.array(data[data.index.isin(trainX.index)].iloc[:, 4:])])
full_train_feats
#len(tf_idf_vect.get_feature_names() + list(data.iloc[:, 4:].columns))

<63908x25010 sparse matrix of type '<type 'numpy.float64'>'
	with 841907 stored elements in COOrdinate format>

In [131]:
full_test_feats = hstack([X_test_tfidf_feature, np.array(data[data.index.isin(testX.index)].iloc[:, 4:])])
full_test_feats

<27390x25010 sparse matrix of type '<type 'numpy.float64'>'
	with 362006 stored elements in COOrdinate format>

In [132]:
#Initializing the classfier
NB = MultinomialNB()

#Fitting the model with the x and y values
NB.fit(full_train_feats,trainY)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [133]:
#test with Naive-Bayes
test_prediction = NB.predict(full_test_feats)

In [134]:
print  "Accuracy Score : ",accuracy_score(testY,test_prediction),\
"\nPrecision score : ",precision_score(testY,test_prediction),"\nRecall score : ",recall_score(testY,test_prediction)

Accuracy Score :  0.873932092004 
Precision score :  0.862325015216 
Recall score :  0.922455889055


In [135]:
# Performance metrics by target variable
print metrics.classification_report(testY,test_prediction)

             precision    recall  f1-score   support

          0       0.89      0.81      0.85     12031
          1       0.86      0.92      0.89     15359

avg / total       0.88      0.87      0.87     27390



In [136]:
LR = LogisticRegression()
#Fitting the model with the x and y values
LR.fit(full_train_feats,trainY)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [137]:
#test with Logistic
test_prediction_lr = LR.predict(full_test_feats)

In [138]:
print  "Accuracy Score : ",accuracy_score(testY,test_prediction_lr),\
"\nPrecision score : ",precision_score(testY,test_prediction_lr),"\nRecall score : ",recall_score(testY,test_prediction_lr)

Accuracy Score :  0.877802117561 
Precision score :  0.886287625418 
Recall score :  0.897193827723


In [139]:
# Performance metrics by target variable
print metrics.classification_report(testY,test_prediction_lr)

             precision    recall  f1-score   support

          0       0.87      0.85      0.86     12031
          1       0.89      0.90      0.89     15359

avg / total       0.88      0.88      0.88     27390



In [140]:
rf = RandomForestClassifier(class_weight='balanced',random_state = 1,n_estimators=10)
rf.fit(full_train_feats,trainY)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=1,
            verbose=0, warm_start=False)

In [141]:
#test with Random forest
test_prediction_rf = rf.predict(full_test_feats)

In [142]:
print  "Accuracy Score : ",accuracy_score(testY,test_prediction_rf),\
"\nPrecision score : ",precision_score(testY,test_prediction_rf),"\nRecall score : ",recall_score(testY,test_prediction_rf)

Accuracy Score :  0.84556407448 
Precision score :  0.883467714148 
Recall score :  0.834689758448


In [143]:
GB = GradientBoostingClassifier()
#Fitting the model with the x and y values
GB.fit(full_train_feats,trainY)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [144]:
#test with GBM
test_prediction_gbm = GB.predict(full_test_feats)

In [145]:
print  "Accuracy Score : ",accuracy_score(testY,test_prediction_gbm),\
"\nPrecision score : ",precision_score(testY,test_prediction_gbm),"\nRecall score : ",recall_score(testY,test_prediction_gbm)

Accuracy Score :  0.777327491785 
Precision score :  0.874050735175 
Recall score :  0.704407839052
