In [35]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import nltk

# Bag of words and Tokenizer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF
from sklearn.preprocessing import label_binarize

# Classification methods
from sklearn.naive_bayes import MultinomialNB   # Naive Bayes
from sklearn.tree import DecisionTreeClassifier  # Decision Tree
from sklearn.neural_network import MLPClassifier # Multi-Layer Perceptron
from sklearn.neighbors.classification import KNeighborsClassifier # KNN


# Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
from sklearn.metrics import classification_report

### Training data

In [36]:
data_path = '../data/'
rdfTrain = pd.read_csv(data_path + 'train.tsv', sep = '\t', header = None, names = ['text', 'label', 'id'])
rdfTrain.head()

Unnamed: 0,text,label,id
0,My favourite food is anything I didn't have to...,27,eebbqej
1,"Now if he does off himself, everyone will thin...",27,ed00q6i
2,WHY THE FUCK IS BAYLESS ISOING,2,eezlygj
3,To make her feel threatened,14,ed7ypvh
4,Dirty Southern Wankers,3,ed0bdzj


### Dev data

In [37]:
# validation set
data_path = '../data/'
rdfDev = pd.read_csv(data_path + 'dev.tsv', sep = '\t', header = None, names = ['text', 'label', 'id'])
rdfDev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5426 entries, 0 to 5425
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5426 non-null   object
 1   label   5426 non-null   object
 2   id      5426 non-null   object
dtypes: object(3)
memory usage: 127.3+ KB


### Test data

In [38]:
# test set
data_path = '../data/'
rdfTest = pd.read_csv(data_path + 'test.tsv', sep = '\t', header = None, names = ['text', 'label', 'id'])
rdfTest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5427 entries, 0 to 5426
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5427 non-null   object
 1   label   5427 non-null   object
 2   id      5427 non-null   object
dtypes: object(3)
memory usage: 127.3+ KB


#### Neutral/Non-neutral

In [39]:
def label_neutral(row):
    if row['label'] == '27':
        return 1
    else:
        return 0

#### 3 categories: Positive, negative and Neutral

In [40]:
pos_labels = ['admiration','approval', 'amusement', 'caring', 'desire', 
              'excitement', 'gratitude', 'joy', 'love','optimism', 'pride', 'relief']
neg_labels = ['anger', 'annoyance', 'disappointment', 'disapproval', 'disgust',
              'embarrassment','fear', 'grief', 'nervousness', 'remorse', 'sadness']
ambi_labels = ['confusion', 'curiosity', 'realization', 'surprise']

In [41]:
emotion_to_idx = {
    'admiration' : '0',
    'amusement' : '1',
    'anger' : '2',
    'annoyance' : '3',
    'approval' : '4',
    'caring' : '5',
    'confusion' : '6',
    'curiosity' : '7',
    'desire' : '8',
    'disappointment' : '9',
    'disapproval' : '10',
    'disgust' : '11',
    'embarrassment' : '12',
    'excitement' : '13',
    'fear' : '14',
    'gratitude' : '15',
    'grief' : '16',
    'joy' : '17',
    'love' : '18',
    'nervousness' : '19',
    'optimism' : '20',
    'pride' : '21',
    'realization' : '22',
    'relief' : '23',
    'remorse' : '24',
    'sadness' : '25',
    'surprise' : '26',
    'neutral' : '27'
}

In [42]:
label_to_emotion = {j:i for i, j in emotion_to_idx.items()}
label_to_emotion

{'0': 'admiration',
 '1': 'amusement',
 '2': 'anger',
 '3': 'annoyance',
 '4': 'approval',
 '5': 'caring',
 '6': 'confusion',
 '7': 'curiosity',
 '8': 'desire',
 '9': 'disappointment',
 '10': 'disapproval',
 '11': 'disgust',
 '12': 'embarrassment',
 '13': 'excitement',
 '14': 'fear',
 '15': 'gratitude',
 '16': 'grief',
 '17': 'joy',
 '18': 'love',
 '19': 'nervousness',
 '20': 'optimism',
 '21': 'pride',
 '22': 'realization',
 '23': 'relief',
 '24': 'remorse',
 '25': 'sadness',
 '26': 'surprise',
 '27': 'neutral'}

In [43]:
def label_pos_neg_neutral(row):
    '''
    Sentiment Analysis label:
    0 - Negative
    1 - Positive
    2 - Neutral/Ambigous
    '''
    sentiment = [0, 0, 0]
    labels = row['label'].split(",")
    for l in labels:
        label = label_to_emotion[l] 
        if label in pos_labels:
            sentiment[1] += 1
        elif label in neg_labels:
            sentiment[0] += 1
        else:
            sentiment[2] += 1
    return np.argmax(np.array(sentiment))

In [44]:
anger_list = [ "anger", "annoyance", "disapproval", "disgust"]
fear_list = ["fear", "nervousness"]
joy_list = ["joy", "amusement", "approval", "excitement", "gratitude","love", "optimism", "relief", "pride", "admiration", "desire", "caring"]
sadness_list = ["sadness", "disappointment", "embarrassment", "grief", "remorse"]
surprise_list = ["surprise", "realization", "confusion", "curiosity"]

In [45]:
def label_emotion_group(row):
    '''
    Groupping Emotion Label:
    0 - Anger, 1 - Fear, 2- Joy, 
    3 - Sadness, 4 - Surprise, 5 - Neutral/Ambigous
    '''
    sentiment = [0]*6
    labels = row['label'].split(",")
    for l in labels:
        if l == '27': 
            return 5 # Neutral
        
        label = label_to_emotion[l]
        if label in anger_list:
            sentiment[0] += 1
        elif label in fear_list:
            sentiment[1] += 1
        elif label in joy_list:
            sentiment[2] += 1
        elif label in sadness_list:
            sentiment[3] += 1
        elif label in surprise_list:
            sentiment[4] += 1
        else:
            sentiment[5] += 1
    return np.argmax(np.array(sentiment))

In [46]:
def transformData(rdfTrain, rdfDev,rdfTest, n_categories = 2):
    
    dfTrain, dfDev, dfTest = rdfTrain, rdfDev, rdfTest
    if n_categories == 2:
        dfTrain["labels"] = dfTrain.apply(lambda row: label_neutral(row), axis = 1)
        dfDev["labels"] = dfDev.apply(lambda row: label_neutral(row), axis = 1)
        dfTest["labels"] = dfTest.apply(lambda row: label_neutral(row), axis = 1)
    
    if n_categories == 3:
        dfTrain["labels"] = dfTrain.apply(lambda row: label_pos_neg_neutral(row), axis = 1)
        dfDev["labels"] = dfDev.apply(lambda row: label_pos_neg_neutral(row), axis = 1)
        dfTest["labels"] = dfTest.apply(lambda row: label_pos_neg_neutral(row), axis = 1)
        
    if n_categories == 6:
        dfTrain["labels"] = dfTrain.apply(lambda row: label_emotion_group(row), axis = 1)
        dfDev["labels"] = dfDev.apply(lambda row: label_emotion_group(row), axis = 1)
        dfTest["labels"] = dfTest.apply(lambda row: label_emotion_group(row), axis = 1)
    
    print("Training distribution: ", dfTrain.labels.value_counts())
    print("Dev data distribution: ", dfDev.labels.value_counts())
    print("Test data distribution: ", dfTest.labels.value_counts())
        
    return dfTrain, dfDev, dfTest

In [15]:
dfTrain, dfDev, dfTest = transformData(rdfTrain, rdfDev, rdfTest, n_categories = 2)

Training distribution:  0    30587
1    12823
Name: labels, dtype: int64
Dev data distribution:  0    3834
1    1592
Name: labels, dtype: int64
Dev data distribution:  0    3834
1    1592
Name: labels, dtype: int64


In [27]:
dfTrain, dfDev, dfTest = transformData(rdfTrain, rdfDev, rdfTest, n_categories = 3)

Training distribution:  2    17021
1    16628
0     9761
Name: labels, dtype: int64
Dev data distribution:  1    2106
2    2096
0    1224
Name: labels, dtype: int64
Dev data distribution:  1    2106
2    2096
0    1224
Name: labels, dtype: int64


In [47]:
dfTrain, dfDev, dfTest = transformData(rdfTrain, rdfDev, rdfTest, n_categories = 6)

Training distribution:  2    16327
5    14219
0     5829
4     3888
3     2489
1      658
Name: labels, dtype: int64
Dev data distribution:  2    2067
5    1766
0     748
4     466
3     289
1      90
Name: labels, dtype: int64
Test data distribution:  2    1977
5    1787
0     777
4     494
3     304
1      88
Name: labels, dtype: int64


In [48]:
dfTrain = dfTrain[['text', 'labels']]
dfDev = dfDev[['text', 'labels']]
dfTest = dfTest[['text', 'labels']]

In [49]:
data_path = '../data/labelled_data/'
dfTrain.to_csv(data_path + 'train.tsv', sep='\t', header=False, index=False)
dfDev.to_csv(data_path + 'dev.tsv', sep='\t', header=False, index=False)
dfTest.to_csv(data_path + 'test.tsv', sep='\t', header=False, index=False)

#### Splitting data

In [19]:
def splitData(trainFeatures, devFeatures, dfTrain, dfDev):
    xTrain, yTrain = trainFeatures, dfTrain['labels']
    xDev, yDev = devFeatures, dfDev['labels']
    
    print("Dev : ", xDev.shape, yDev.shape)
    print("Train : ", xTrain.shape, yTrain.shape)
    
    return xTrain, yTrain, xDev, yDev

#### Feature Generation - TFID & Bag of words

In [20]:
def featureGeneration(dfTrain, dfDev, method = 'BOW'):
    if method == 'BOW':
        #tokenizer to remove unwanted elements from out data like symbols and numbers
#         token = RegexpTokenizer(r'[a-zA-Z0-9]+')
        token = TweetTokenizer(strip_handles=True, reduce_len=True)
        cv = CountVectorizer(lowercase=True, stop_words='english', ngram_range = (1,1), tokenizer = token.tokenize)
        trainFeatures = cv.fit_transform(dfTrain['text'])
        devFeatures = cv.transform(dfDev['text'])
    
    if method == 'TF-IDF':
        tf = TfidfVectorizer()
        trainFeatures = tf.fit_transform(dfTrain['text'])
        devFeatures = tf.transform(dfDev['text'])
        
    return trainFeatures, devFeatures

#### Modelling - Naive Bayes

In [21]:
def naiveBayes(xTrain, yTrain, xDev, yDev):
    # Model Generation Using Multinomial Naive Bayes
    clf = MultinomialNB().fit(xTrain, yTrain)
    predicted= clf.predict(xDev)
    print("MultinomialNB Accuracy:", metrics.accuracy_score(yDev, predicted))

In [34]:
def model(xTrain, yTrain, xDev, yDev, method = 'Naive Bayes'):
    
    num_classes = len(yTrain.unique())
    print("Num classes: ", num_classes)
    
    if method == 'Naive Bayes':
        clf = MultinomialNB()
    
    if method == 'Decision Trees':
        clf = DecisionTreeClassifier()

    if method == 'MLP':
        clf = MLPClassifier()
    
    if method == 'KNN':
        clf = KNeighborsClassifier(n_neighbors=5)
        
    clf = clf.fit(xTrain, yTrain)
    predicted= clf.predict(xDev)
    
#   Evaluation metrics

    Y_test = label_binarize(yDev, classes=[0, 1, 2])
    Y_score = label_binarize(predicted, classes=[0, 1, 2])
    
    
    target_names = [str(i) for i in range(num_classes)]
    print(classification_report(yDev, predicted, target_names=target_names))
    acc = metrics.accuracy_score(Y_test, Y_score)
    precision = metrics.precision_score(Y_test, Y_score, average='macro')
    recall = metrics.recall_score(Y_test, Y_score,average='macro' )
    roc_auc = metrics.roc_auc_score(Y_test, Y_score,average='macro')
    f1 = metrics.f1_score(Y_test, Y_score, average='macro')
    confusion_matrix = metrics.confusion_matrix(yDev, predicted)

    print(str(acc) + "\t" + str(precision) + "\t" + str(recall) + "\t" + str(f1) + "\t" + str(roc_auc))
    print(confusion_matrix)
        
    #     print("Accuracy:", acc)
    #     print("Precision:", precision)
    #     print("Recall:", recall)
    #     print("ROC AUC: ", roc_auc)
    #     print("f1-score:", f1)
    #     print("Confusion Matrix: \n", confusion_matrix)
    return clf

#### Testing

In [23]:
dfTrain, dfDev, dfTest = transformData(rdfTrain, rdfDev, rdfTest, n_categories = 3)

TypeError: transformData() missing 1 required positional argument: 'rdfTest'

In [29]:
trainFeatures, devFeatures = featureGeneration(dfTrain, dfDev, method = 'BOW')
xTrain, yTrain, xDev, yDev = splitData(trainFeatures, devFeatures, dfTrain, dfDev)

Dev :  (5426, 27748) (5426,)
Train :  (43410, 27748) (43410,)


In [32]:
clf = model(xTrain, yTrain, xDev, yDev, method = 'Naive Bayes')

Num classes:  3
              precision    recall  f1-score   support

           0       0.60      0.42      0.50      1224
           1       0.67      0.78      0.72      2106
           2       0.63      0.64      0.64      2096

    accuracy                           0.64      5426
   macro avg       0.64      0.61      0.62      5426
weighted avg       0.64      0.64      0.64      5426

0.6446737928492444	0.6350305783785551	0.6134100501585409	0.6175518343474856	0.7136099701859755
[[ 515  275  434]
 [ 107 1637  362]
 [ 232  518 1346]]


In [33]:
clf = model(xTrain, yTrain, xDev, yDev, method = 'Decision Trees')

Num classes:  3
              precision    recall  f1-score   support

           0       0.51      0.45      0.48      1224
           1       0.70      0.70      0.70      2106
           2       0.60      0.64      0.62      2096

    accuracy                           0.62      5426
   macro avg       0.60      0.60      0.60      5426
weighted avg       0.62      0.62      0.62      5426

0.6212679690379653	0.6034145840733003	0.5976367334447462	0.5997755174508467	0.7014164235538557
[[ 553  237  434]
 [ 172 1480  454]
 [ 362  396 1338]]


In [89]:
clf = model(xTrain, yTrain, xDev, yDev, method = 'MLP')

Num classes:  3
0.643199410246959	0.6273366377374335	0.6217137250353336	0.6237179140443364	0.7189175889999854
[[ 599  228  397]
 [ 162 1559  385]
 [ 325  439 1332]]




In [90]:
clf = model(xTrain, yTrain, xDev, yDev, method = 'KNN')

Num classes:  3
0.5366752672318467	0.5221607249073861	0.4971860890822623	0.49406711962069033	0.6275996334488428
[[ 308  227  689]
 [ 209 1078  819]
 [ 264  306 1526]]


##### Grouping Emotions:

In [99]:
dfTrain, dfDev = transformData(rdfTrain, rdfDev, n_categories = 6)

Training distribution:  2    16327
5    14219
0     5829
4     3888
3     2489
1      658
Name: class_label, dtype: int64
Dev data distribution:  2    2067
5    1766
0     748
4     466
3     289
1      90
Name: class_label, dtype: int64


In [100]:
trainFeatures, devFeatures = featureGeneration(dfTrain, dfDev, method = 'BOW')
xTrain, yTrain, xDev, yDev = splitData(trainFeatures, devFeatures, dfTrain, dfDev)

Dev :  (5426, 27748) (5426,)
Train :  (43410, 27748) (43410,)


In [101]:
clf = model(xTrain, yTrain, xDev, yDev, method = 'Naive Bayes')

Num classes:  6
0.6295613711758201	0.7142271939498975	0.37247120454667626	0.3604982793189578	0.6225228344504737
[[ 204    0  221    4    7  312]
 [  12    1   41    1    2   33]
 [  25    0 1723    2    6  311]
 [  13    0  136   28    0  112]
 [  18    0  160    1   54  233]
 [ 102    0  604    8   28 1024]]


In [102]:
clf = model(xTrain, yTrain, xDev, yDev, method = 'Decision Trees')

Num classes:  6
0.6420936232952451	0.5111299252808686	0.4358193046317352	0.4650373620850754	0.6711264486702068
[[ 248    7  148   29   43  273]
 [  12   24   22    3    5   24]
 [  89    7 1466   44   66  395]
 [  42    1   49  104    9   84]
 [  31    6   70    8  177  174]
 [ 202    9  365   50  146  994]]


In [103]:
clf = model(xTrain, yTrain, xDev, yDev, method = 'MLP')

Num classes:  6
0.6653151492812385	0.5637163698203279	0.5374462919412974	0.5501905521246677	0.7256516899486872
[[ 312    9  116   22   44  245]
 [   7   44   13    2    0   24]
 [ 110   10 1460   50   53  384]
 [  31    2   44  111   13   88]
 [  37    5   70   14  143  197]
 [ 202   14  322   52  144 1032]]




In [104]:
clf = model(xTrain, yTrain, xDev, yDev, method = 'KNN')

Num classes:  6
0.5903059343899743	0.4750822282588765	0.24938365987200614	0.29613766531485924	0.5877274797810733
[[ 133    2  143    7   21  442]
 [  11    3   14    1    4   57]
 [  90    0 1110   14   36  817]
 [  15    1   48   32   11  182]
 [  32    1   69    5   96  263]
 [ 127    0  271    9  108 1251]]


## Not working - Do not run

In [None]:
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO  
from IPython.display import Image  
import pydotplus

dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True, class_names=['0','1'])
#                 feature_names = feature_cols, 
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
# graph.write_png('diabetes.png')
Image(graph.create_png())

In [None]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree
from sklearn.datasets import load_wine
from IPython.display import SVG
from graphviz import Source
from IPython.display import display
# load dataset
data = load_wine()

# feature matrix
X = data.data

# target vector
y = data.target

# class labels
labels = data.feature_names

# print dataset description
print(data.DESCR)
estimator = DecisionTreeClassifier()
estimator.fit(X, y)

graph = Source(tree.export_graphviz(estimator, out_file=None
   , feature_names=labels, class_names=['0', '1', '2'] 
   , filled = True))
display(SVG(graph.pipe(format='svg')))