In [28]:
import pandas as pd
# import seaborn as sns
import numpy as np
# import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import sent_tokenize

# Bag of words
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer

# Naive Bayes
from sklearn.naive_bayes import MultinomialNB
# Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Multilayer Perceptron
from sklearn.neural_network import MLPClassifier

### Training data

In [2]:
# Folder that holds the TSV splits, relative to this notebook.
data_path = '../data/'

# header=None makes pandas read the first line as data, so the three
# column names are supplied explicitly.
rdfTrain = pd.read_csv(
    data_path + 'train.tsv',
    sep='\t',
    header=None,
    names=['text', 'label', 'id'],
)

# Display the first rows as a quick check on the parse.
rdfTrain.head()

Unnamed: 0,text,label,id
0,My favourite food is anything I didn't have to...,27,eebbqej
1,"Now if he does off himself, everyone will thin...",27,ed00q6i
2,WHY THE FUCK IS BAYLESS ISOING,2,eezlygj
3,To make her feel threatened,14,ed7ypvh
4,Dirty Southern Wankers,3,ed0bdzj


### Dev data

In [3]:
# Validation (dev) split, read with the same layout as the training file.
data_path = '../data/'

# header=None makes pandas read the first line as data, so the three
# column names are supplied explicitly.
rdfDev = pd.read_csv(
    data_path + 'dev.tsv',
    sep='\t',
    header=None,
    names=['text', 'label', 'id'],
)

# Print row count, dtypes and non-null counts.
rdfDev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5426 entries, 0 to 5425
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5426 non-null   object
 1   label   5426 non-null   object
 2   id      5426 non-null   object
dtypes: object(3)
memory usage: 127.3+ KB


#### Neutral/Non-neutral

In [4]:
def label_neutral(row):
    """Return 1 when the row's 'label' equals the string '27', otherwise 0.

    The comparison is against the string '27', so an integer 27 or a
    combined value such as '27,2' maps to 0.
    """
    is_neutral = row['label'] == '27'
    return 1 if is_neutral else 0

## 3 categories

In [None]:
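# TODO: implement label_pos_neg_neutral(row) for the 3-category labelling; transformData(..., n_categories = 3) calls it and fails until it is defined.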
    

In [5]:
def transformData(rdfTrain, rdfDev, n_categories = 2):
    """Add the target column ``neutral`` to copies of the train and dev frames.

    Parameters
    ----------
    rdfTrain, rdfDev : pandas.DataFrame
        Raw frames; each must contain a ``label`` column. They are left
        untouched: the target column is written to copies, so re-running the
        cell (or switching ``n_categories``) never sees a stale column.
    n_categories : int, default 2
        2 -> ``neutral`` is 1 where ``label`` equals the string '27', else 0.
        3 -> every row is labelled by ``label_pos_neg_neutral(row)``, which
             must already be defined in the notebook.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(dfTrain, dfDev)``, the copies carrying the new ``neutral`` column.

    Raises
    ------
    ValueError
        If ``n_categories`` is neither 2 nor 3.
    NotImplementedError
        If ``n_categories`` is 3 and ``label_pos_neg_neutral`` is not defined.
    """
    if n_categories not in (2, 3):
        raise ValueError(
            "n_categories must be 2 or 3, got %r" % (n_categories,)
        )

    labeler = None
    if n_categories == 3:
        # Resolve the helper up front so a missing definition is reported
        # clearly, before any work is done on the frames.
        try:
            labeler = label_pos_neg_neutral
        except NameError:
            raise NotImplementedError(
                "n_categories = 3 needs label_pos_neg_neutral(row), "
                "which is not defined"
            ) from None

    # Work on copies so the caller's raw frames are not mutated.
    dfTrain, dfDev = rdfTrain.copy(), rdfDev.copy()

    for frame in (dfTrain, dfDev):
        if n_categories == 2:
            # Vectorised form of the row-wise rule used by label_neutral.
            frame["neutral"] = (frame["label"] == '27').astype(int)
        else:
            frame["neutral"] = frame.apply(labeler, axis = 1)

    print("Training distribution: ", dfTrain.neutral.value_counts())
    print("Dev data distribution: ", dfDev.neutral.value_counts())

    return dfTrain, dfDev

#### Splitting data

In [6]:
def splitData(trainFeatures, devFeatures, dfTrain, dfDev):
    """Pair each feature matrix with the ``neutral`` column of its frame.

    Returns ``(xTrain, yTrain, xDev, yDev)``. The feature matrices are passed
    through unchanged; the targets are ``dfTrain['neutral']`` and
    ``dfDev['neutral']``.
    """
    yTrain = dfTrain['neutral']
    yDev = dfDev['neutral']
    return trainFeatures, yTrain, devFeatures, yDev

#### Feature Generation - TF-IDF & Bag of words

In [31]:
def featureGeneration(dfTrain, dfDev, method = 'BOW'):
    """Vectorise the ``text`` column of both frames with the chosen method.

    The vectoriser is fitted on ``dfTrain['text']`` only and then applied to
    ``dfDev['text']``, so both matrices share the training vocabulary.

    Parameters
    ----------
    dfTrain, dfDev : pandas.DataFrame
        Frames with a ``text`` column.
    method : {'BOW', 'TF-IDF'}, default 'BOW'
        'BOW'    -> CountVectorizer: lowercased unigrams, English stop words
                    removed, tokens matched by ``[a-zA-Z0-9]+``.
        'TF-IDF' -> TfidfVectorizer with its default settings.

    Returns
    -------
    tuple
        ``(trainFeatures, devFeatures)`` as returned by the vectoriser.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method == 'BOW':
        # Tokens are runs of ASCII letters and digits; punctuation and other
        # symbols are dropped (digits are kept).
        token = RegexpTokenizer(r'[a-zA-Z0-9]+')
        vectorizer = CountVectorizer(lowercase=True, stop_words='english', ngram_range = (1,1), tokenizer = token.tokenize)
    elif method == 'TF-IDF':
        vectorizer = TfidfVectorizer()
    else:
        # Previously an unknown name fell through both branches and failed at
        # the return statement with an UnboundLocalError.
        raise ValueError(
            "Unknown feature method %r; expected 'BOW' or 'TF-IDF'" % (method,)
        )

    trainFeatures = vectorizer.fit_transform(dfTrain['text'])
    devFeatures = vectorizer.transform(dfDev['text'])

    return trainFeatures, devFeatures

#### Modelling - Naive Bayes

In [8]:
def naiveBayes(xTrain, yTrain, xDev, yDev):
    """Fit a MultinomialNB on the training data and print its dev accuracy.

    Nothing is returned; the fitted classifier is discarded after scoring.
    """
    classifier = MultinomialNB()
    classifier.fit(xTrain, yTrain)

    dev_predictions = classifier.predict(xDev)
    dev_accuracy = metrics.accuracy_score(yDev, dev_predictions)
    print("MultinomialNB Accuracy:", dev_accuracy)

In [53]:
def model(xTrain, yTrain, xDev, yDev, method = 'Naive Bayes'):
    """Fit the chosen classifier on the training data and report dev metrics.

    Parameters
    ----------
    xTrain, yTrain : training features and targets.
    xDev, yDev : dev features and targets used for evaluation.
    method : {'Naive Bayes', 'Decision Trees', 'MLP'}, default 'Naive Bayes'
        Selects MultinomialNB, DecisionTreeClassifier or MLPClassifier, each
        built with its default arguments.

    Returns
    -------
    The fitted classifier.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.

    Output
    ------
    Prints one tab-separated line, ``accuracy, precision, recall, f1, roc_auc``,
    followed by the confusion matrix.

    Notes
    -----
    * ROC AUC is computed from the hard predictions returned by ``predict``,
      not from probability scores.
    * NOTE(review): precision/recall/f1 are called with their default
      arguments; confirm the averaging mode before using this with more than
      two classes.
    """
    if method == 'Naive Bayes':
        clf = MultinomialNB()
    elif method == 'Decision Trees':
        clf = DecisionTreeClassifier()
    elif method == 'MLP':
        clf = MLPClassifier()
    else:
        # TODO: an 'SVM' option was planned but is not implemented.
        # Previously an unknown name left `clf` unbound and failed below with
        # an UnboundLocalError.
        raise ValueError(
            "Unknown method %r; expected 'Naive Bayes', 'Decision Trees' or 'MLP'"
            % (method,)
        )

    clf = clf.fit(xTrain, yTrain)
    predicted = clf.predict(xDev)

    # Dev-set metrics.
    acc = metrics.accuracy_score(yDev, predicted)
    precision = metrics.precision_score(yDev, predicted)
    recall = metrics.recall_score(yDev, predicted)
    roc_auc = metrics.roc_auc_score(yDev, predicted)
    f1 = metrics.f1_score(yDev, predicted)
    confusion_matrix = metrics.confusion_matrix(yDev, predicted)

    # Tab-separated so the row can be pasted straight into a results table.
    print(str(acc) + "\t" + str(precision) + "\t" + str(recall) + "\t" + str(f1) + "\t" + str(roc_auc))
    print(confusion_matrix)

    return clf

#### Testing

In [None]:
# Derive the two-category `neutral` target on both splits.
dfTrain, dfDev = transformData(
    rdfTrain,
    rdfDev,
    n_categories=2,
)

In [32]:
# Bag-of-words features: fitted on the training text, applied to the dev text.
trainFeatures, devFeatures = featureGeneration(
    dfTrain,
    dfDev,
    method='BOW',
)

# Pair each feature matrix with its `neutral` target column.
xTrain, yTrain, xDev, yDev = splitData(
    trainFeatures,
    devFeatures,
    dfTrain,
    dfDev,
)

In [54]:
# Train on the training split and print the dev-set metrics row and confusion matrix.
clf = model(
    xTrain,
    yTrain,
    xDev,
    yDev,
    method='Naive Bayes',
)

0.7137854773313674	0.531810766721044	0.20477386934673367	0.29569160997732424	0.5649586613296005
[[3547  287]
 [1266  326]]


## Not working - Do not run

In [None]:
# Render the fitted decision tree held in `clf` as an inline PNG.
# Requires the `pydotplus` package.
from io import StringIO  # sklearn.externals.six is no longer shipped with recent scikit-learn

from IPython.display import Image
import pydotplus
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# This cell only makes sense for a tree; `clf` is whatever the last model(...)
# call returned, so fail with a clear message when it is another classifier.
if not isinstance(clf, DecisionTreeClassifier):
    raise TypeError(
        "export_graphviz needs a fitted DecisionTreeClassifier, got "
        + type(clf).__name__
        + "; re-run model(..., method = 'Decision Trees') first"
    )

# export_graphviz writes DOT source into the in-memory buffer.
# feature_names is not passed: no `feature_cols` list exists in this notebook.
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True, class_names=['0','1'])

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

In [None]:
# Stand-alone demo: fit a decision tree on the wine dataset and draw it as SVG.
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree
from sklearn.datasets import load_wine
from IPython.display import SVG, display
from graphviz import Source

# Load the dataset bundle.
data = load_wine()

# Feature matrix and target vector.
X = data.data
y = data.target

# Feature (column) names, used to label the split nodes.
labels = data.feature_names

# Print the dataset description.
print(data.DESCR)

# Fit a tree with default settings.
estimator = DecisionTreeClassifier()
estimator.fit(X, y)

# out_file=None makes export_graphviz return the DOT source as a string.
graph = Source(
    tree.export_graphviz(
        estimator,
        out_file=None,
        feature_names=labels,
        class_names=['0', '1', '2'],
        filled=True,
    )
)
display(SVG(graph.pipe(format='svg')))