In [1]:
import pandas as pd
# import seaborn as sns
import numpy as np
# import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import sent_tokenize

# Bag of words
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer

# Naive Bayes
from sklearn.naive_bayes import MultinomialNB
# Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

### Training data

In [2]:
# Training split: read as tab-separated with header=None, so the column
# names are supplied explicitly instead of being taken from the first row.
data_path = '../data/'
rdfTrain = pd.read_csv(
    data_path + 'train.tsv',
    sep='\t',
    header=None,
    names=['text', 'label', 'id'],
)
rdfTrain.head()

Unnamed: 0,text,label,id
0,My favourite food is anything I didn't have to...,27,eebbqej
1,"Now if he does off himself, everyone will thin...",27,ed00q6i
2,WHY THE FUCK IS BAYLESS ISOING,2,eezlygj
3,To make her feel threatened,14,ed7ypvh
4,Dirty Southern Wankers,3,ed0bdzj


### Dev data

In [3]:
# Validation (dev) split: same tab-separated layout and column names as the
# training file. data_path is re-assigned so this cell does not depend on
# the training cell having been run first.
data_path = '../data/'
rdfDev = pd.read_csv(
    data_path + 'dev.tsv',
    sep='\t',
    header=None,
    names=['text', 'label', 'id'],
)
rdfDev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5426 entries, 0 to 5425
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5426 non-null   object
 1   label   5426 non-null   object
 2   id      5426 non-null   object
dtypes: object(3)
memory usage: 127.3+ KB


#### Neutral/Non-neutral

In [4]:
def label_neutral(row):
    """Return 1 when the row's label is exactly ``27`` (neutral), else 0.

    The label is compared as text with surrounding whitespace removed, so
    the result is the same whether the ``label`` column was parsed as
    strings (``'27'``) or as integers (``27``). Any other value, including
    a combined value such as ``'2,27'``, yields 0.

    Parameters
    ----------
    row : mapping or pandas.Series
        Any object that can be indexed with ``'label'``.

    Returns
    -------
    int
        1 for the neutral label, 0 otherwise.
    """
    # str() makes the check independent of the dtype pandas inferred for
    # the column; a bare `== '27'` is always False for integer labels.
    return int(str(row['label']).strip() == '27')

In [5]:
def transformData(rdfTrain, rdfDev, n_categories = 2):
    """Derive the binary ``neutral`` target for the train and dev frames.

    A row gets ``neutral == 1`` when its ``label`` is exactly ``27`` and 0
    otherwise. Labels are compared as stripped text, so both string and
    integer label columns are handled. The class distribution of each
    split is printed.

    The input frames are not modified: new frames carrying the extra
    ``neutral`` column are returned, which keeps the cell safe to re-run.

    Parameters
    ----------
    rdfTrain, rdfDev : pandas.DataFrame
        Raw train / dev frames; each must have a ``label`` column.
    n_categories : int, default 2
        Number of target classes. Only the binary case (2) is implemented.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(dfTrain, dfDev)`` with the added ``neutral`` column.

    Raises
    ------
    ValueError
        If ``n_categories`` is not 2.
    """
    # Fail early with a clear message instead of reaching the return
    # statement with dfTrain / dfDev never assigned.
    if n_categories != 2:
        raise ValueError(
            "transformData only supports n_categories = 2, got %r" % (n_categories,)
        )

    def _with_neutral(frame):
        # Column-wise comparison instead of a row-wise apply; assign()
        # returns a new frame, leaving the caller's frame untouched.
        is_neutral = frame['label'].astype(str).str.strip() == '27'
        return frame.assign(neutral=is_neutral.astype('int64'))

    dfTrain = _with_neutral(rdfTrain)
    dfDev = _with_neutral(rdfDev)
    print("Training distribution: ", dfTrain.neutral.value_counts())
    print("Dev data distribution: ", dfDev.neutral.value_counts())

    return dfTrain, dfDev

#### Splitting data

In [6]:
def splitData(trainFeatures, devFeatures, dfTrain, dfDev):
    """Pair each feature matrix with the ``neutral`` target of its split.

    Parameters
    ----------
    trainFeatures, devFeatures
        Feature matrices for the train and dev splits; passed through
        unchanged.
    dfTrain, dfDev
        Frames (or any mapping) providing a ``'neutral'`` entry.

    Returns
    -------
    tuple
        ``(xTrain, yTrain, xDev, yDev)``.
    """
    trainTarget = dfTrain['neutral']
    devTarget = dfDev['neutral']
    return trainFeatures, trainTarget, devFeatures, devTarget

#### Feature Generation - TF-IDF & Bag of words

In [7]:
def featureGeneration(dfTrain, dfDev, method = 'CountVectorizer'):
    """Vectorize the ``text`` column of the train and dev frames.

    The vectorizer is fitted on the training text only and then applied to
    the dev text, so both matrices share the vocabulary learned from train.

    Parameters
    ----------
    dfTrain, dfDev : pandas.DataFrame
        Frames with a ``text`` column.
    method : {'CountVectorizer', 'TF-IDF'}, default 'CountVectorizer'
        ``'CountVectorizer'`` builds lower-cased unigram counts with English
        stop words removed; ``'TF-IDF'`` uses ``TfidfVectorizer`` with its
        default settings.

    Returns
    -------
    tuple
        ``(trainFeatures, devFeatures)``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method == 'CountVectorizer':
        # Tokens are runs of ASCII letters and digits, so symbols and
        # punctuation are dropped (digits are kept).
        token = RegexpTokenizer(r'[a-zA-Z0-9]+')
        vectorizer = CountVectorizer(lowercase=True, stop_words='english', ngram_range = (1,1), tokenizer = token.tokenize)
    elif method == 'TF-IDF':
        vectorizer = TfidfVectorizer()
    else:
        # Previously an unknown method fell through to the return statement
        # with the feature variables never assigned.
        raise ValueError(
            "Unknown feature method %r; expected 'CountVectorizer' or 'TF-IDF'" % (method,)
        )

    trainFeatures = vectorizer.fit_transform(dfTrain['text'])
    devFeatures = vectorizer.transform(dfDev['text'])

    return trainFeatures, devFeatures

#### Modelling - Naive Bayes

In [8]:
def naiveBayes(xTrain, yTrain, xDev, yDev):
    """Fit a multinomial Naive Bayes classifier and print its dev accuracy.

    The classifier is fitted on ``(xTrain, yTrain)``, used to predict
    ``xDev``, and the accuracy of those predictions against ``yDev`` is
    printed. Nothing is returned.
    """
    classifier = MultinomialNB()
    classifier.fit(xTrain, yTrain)
    devPredictions = classifier.predict(xDev)
    accuracy = metrics.accuracy_score(yDev, devPredictions)
    print("MultinomialNB Accuracy:", accuracy)

In [9]:
def model(xTrain, yTrain, xDev, yDev, method = 'NaiveBayes'):
    """Train and evaluate the model selected by ``method``.

    Parameters
    ----------
    xTrain, yTrain
        Training features and targets.
    xDev, yDev
        Dev features and targets used for evaluation.
    method : str, default 'NaiveBayes'
        Name of the model to run. Only ``'NaiveBayes'`` is implemented; it
        delegates to ``naiveBayes``, which prints the dev accuracy.

    Raises
    ------
    ValueError
        If ``method`` is not a supported model name.
    """
    if method == 'NaiveBayes':
        naiveBayes(xTrain, yTrain, xDev, yDev)
    else:
        # An unrecognised name used to be a silent no-op, which looks like a
        # successful run with no output; make the mistake visible instead.
        raise ValueError(
            "Unknown model method %r; expected 'NaiveBayes'" % (method,)
        )

#### Testing

In [10]:
# End-to-end run: add the binary neutral target, vectorize the text with
# the CountVectorizer option, pair features with targets, then fit and
# score the Naive Bayes model.
dfTrain, dfDev = transformData(rdfTrain, rdfDev, n_categories=2)
trainFeatures, devFeatures = featureGeneration(
    dfTrain, dfDev, method='CountVectorizer'
)
xTrain, yTrain, xDev, yDev = splitData(
    trainFeatures, devFeatures, dfTrain, dfDev
)
model(xTrain, yTrain, xDev, yDev, method='NaiveBayes')

Training distribution:  0    30587
1    12823
Name: neutral, dtype: int64
Dev data distribution:  0    3834
1    1592
Name: neutral, dtype: int64
MultinomialNB Accuracy: 0.7137854773313674
