## Adding the directory of the package to the system path

In [1]:
# importing only the helper that is needed instead of `from config import *`,
# so it is explicit where `append_path` comes from and the notebook namespace
# is not filled with unrelated names
from config import append_path

# adding the package directory (two levels up) to the system path
append_path('../../')

## Importing data science libraries

In [2]:
from pathlib import Path

import matplotlib
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

from m_learn.utility.model_selection import k_fold_cross_validation
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

## Reading the data

In [3]:
# Both files are tab-separated. Column names are passed explicitly, so the
# first line of each file is read as data rather than as a header.
# The training file carries a label column followed by the text.
train_data = pd.read_csv(
    "../../data/sentiment_train.txt",
    sep='\t',
    names=['sentiment', 'text'],
)
# The test file only carries the text to be classified.
test_data = pd.read_csv(
    "../../data/sentiment_test.txt",
    sep='\t',
    names=['text'],
)

## Feature Engineering

In [4]:
# extracting the English stop words; kept as a set so that the per-word
# membership test in text_preprocessing is a hash lookup instead of a scan
# over the whole list
sw = set(stopwords.words('english'))
# creating a stemmer object
stemmer = SnowballStemmer("english")

In [5]:
def text_preprocessing(text):

    '''Clean a single piece of text for vectorization.

    The input is converted to `str`, stripped of every character in
    `string.punctuation`, split on whitespace, lower-cased, filtered
    against the notebook-level stop word collection `sw` and stemmed with
    the notebook-level `stemmer`.

    Returns the surviving stems joined by single spaces.
    '''

    import string

    # drop all punctuation characters in one translate pass
    without_punctuation = str(text).translate(
        str.maketrans('', '', string.punctuation)
    )

    # lower-case each token and keep it only when it is not a stop word
    kept_tokens = []
    for token in without_punctuation.split():
        lowered = token.lower()
        if lowered not in sw:
            kept_tokens.append(lowered)

    # stem the remaining tokens and rebuild the text
    return " ".join(stemmer.stem(token) for token in kept_tokens)

In [6]:
# cleaning the raw text of the train set
train_data['text'] = train_data['text'].map(text_preprocessing)
# applying the same cleaning to the test set
test_data['text'] = test_data['text'].map(text_preprocessing)
# binarizing the sentiment field of the train set: 1 stays 1, every other value becomes -1
train_data['sentiment'] = train_data['sentiment'].map(lambda label: 1 if label == 1 else -1)

## Extracting the tf-idf representation of the formatted message

In [7]:
# creating the tf-idf vectorizer
# NOTE: this used to be TfidfVectorizer("english"). The first positional
# parameter of TfidfVectorizer is `input`, not `stop_words`, so the string
# never enabled stop-word filtering, and scikit-learn releases that make the
# constructor arguments keyword-only raise a TypeError on it. Stop words are
# already removed in text_preprocessing, so the default configuration is used.
tfid_vectorizer = TfidfVectorizer()
# learning the vocabulary / idf weights from the train text and extracting
# the tf-idf representation matrix of the train set in one step
tfid_matrix_train = tfid_vectorizer.fit_transform(train_data['text'])
# extracting the tf-idf representation matrix of the test set using the
# vocabulary learned from the train set
tfid_matrix_test = tfid_vectorizer.transform(test_data['text'])

In [8]:
# collecting the tf-idf matrices in pandas dataframes
# `toarray()` returns a plain ndarray, whereas `todense()` returns the legacy
# np.matrix type; the resulting dataframes hold the same values
train_df = pd.DataFrame(tfid_matrix_train.toarray())
test_df = pd.DataFrame(tfid_matrix_test.toarray())

In [9]:
# adding the output to the training dataframe
# (the `sentiment` Series was binarized to 1 / -1 in an earlier cell; on
# assignment pandas aligns it to train_df by index label)
train_df['output'] = train_data['sentiment']

## Training the best classifier

In [10]:
# name of the target column
output = 'output'
# every column of the training dataframe except the target is a feature
features = list(train_df.columns)
features.remove(output)

In [11]:
# building a Bernoulli naive Bayes classifier with its default settings
model = BernoulliNB()
# fitting it on the feature columns against the target column of the train set
model.fit(X=train_df[features], y=train_df[output])

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [12]:
# predicting a label for every row of the test set, using the same feature columns as in training
predictions = model.predict(X=test_df[features])

In [13]:
# wrapping the predicted labels in a named Series that serves as the submission
submission = pd.Series(data=predictions, name='prediction')

In [14]:
# previewing the first ten predicted labels
submission.head(10)

0   -1
1   -1
2    1
3    1
4    1
5   -1
6    1
7    1
8   -1
9    1
Name: prediction, dtype: int64

In [15]:
# saving the submission csv
# `to_csv` does not create missing parent directories, so the target folder
# is created first (a no-op when it already exists)
submission_path = Path('./submission/submission.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False, header=True)