In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import feature_extraction, linear_model,model_selection,preprocessing
import os
import scipy
import re                                  # library for regular expression operations
import string                              # for string operations
from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings
import matplotlib.pyplot as plt

In [3]:
def PrepText(text):
    """Clean a raw tweet and return its list of stemmed tokens.

    Steps, in order: line breaks become spaces, a leading "RT" marker and
    any http(s) links are removed, '#' signs are dropped (the hashtag word
    is kept), the text is tokenized with NLTK's TweetTokenizer, English
    stopwords and punctuation tokens are filtered out, and each remaining
    token is stemmed with the Porter stemmer.

    Args:
        text: the tweet as a single string.

    Returns:
        A list of stemmed token strings (possibly empty).
    """
    # Replace line breaks with a space (not ''), otherwise the last word of
    # one line is fused with the first word of the next, and a URL at the end
    # of a line swallows the following word. Strip so that a leading line
    # break does not hide the "RT" marker from the anchored regex below.
    text = text.replace('\n', ' ').strip()
    # remove old style retweet text "RT"
    text = re.sub(r'^RT[\s]+', '', text)
    # remove hyperlinks
    text = re.sub(r'https?://[^\s\n\r]+', '', text)
    # remove only the hash # sign, keeping the hashtag word itself
    text = re.sub(r'#', '', text)

    # tokenize the tweet
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                                   reduce_len=True)
    tweet_tokens = tokenizer.tokenize(text)

    # A set gives constant-time membership tests for every token.
    stopwords_english = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    # `string.punctuation` is a str, so the second test is a substring check:
    # it drops single punctuation characters (and any token that happens to
    # be a contiguous run of that string).
    return [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]


In [4]:
def BuildFreqs(tweets, y):
    """Count how often each processed word occurs under each label.

    Args:
        tweets: iterable of raw tweet strings.
        y: labels aligned with `tweets`; any array-like that flattens to one
           label per tweet (e.g. an (n,) or (n, 1) array, or a list).

    Returns:
        dict mapping (word, label) -> number of occurrences, where `word`
        is a token produced by PrepText.
    """
    # ravel (instead of squeeze) keeps a one-element label array as a
    # one-item list; squeeze would collapse it to a bare scalar that zip
    # cannot iterate over.
    labels = np.ravel(y).tolist()

    freqs = {}
    for label, tweet in zip(labels, tweets):
        for word in PrepText(tweet):
            pair = (word, label)
            freqs[pair] = freqs.get(pair, 0) + 1

    return freqs

In [5]:
def ExtFeats(tweet, freqs, PrepText=PrepText):
    """Build the 1x3 feature row [bias, positive count, negative count].

    Args:
        tweet: raw tweet string.
        freqs: dict mapping (word, label) -> count.
        PrepText: callable turning the tweet into a list of tokens.

    Returns:
        numpy array of shape (1, 3): column 0 is the constant bias 1,
        column 1 sums freqs[(word, 1.0)] and column 2 sums
        freqs[(word, 0.0)] over the tweet's tokens; missing pairs count 0.
    """
    word_l = PrepText(tweet)
    x = np.zeros((1, 3))
    x[0, 0] = 1  # Default Bias

    for word in word_l:
        # Look the two labels up independently: a word missing from the
        # positive counts must still contribute its negative count.
        x[0, 1] += freqs.get((word, 1.0), 0)
        x[0, 2] += freqs.get((word, 0.0), 0)

    return x

In [6]:
def LookUp(freqs, word, label):
    """Return the count stored for (word, label) in `freqs`, or 0 if the pair is absent."""
    return freqs.get((word, label), 0)

In [7]:
def NaiveBayes(freqs,X,Y):
    """Fit the Naive Bayes parameters from a (word, label) frequency table.

    Args:
        freqs: dict mapping (word, label) -> count; labels > 0 are treated
               as the positive class, everything else as negative.
        X: unused; kept so existing calls keep working.
        Y: labels of the training documents (1 = positive, 0 = negative).
           Any array-like is accepted (list, (n,) or (n, 1) array).

    Returns:
        (logprior, loglikelihood) where logprior is log(D_pos / D_neg) and
        loglikelihood maps each vocabulary word to
        log P(word | positive) - log P(word | negative), both estimated
        with add-one (Laplace) smoothing.
    """
    vocab = {word for word, _ in freqs}
    V = len(vocab)

    # Total number of word occurrences in each class.
    N_pos = N_neg = 0
    for (_, label), count in freqs.items():
        if label > 0:
            N_pos += count
        else:
            N_neg += count

    # Coerce to an array so plain Python lists work as well as ndarrays.
    labels = np.asarray(Y)
    D = len(labels)
    D_pos = labels.sum()
    D_neg = D - D_pos
    # If one class is absent this evaluates to +/-inf (numpy warns).
    logprior = np.log(D_pos / D_neg)

    loglikelihood = {}
    for word in vocab:
        # per-class frequency of the word (0 when never seen in that class)
        freq_pos = freqs.get((word, 1), 0)
        freq_neg = freqs.get((word, 0), 0)

        # add-one smoothed conditional probabilities
        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)

        loglikelihood[word] = np.log(p_w_pos) - np.log(p_w_neg)

    return logprior, loglikelihood
  

In [8]:
def NaiveBayesPredict(tweet, logprior, loglikelihood):
    """Score a tweet with the fitted Naive Bayes parameters.

    The score is `logprior` plus the log-likelihood of every token of the
    processed tweet that is present in `loglikelihood`; tokens without an
    entry contribute nothing.
    """
    known_scores = (
        loglikelihood[token]
        for token in PrepText(tweet)
        if token in loglikelihood
    )
    # The running total starts at 0 + logprior and adds the scores in order.
    return sum(known_scores, 0 + logprior)

# Importing And Splitting The Data

In [19]:
# Parse the labelled training file once, then split it into the tweet text
# (features) and the target (labels).
dev_df = pd.read_csv('../input/nlp-getting-started/train.csv')
Dev_X = dev_df[['text']].reset_index(drop = True)
Dev_Y = dev_df[['target']].reset_index(drop = True)
# Plain-list / ndarray views used when fitting on the full labelled set.
Dev_X_l = list(Dev_X.iloc[:,0])
Dev_Y_l = Dev_Y.to_numpy()

In [10]:
# Visualising Positive and Negative Tweets:
# Reduce the 'target' column to scalars before converting; calling int() on
# the one-element Series returned by DataFrame.sum() is deprecated in pandas.
a = int(Dev_Y['target'].sum())
b = int(Dev_Y['target'].count() - Dev_Y['target'].sum())
cookies = np.array([a,b])
plt.pie(cookies,labels = ['positive','negative'])
plt.show()

In [11]:
# Hold out 20% of the labelled data (fixed seed) and renumber each part from 0.
Train_X, Test_X, Train_y, Test_y = (
    part.reset_index(drop = True)
    for part in model_selection.train_test_split(
        Dev_X, Dev_Y, test_size = 0.2, random_state = 1
    )
)

# Tweets become plain lists of strings; labels become numpy arrays.
Train_X = Train_X.iloc[:, 0].tolist()
Test_X = Test_X.iloc[:, 0].tolist()
Train_y = Train_y.to_numpy()
Test_y = Test_y.to_numpy()

print("Shape Of The Train Data: ",len(Train_X)," Shape Of The Test Data: ",len(Test_X))

# Generate Word Frequencies

In [12]:
# Count (word, label) occurrences over the training split only.
freqs = BuildFreqs(tweets=Train_X, y=Train_y)

# Training The Model

In [13]:
# Fit the log prior and the per-word log-likelihoods on the training split.
logprior, loglikelihood = NaiveBayes(freqs=freqs, X=Train_X, Y=Train_y)

# Predict

In [14]:
# Classify every held-out tweet: a score above 0 means class 1, otherwise 0.
y_hats = [
    1 if NaiveBayesPredict(tweet, logprior, loglikelihood) > 0 else 0
    for tweet in Test_X
]

# Flatten the labels before comparing. If Test_y is a column vector of shape
# (n, 1), subtracting it from the n predictions broadcasts to an (n, n)
# matrix, and the mean would then run over every prediction/label
# combination instead of the matching pairs.
error = np.mean(np.absolute(np.asarray(y_hats) - np.ravel(Test_y)))

# Accuracy is 1 minus the error
accuracy = 1-error
print("Accuracy Is: ",accuracy)

# Prepare The Submission

In [41]:
# Tweets of the unlabelled competition test set, as a plain list of strings.
deploy_X = pd.read_csv('../input/nlp-getting-started/test.csv')['text'].tolist()

In [44]:
# Refit on the full labelled set before predicting the competition data.
# NOTE: this overwrites `freqs`, `logprior` and `loglikelihood` from the
# hold-out experiment, so re-running the accuracy cell after this one would
# evaluate a model that has already seen the held-out tweets.
freqs = BuildFreqs(Dev_X_l,Dev_Y_l)

logprior, loglikelihood = NaiveBayes(freqs, Dev_X_l, Dev_Y_l)

# In-sample predictions on the labelled data (score > 0 -> class 1, else 0).
y_hats = [
    1 if NaiveBayesPredict(tweet, logprior, loglikelihood) > 0 else 0
    for tweet in Dev_X_l
]

# Predictions for the unlabelled tweets, using the same decision rule.
y_hats_depl = [
    1 if NaiveBayesPredict(tweet, logprior, loglikelihood) > 0 else 0
    for tweet in deploy_X
]

In [50]:
# Pair every test id with its predicted target and write the submission file.
submission_ids = pd.read_csv('../input/nlp-getting-started/test.csv')[['id']]
predicted_targets = pd.DataFrame({'target': y_hats_depl})
res = pd.concat([submission_ids, predicted_targets], axis = 1)
res.to_csv('submission.csv',index = False)