In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import feature_extraction, linear_model,model_selection,preprocessing
import os
import scipy
import re                                  # library for regular expression operations
import string                              # for string operations
from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings
import matplotlib.pyplot as plt

# Functions

In [2]:
def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``.

    Works element-wise on NumPy arrays and also accepts plain scalars;
    the result has the same shape as the input.
    """
    return 1/(1+np.exp(-x))

In [3]:
def GradDesc(x,y,theta,alpha,reg_lambda = 0.001,num_iter = 100,plot = True):
    """Fit L2-regularised logistic regression with batch gradient descent.

    Parameters
    ----------
    x : array-like, shape (m, n)
        Feature matrix, one row per example.
    y : array-like with m elements
        Labels; flattened into an (m, 1) column so that a 1-D vector cannot
        broadcast against the (m, 1) predictions into an (m, m) matrix.
    theta : array-like with n elements
        Initial weights; reshaped to (n, 1).
    alpha : float
        Learning rate.
    reg_lambda : float, default 0.001
        Strength of the penalty ``reg_lambda * theta' theta / m`` that is
        added to the cross-entropy cost.
    num_iter : int, default 100
        Number of gradient steps.
    plot : bool, default True
        When true, plot the cost history with ``plt`` and show it.

    Returns
    -------
    J : float
        Cost measured just before the last update (or at the initial
        ``theta`` when ``num_iter`` is 0).
    theta : ndarray, shape (n, 1)
        Weights after ``num_iter`` updates.

    Raises
    ------
    ValueError
        If ``x`` has no rows.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float).reshape(-1, 1)
    theta = np.asarray(theta, dtype=float).reshape(-1, 1)

    m = x.shape[0]
    if m == 0:
        raise ValueError("GradDesc needs at least one training example")

    # Smallest positive normal float: keeps log() finite when the sigmoid
    # saturates to exactly 0 or 1, without touching any other value.
    tiny = np.finfo(float).tiny

    def cost(h, weights):
        log_h = np.log(np.clip(h, tiny, None))
        log_not_h = np.log(np.clip(1 - h, tiny, None))
        data_term = -(np.dot(y.T, log_h) + np.dot((1 - y).T, log_not_h)) / m
        penalty = reg_lambda * np.dot(weights.T, weights) / m
        # .item() instead of float(): converting a (1, 1) array with float()
        # is deprecated in recent NumPy releases.
        return (data_term + penalty).item()

    J = None
    J_y = []
    for _ in range(num_iter):
        h = sigmoid(np.dot(x, theta))
        J = cost(h, theta)
        J_y.append(J)
        # Gradient of the cost above. The penalty gradient is SUBTRACTED with
        # the rest of the step; adding it would push the weights away from
        # zero, which is the opposite of regularisation.
        grad = (np.dot(x.T, h - y) + 2 * reg_lambda * theta) / m
        theta = theta - alpha * grad

    if J is None:
        # No iterations requested: report the cost of the starting weights.
        J = cost(sigmoid(np.dot(x, theta)), theta)

    if plot:
        plt.plot(range(num_iter), np.array(J_y))
        plt.show()

    return J,theta

In [4]:
def PrepText(text):
    """Turn a raw tweet into a list of cleaned, stemmed tokens.

    Processing order:
      1. drop newline characters;
      2. replace every non-ASCII character with ``?``;
      3. strip a leading old-style retweet marker ``RT``;
      4. remove ``http://`` / ``https://`` links;
      5. remove ``#`` signs (the hashtag word itself is kept);
      6. tokenize with ``TweetTokenizer`` (lower-cased, @handles stripped,
         elongated character runs shortened);
      7. drop English stop words and punctuation tokens;
      8. stem what is left with ``PorterStemmer``.

    Parameters
    ----------
    text : str
        The tweet text.

    Returns
    -------
    list of str
        Stemmed tokens in their original order.
    """
    text = text.replace('\n','')
    # Round-trip through ASCII to replace non-ASCII characters with '?'.
    # Decoding (rather than str() on the bytes object) avoids leaking the
    # bytes repr into the text: no trailing quote character and no
    # backslash-escaped quotes.
    text = text.encode('ascii','replace').decode('ascii')
    # remove old style retweet text "RT"
    text = re.sub(r'^RT[\s]+', '', text)
    # remove hyperlinks
    text = re.sub(r'https?://[^\s\n\r]+', '', text)
    # only remove the hash # sign from the word, keep the word
    text = re.sub(r'#', '', text)

    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(text)

    # A set makes each stop-word membership test constant time.
    stopwords_english = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    # `word not in string.punctuation` is a substring test against the
    # punctuation string, so single punctuation characters are dropped.
    return [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]
    

In [5]:
def BuildFreqs(tweets, y):
    """Count how often each processed word occurs under each label.

    Parameters
    ----------
    tweets : iterable of str
        Raw tweet texts; each one is tokenized with ``PrepText``.
    y : array-like
        One label per tweet, in any shape; it is flattened before pairing
        with ``tweets``.

    Returns
    -------
    dict
        Maps ``(word, label)`` to the number of occurrences of ``word`` in
        tweets carrying ``label``.
    """
    # ravel (not squeeze) so that a single label still yields a one-element
    # list; squeeze would collapse it to a bare scalar that zip cannot iterate.
    labels = np.ravel(y).tolist()

    freqs = {}
    for label, tweet in zip(labels, tweets):
        for word in PrepText(tweet):
            pair = (word, label)
            freqs[pair] = freqs.get(pair, 0) + 1

    return freqs

In [6]:
def ExtFeats(tweet, freqs, PrepText=PrepText):
    """Build the 1x3 feature row for one tweet.

    Parameters
    ----------
    tweet : str
        Raw tweet text.
    freqs : dict
        Maps ``(word, label)`` to a count, as produced by ``BuildFreqs``.
    PrepText : callable, optional
        Tokenizer applied to ``tweet``; defaults to the module's ``PrepText``.

    Returns
    -------
    ndarray, shape (1, 3)
        ``[1, sum of counts under label 1, sum of counts under label 0]``
        over the tweet's tokens. The leading 1 is the bias term.
    """
    x = np.zeros((1, 3))
    x[0,0] = 1 # Default Bias

    for word in PrepText(tweet):
        # Look each label up independently with a default of 0. A word that
        # is missing under label 1 must still contribute its label-0 count.
        # Keys use 1.0 / 0.0; integer-labelled keys match as well because
        # 1 == 1.0 and both hash alike.
        x[0,1] += freqs.get((word,1.0), 0)
        x[0,2] += freqs.get((word,0.0), 0)

    return x

In [7]:
def PredictProba(tweet, freqs, theta):
    """Score one tweet with the fitted weights.

    The tweet is converted to a feature row by ``ExtFeats(tweet, freqs)``;
    the dot product of that row with ``theta`` is passed through ``sigmoid``
    and returned.
    """
    features = ExtFeats(tweet,freqs)
    logit = np.dot(features,theta)
    return sigmoid(logit)

# Data Development

In [8]:
# Load the labelled development data and show its (rows, columns) shape.
train_csv_path = r'/kaggle/input/nlp-getting-started/train.csv'
Dev = pd.read_csv(train_csv_path, encoding='utf-8')
Dev.shape

In [9]:
# Class balance: number of tweets with target == 1 versus the rest.
n_positive = Dev.target.sum()
n_negative = Dev.target.count() - n_positive
cookies = np.array([n_positive, n_negative])
plt.pie(cookies, labels=['positive','negative'])
plt.show()

In [10]:
# Hold out 30% of the labelled tweets for evaluation (seeded for repeatability).
Dev_X = Dev[['text']]
Dev_Y = Dev[['target']]
Train_X, Test_X, Train_y, Test_y = model_selection.train_test_split(
    Dev_X, Dev_Y, test_size=0.3, random_state=1
)

# Texts become plain Python lists; labels become (n, 1) NumPy arrays.
Train_X = Train_X['text'].tolist()
Test_X = Test_X['text'].tolist()
Train_y = Train_y.to_numpy()
Test_y = Test_y.to_numpy()

print("Shape Of The Train Data: ",len(Train_X)," Shape Of The Test Data: ",len(Test_X))

# Creating A Dictionary With Frequencies  

In [11]:
# (word, label) -> count table, built from the training split only.
freqs = BuildFreqs(tweets=Train_X, y=Train_y)

# Training The Model

In [12]:
# One feature row per training tweet, as returned by ExtFeats.
X = np.zeros((len(Train_X), 3))
for row, tweet_text in enumerate(Train_X):
    X[row, :] = ExtFeats(tweet_text, freqs)

Y = Train_y

# Start from all-zero weights and run gradient descent without regularisation.
initial_theta = np.zeros((3, 1))
J, theta = GradDesc(X, Y, theta=initial_theta, alpha=1e-9, reg_lambda=0, num_iter=5000)

# Prediction

In [13]:
# Threshold each held-out tweet's score at 0.5 to get a hard 1.0 / 0.0 label.
y_hat = []
for tweet in Test_X:
    y_pred = PredictProba(tweet, freqs, theta)
    y_hat.append(1.0 if y_pred > 0.5 else 0.0)

In [14]:
# Element-wise match between predictions and the held-out labels;
# Test_y.T is (1, n), so the comparison result is (1, n) as well.
a = Test_y.T == np.array(y_hat)
matches = a[0]
accuracy = matches.sum()/matches.size
print("Accuracy of The Model Is : ",np.round(accuracy*100,2), "%")

# Model For Submission

In [15]:
# Tweets to be scored for the submission file.
test_csv_path = r'/kaggle/input/nlp-getting-started/test.csv'
submission_data = pd.read_csv(test_csv_path, encoding='utf-8')

In [16]:
# Plain Python list of the submission tweets' text.
submission_data_X = submission_data['text'].tolist()

In [17]:
# For the submission model, use every labelled tweet (no hold-out split).
Dev_X = Dev['text'].tolist()
Dev_Y = Dev[['target']].to_numpy()   # (n, 1) label column

freqs_depl = BuildFreqs(tweets=Dev_X, y=Dev_Y)

In [18]:
# Refit on all labelled tweets, then label the submission tweets.
X_depl = np.zeros((len(Dev_X), 3))
for row, tweet_text in enumerate(Dev_X):
    X_depl[row, :] = ExtFeats(tweet_text, freqs_depl)

Y_depl = Dev_Y

initial_theta_depl = np.zeros((3, 1))
J_depl, theta_depl = GradDesc(X_depl, Y_depl, theta=initial_theta_depl, alpha=1e-9, reg_lambda=0, num_iter=5000)

# Same 0.5 threshold as used on the hold-out split.
y_hat_depl = []
for tweet in submission_data_X:
    y_pred = PredictProba(tweet, freqs_depl, theta_depl)
    y_hat_depl.append(1.0 if y_pred > 0.5 else 0.0)

In [20]:
# Attach the predicted labels as an integer `target` column and write the
# two columns the submission file needs.
predicted_target = pd.DataFrame({'target': y_hat_depl})
sub_data = pd.concat([submission_data, predicted_target], axis=1)
sub_data['target'] = sub_data['target'].astype('int64')
sub_data[['id','target']].to_csv('submission.csv', index=False)