In [93]:
import pandas as pd
import re
import warnings
import numpy as np
from matplotlib import pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from torch import nn
import torch
from torch.utils.data import DataLoader
from torch.utils.data import Dataset, DataLoader

In [94]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /Users/kishore-
[nltk_data]     pt5635/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [95]:
df = pd.read_csv("TwitterSentiment.csv", encoding="latin", header=None)

In [96]:
df.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [5]:
df = df.sample(frac=0.002, ignore_index=True)
len(df)

3200

In [97]:
df.columns = ["sentiment", "tweetid", "time", "query", "userid", "tweet"]
df = df.drop(["tweetid", "time", "query", "userid"], axis=1)
df.head()

Unnamed: 0,sentiment,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [98]:
df.sentiment.unique()

array([0, 4])

In [99]:
labels = {0: 0, 4: 1}
def label_encoder(label):
    return labels[label]

df.sentiment = df.sentiment.apply(lambda x : label_encoder(x))

In [100]:
df.head()

Unnamed: 0,sentiment,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [101]:
import random as rand
random_list = [rand.randint(0, len(df["sentiment"])) for _ in range(10)]

df.loc[random_list,:].head(10)

Unnamed: 0,sentiment,tweet
994674,1,@spencerpratt i cannot stand you on 'The Hills...
1128256,1,Daughter's bday party yield $100+ for March of...
1098864,1,Tweetsound test - http://u.nu/74b8 #sndtwt
113994,0,Good morning everyone. It seems as though we h...
1065612,1,@socilover what's doing.. long time till you ...
555453,0,http://bit.ly/kovTh just sitting here waiting ...
1247816,1,I'm super glad to be back in Orlando...hope yo...
724616,0,@ProvenLoyalty lol this is true...hm wonder if...
648000,0,"So decide to go outside and read, and it start..."
961672,1,@Yahzarah Dang...I need a teleporter LOL Happy...


In [102]:
# Defining Regex patterns

url_pattern = "http:\S+|https:\S+|www\.\S+"
user_name_pattern = "@\S+"
non_alphabets_pattern = "[^A-Za-z]"
sequence_alphabets = f"(.)\1\1+"
replace_sequence = f"\1\1"

In [103]:
stop_words = set(stopwords.words("english"))
wordlemm = WordNetLemmatizer()
wordstemm = SnowballStemmer(language="english")

def text_preprocessing(text, lemmatize):
    tokens = []
    text = text.lower()

    text = re.sub(url_pattern, " URL", text)
    text = re.sub(user_name_pattern, " USER", text)
    text = re.sub(sequence_alphabets, replace_sequence, text)

    for word in text.split():
        if word not in stop_words:
            if lemmatize == True:
                word = wordlemm.lemmatize(word)
                tokens.append(word)
            else:
                word = wordstemm.stem(word)
                tokens.append(word)

    text =  " ".join(tokens)    
    text = re.sub(non_alphabets_pattern, " ", text)
    return text

In [104]:
def batch_preprocessing(batch_size, lemmatize=False):
    print("Stemming started") if lemmatize is False else print("Lemmatization started")
    warnings.filterwarnings("ignore")
    batch = int(len(df) / batch_size)

    batch_start = 0
    batch_end = batch
    preprocessed_df = []

    for i in range(batch_size):
        batch_data = df.iloc[batch_start : batch_end - 1]
        batch_start = batch_end
        batch_end = batch * (i + 2)
        batch_data["tweet"] = batch_data["tweet"].apply(lambda text : text_preprocessing(text, lemmatize))
        preprocessed_df.append(batch_data)
        print(f"Batch {i + 1} Completed !!")

    preprocessed_df = pd.concat(preprocessed_df)
    return preprocessed_df

In [105]:
%%time
stemmed_df = batch_preprocessing(20)

Stemming started
Batch 1 Completed !!
Batch 2 Completed !!
Batch 3 Completed !!
Batch 4 Completed !!
Batch 5 Completed !!
Batch 6 Completed !!
Batch 7 Completed !!
Batch 8 Completed !!
Batch 9 Completed !!
Batch 10 Completed !!
Batch 11 Completed !!
Batch 12 Completed !!
Batch 13 Completed !!
Batch 14 Completed !!
Batch 15 Completed !!
Batch 16 Completed !!
Batch 17 Completed !!
Batch 18 Completed !!
Batch 19 Completed !!
Batch 20 Completed !!
CPU times: user 1min 18s, sys: 2.02 s, total: 1min 20s
Wall time: 1min 20s


In [106]:
%%time
lemmatized_df = batch_preprocessing(20, True)

Lemmatization started
Batch 1 Completed !!
Batch 2 Completed !!
Batch 3 Completed !!
Batch 4 Completed !!
Batch 5 Completed !!
Batch 6 Completed !!
Batch 7 Completed !!
Batch 8 Completed !!
Batch 9 Completed !!
Batch 10 Completed !!
Batch 11 Completed !!
Batch 12 Completed !!
Batch 13 Completed !!
Batch 14 Completed !!
Batch 15 Completed !!
Batch 16 Completed !!
Batch 17 Completed !!
Batch 18 Completed !!
Batch 19 Completed !!
Batch 20 Completed !!
CPU times: user 28.6 s, sys: 856 ms, total: 29.4 s
Wall time: 29.6 s


In [107]:
stemmed_df.head()

Unnamed: 0,sentiment,tweet
0,0,user url awww that bummer shoulda got davi...
1,0,upset can t updat facebook text it might cr...
2,0,user dive mani time ball manag save rest ...
3,0,whole bodi feel itchi like fire
4,0,user no behav all i m mad here can t see t...


In [108]:
lemmatized_df.head()

Unnamed: 0,sentiment,tweet
0,0,USER URL awww that s bummer shoulda got da...
1,0,upset can t update facebook texting it migh...
2,0,USER dived many time ball managed save re...
3,0,whole body feel itchy like fire
4,0,USER no behaving all i m mad here can t se...


In [109]:
Neg_tweet_count = (stemmed_df["sentiment"] == 0).sum().item()
Pos_tweet_count = (stemmed_df["sentiment"] == 1).sum().item()
print("Negative Tweet Count :: ",Neg_tweet_count)
print("Positive Tweet Count :: ",Pos_tweet_count)


Negative Tweet Count ::  799990
Positive Tweet Count ::  799990


In [110]:
print(stemmed_df.shape)
print(lemmatized_df.shape)

(1599980, 2)
(1599980, 2)


TF - IDF For Lemmatized Data

In [111]:
lemmatized_vectorizer = TfidfVectorizer(analyzer="word", stop_words="english", max_features=1000)
X_lemmatized = lemmatized_vectorizer.fit_transform(lemmatized_df["tweet"])

In [112]:
features = X_lemmatized.toarray()

In [113]:
target = lemmatized_df["sentiment"].values.reshape(-1, 1)

In [141]:
print(features.shape)
print(target.shape)

(1599980, 1000)
(1599980, 1)


Train Test Split

In [115]:
test_size = 0.2
random_state = 40
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = test_size, random_state = random_state)

In [116]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1279984, 1000), (319996, 1000), (1279984, 1), (319996, 1))

Create DataLoader

In [117]:
class CustomDataset(Dataset):
    def __init__(self, features, target):
        self.features = features
        self.target = target

    def __len__(self):
        return len(self.features)

    def __getitem__(self, index):
        feature = self.features[index]
        target = self.target[index]

        feature = torch.tensor(feature, dtype=torch.float32)
        target = torch.tensor(target, dtype=torch.float32)

        return feature, target

In [118]:
trainset = CustomDataset(X_train, y_train)
testset = CustomDataset(X_test, y_test)

In [119]:
batch_size = 30
trainLoader = DataLoader(trainset, batch_size=batch_size, shuffle=True)
testLoader = DataLoader(testset, batch_size=batch_size, shuffle=True)

Construct a Network

In [134]:
class Network(nn.Module):

    def __init__(self, inp_size, out_size, layers):
        super().__init__()

        self.layer_list = []

        for layer in layers:
            self.layer_list.append(nn.Linear(inp_size, layer))
            self.layer_list.append(nn.ReLU())
            inp_size = layer

        self.layer_list.append(nn.Linear(layers[-1], out_size))
        self.layer_list.append(nn.Sigmoid())
    
        self.Net = nn.Sequential(*self.layer_list)

    def forward(self, x):
        out = self.Net(x)
        return out

In [135]:
# Defining the accuracy function
def compute_accuracy(y_pred, y_actual):
    predicted_labels = (y_pred >= 0.5).float()
    correct_predictions = (predicted_labels == y_actual).sum().item()
    total_predictions = y_actual.size(0)
    accuracy = correct_predictions / total_predictions
    return accuracy

In [136]:
# initializing a model

torch.manual_seed(100)
Net = Network(X_train.shape[1], 2, [200, 150])
Net

Network(
  (Net): Sequential(
    (0): Linear(in_features=1000, out_features=200, bias=True)
    (1): ReLU()
    (2): Linear(in_features=200, out_features=150, bias=True)
    (3): ReLU()
    (4): Linear(in_features=150, out_features=2, bias=True)
    (5): Sigmoid()
  )
)

In [137]:
# Defining loss and optimizer function
loss_function = nn.BCELoss()
optimizer = torch.optim.SGD(Net.parameters(), lr=0.001, momentum=0.9)
scheduler = StepLR(optimizer, step_size=1, gamma=0.1)

In [138]:
Epochs = 10
Loss = []
Accuracy = []

for epoch in range(Epochs):

    epoch_loss = 0.0
    epoch_accuracy = 0.0
    for batchNumber, data in enumerate(trainLoader, 0):
        
        optimizer.zero_grad()
        y_pred = Net(data[0])
        y_pred = torch.max(y_pred, dim=1, keepdim=True)[0]
        loss = loss_function(y_pred, data[1])
        accuracy = compute_accuracy(y_pred, data[1])
        loss.backward()
        optimizer.step()

        epoch_loss += loss
        epoch_accuracy+= accuracy
    Loss.append(epoch_loss / batchNumber)
    Accuracy.append(epoch_accuracy / batchNumber)

    print(f"Epoch ===> {epoch + 1}, Loss ===> {(epoch_loss / batchNumber):.5f}, Accuracy ===> {(epoch_accuracy / batchNumber):.2%}")

Epoch ===> 1, Loss ===> 0.59698, Accuracy ===> 67.39%
Epoch ===> 2, Loss ===> 0.51456, Accuracy ===> 74.14%
Epoch ===> 3, Loss ===> 0.51030, Accuracy ===> 74.37%
Epoch ===> 4, Loss ===> 0.50750, Accuracy ===> 74.53%
Epoch ===> 5, Loss ===> 0.50490, Accuracy ===> 74.73%
Epoch ===> 6, Loss ===> 0.50263, Accuracy ===> 74.86%
Epoch ===> 7, Loss ===> 0.50050, Accuracy ===> 75.01%
Epoch ===> 8, Loss ===> 0.49836, Accuracy ===> 75.16%
Epoch ===> 9, Loss ===> 0.49630, Accuracy ===> 75.28%
Epoch ===> 10, Loss ===> 0.49421, Accuracy ===> 75.48%


In [139]:
epoch_test_loss = 0.0
epoch_test_accuracy = 0.0

for batchNumber, data in enumerate(testLoader, 0):

    y_pred = Net(data[0])
    y_pred = torch.max(y_pred, dim=1, keepdim=True)[0]
    loss = loss_function(y_pred, data[1])
    accuracy = compute_accuracy(y_pred, data[1])

    epoch_test_loss += loss
    epoch_test_accuracy+= accuracy

print(f"Loss ===> {(epoch_test_loss / batchNumber):.2f}, Accuracy ===> {(epoch_test_accuracy / batchNumber):.2%}")

Loss ===> 0.50, Accuracy ===> 74.97%
