In [60]:
!pip install numpy pandas scikit-learn gensim torch nltk spacy keras



In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Setup**

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

import gensim
import torch
from torch import nn
import spacy
from spacy.cli import download
from collections import defaultdict

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from keras.preprocessing.sequence import pad_sequences

download("en_core_web_sm")

columns = ["target", "ids", "date", "flag", "user", "text"]
nrows = 10000
data = pd.read_csv("/content/drive/MyDrive/Colab Data/training.1600000.processed.noemoticon.csv", names=columns, encoding = "ISO-8859-1", nrows=nrows)
data.head()
print(data.head())

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
   target         ids                          date      flag  \
0       0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY   
1       0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   
2       0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY   
3       0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
4       0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   

              user                                               text  
0  _TheSpecialOne_  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1    scotthamilton  is upset that he can't update his Facebook by ...  
2         mattycus  @Kenichan I

In [4]:
decode_map = {0: "NEGATIVE", 2: "NEUTRAL", 4: "POSITIVE"}
def decode_sentiment(label):
    return decode_map[int(label)]

In [5]:
# Preprocess
nltk.download("stopwords")
nltk.download('punkt_tab')

TEXT_CLEANING_RE = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")

def preprocess(text, stem=False):
    # Remove link,user and special characters
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    tokens = []
    for token in text.split():
        if token not in stop_words:
            if stem:
                tokens.append(stemmer.stem(token))
            else:
                tokens.append(token)
    return " ".join(tokens)

W2V_SIZE = 300
W2V_WINDOW = 7
W2V_EPOCH = 32
W2V_MIN_COUNT = 10

# data.target = data.target.apply(lambda x: decode_sentiment(x))
if data.target.dtype != object:  # If not object (string) type, then decode
    data.target = data.target.apply(lambda x: decode_sentiment(x))


data.text.apply(lambda x: preprocess(x))
train, test = train_test_split(data, test_size = 0.2, shuffle=True)

documents = [text.split() for text in train.text]
print(documents)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!




In [6]:
w2v = gensim.models.word2vec.Word2Vec(
    vector_size=W2V_SIZE,
    window=W2V_WINDOW,
    epochs=W2V_EPOCH,
    min_count=W2V_MIN_COUNT
)
w2v.build_vocab(documents)
w2v.train(documents, total_examples=len(documents), epochs=W2V_EPOCH)

(1697176, 3484384)

# **Tokenization**

In [7]:
vocab = defaultdict(int)
count = 0

def fit_tokenizer(text: str):
    global count
    tokens = nltk.tokenize.word_tokenize(text)
    tokens = [stemmer.stem(token) for token in tokens if token not in stop_words]
    for token in tokens:
        if token not in vocab:
            vocab[token] = count
            count += 1

def text_to_sequence(text: str):
    tokens = nltk.tokenize.word_tokenize(text)
    tokens = [stemmer.stem(token) for token in tokens if token not in stop_words]
    seq = [vocab[token] for token in tokens]
    return seq

def texts_to_sequences(texts: list):
    result = []
    for text in texts:
        result.append(text_to_sequence(text))
    return result

In [9]:
# Tokenize
for text in train.text:
    fit_tokenizer(text)


x_train = pad_sequences(texts_to_sequences(train.text), maxlen=300)
x_test = pad_sequences(texts_to_sequences(test.text), maxlen=300)

In [10]:
labels = train.target.unique().tolist()
labels.append("NEUTRAL")
labels

['NEGATIVE', 'NEUTRAL']

In [16]:
encoder = LabelEncoder()
encoder.fit(train.target.tolist())

y_train = encoder.transform(train.target.tolist())
y_test = encoder.transform(test.target.tolist())

y_train = y_train.reshape(-1,1)
y_test = y_test.reshape(-1,1)

In [65]:
model = nn.Sequential([

])