In [3]:
import re

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.applications import VGG16
from tensorflow.keras.layers import Input,Flatten,Embedding, Dropout,Conv1D,MaxPooling1D, Dense, GlobalMaxPooling1D,BatchNormalization, Add,GlobalAveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [4]:
# Fetch the NLTK stop-word corpus only when it is not already installed.
# Without this, a fresh environment fails later with a LookupError when
# stopwords.words('english') is called; with the guard, re-running the cell
# is a cheap no-op once the corpus is present.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

In [5]:
# The file has no header row, so pandas labels the columns with integers.
# Only columns 0 and 5 are selected by the following cell, so the remaining
# columns are skipped at parse time to save memory; with header=None the kept
# columns retain their original labels 0 and 5.
data = pd.read_csv(
    './data/training.1600000.processed.noemoticon.csv',
    encoding='latin',
    header=None,
    usecols=[0, 5],
)

In [6]:
# Keep column 5 and column 0 (in that order) and give them readable names
# in a single chained step.
data = data[[5, 0]].rename(columns={5: 'tweet', 0: 'sentiment'})

In [7]:
# Store the label value 4 as 1; every other value is left unchanged.
data['sentiment'] = data['sentiment'].replace({4: 1})

In [8]:
# English stop words held in a set so the per-token membership check in
# preprocess() is a hash lookup rather than a list scan.
stop_words = {word for word in stopwords.words('english')}
# Patterns are compiled once at definition time; preprocess() is applied to
# every row of the dataset, so the compiled objects are reused on each call.
_URL_PATTERN = re.compile(r"https?://\S+|www\.\S+")
_HTML_PATTERN = re.compile(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")
_PUNCTUATION_PATTERN = re.compile(r"[^\w\s]")
_DIGIT_WORD_PATTERN = re.compile(r"\w*\d\w*")


def preprocess(text, min_token_length=4, stopword_set=None):
    """Clean one piece of text and return its surviving tokens, space-joined.

    Steps, in order: replace URLs, HTML tags / character entities,
    punctuation and any word containing a digit with a space; lowercase;
    then drop tokens that are stop words or shorter than
    ``min_token_length``.

    Args:
        text: The raw text. Non-string values (e.g. ``None`` or the float
            NaN pandas uses for missing cells) yield an empty string
            instead of raising ``TypeError`` inside ``re``.
        min_token_length: Minimum number of characters a token must have to
            be kept. The default of 4 matches the original ``len(token) > 3``
            rule.
        stopword_set: Container of lowercase tokens to discard. When
            ``None`` (the default) the module-level ``stop_words`` is used.

    Returns:
        The kept tokens joined by single spaces; ``""`` if nothing survives.
    """
    # Nothing to tokenize for missing / non-text values.
    if not isinstance(text, str):
        return ""
    if stopword_set is None:
        stopword_set = stop_words

    text = _URL_PATTERN.sub(" ", text)
    text = _HTML_PATTERN.sub(" ", text)
    text = _PUNCTUATION_PATTERN.sub(" ", text)
    # A run of digits is itself a "word containing a digit", so this single
    # pass leaves no digits behind and no separate digit-removal pass is needed.
    text = _DIGIT_WORD_PATTERN.sub(" ", text)
    text = text.lower()

    # str.split() with no argument splits on runs of whitespace and ignores
    # leading/trailing whitespace, so no explicit whitespace clean-up is needed.
    return " ".join(
        token
        for token in text.split()
        if token not in stopword_set and len(token) >= min_token_length
    )

In [9]:
# Run every tweet through preprocess() and write the cleaned text back into
# the same column.
data.loc[:, 'tweet'] = data['tweet'].map(preprocess)

In [30]:
X, y = data['tweet'], data['sentiment']

# Two-stage split with a fixed seed: first hold out 10% as the test set, then
# take 2/9 of the remaining 90% (i.e. 20% of the whole) as the validation set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=7
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=2./9, random_state=7
)

# Report feature/label counts per split; the two numbers on each line should match.
print("Train Data size:", *map(len, (X_train, y_train)))
print("Validation Data size:", *map(len, (X_val, y_val)))
print("Test Data size", *map(len, (X_test, y_test)))

Train Data size: 1120000 1120000
Validation Data size: 320000 320000
Test Data size 160000 160000


In [32]:
# The vocabulary is fitted on the training texts only, so the validation and
# test texts do not influence the word index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Encode each split as lists of integer word ids (train, then val, then test).
# NOTE: this overwrites the text splits, so the cell cannot be re-run without
# re-running the split cell first.
X_train, X_val, X_test = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_val, X_test)
)

In [33]:
# Pad every split to the length of the longest *training* sequence.
max_length = max(map(len, X_train))
X_train, X_val, X_test = (
    pad_sequences(sequences, maxlen=max_length)
    for sequences in (X_train, X_val, X_test)
)

# Shapes after padding, in train / val / test order.
print(f"After padding: {X_train.shape}")
print(f"After padding: {X_val.shape}")
print(f"After padding:{X_test.shape}")

After padding: (1120000, 27)
After padding: (320000, 27)
After padding:(160000, 27)


In [38]:
# Persist every split to a .npy file in the current working directory.
# The dict keeps insertion order, so files are written features first, then labels.
for _filename, _array in {
    'X_train.npy': X_train,
    'X_val.npy': X_val,
    'X_test.npy': X_test,
    'y_train.npy': y_train,
    'y_val.npy': y_val,
    'y_test.npy': y_test,
}.items():
    np.save(_filename, _array)