In [1]:
import pandas as pd
import nltk, re
import numpy as np
from tensorflow import keras
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from tensorflow.keras.preprocessing.text import Tokenizer

### Split into train and validation

In [2]:
# Load only the two columns used by this notebook from the original training file.
df = pd.read_csv(
    filepath_or_buffer='../data/org_train.csv',
    encoding='latin1',
    usecols=['text', 'sentiment'],
)

# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Persist both splits without the index column.
# NOTE(review): val.csv is written here but is not read back by any cell shown
# in this notebook -- confirm whether another notebook consumes it.
train_df.to_csv(path_or_buf='../data/train.csv', index=False)
val_df.to_csv(path_or_buf='../data/val.csv', index=False)

### Read the train and test CSVs

In [3]:
# Training rows come from the split file saved to ../data/train.csv.
train_df = pd.read_csv(filepath_or_buffer='../data/train.csv')

# NOTE(review): `val_df` is loaded from org_test.csv, not from val.csv --
# presumably the held-out test file doubles as the evaluation set; confirm.
val_df = pd.read_csv(
    filepath_or_buffer='../data/org_test.csv',
    encoding='latin1',
    usecols=['text', 'sentiment'],
)

In [4]:
# Discard every row that has a missing value in any column, for both frames.
train_df, val_df = train_df.dropna(), val_df.dropna()

In [5]:
# Normalise the raw text before tokenization.

# 1) Lowercase everything so the vocabulary is case-insensitive.
train_df['text'] = train_df['text'].str.lower()
val_df['text'] = val_df['text'].str.lower()

# 2) Keep only ASCII letters, digits and whitespace.
#    The range is `a-z`, not `A-z`: `A-z` also spans the ASCII characters that
#    sit between 'Z' and 'a' ([ \ ] ^ _ `), so those symbols used to survive
#    the cleanup. The text is already lowercase, so `a-z` covers every letter.
#    The pattern is a raw string so that `\s` reaches the regex engine instead
#    of being parsed as an (invalid) string escape.
NON_ALNUM_PATTERN = r'[^a-z0-9\s]'
train_df['text'] = train_df['text'].str.replace(NON_ALNUM_PATTERN, '', regex=True)
val_df['text'] = val_df['text'].str.replace(NON_ALNUM_PATTERN, '', regex=True)

In [6]:
# Split each cleaned text into a list of word tokens with NLTK.
# The tokenizer function is passed directly; no wrapping lambda is needed.
train_df['text'] = train_df['text'].apply(nltk.word_tokenize)
val_df['text'] = val_df['text'].apply(nltk.word_tokenize)

In [7]:
# Remove English stop words while keeping the original token order and any
# repeated words.
# np.setdiff1d was used here previously, but it returns the *sorted, unique*
# values of its first argument: every text was alphabetised and duplicate
# words were collapsed, which distorts both the word order and the word
# frequencies seen by the tokenizer fitted later in this notebook.
stop_words = stopwords.words('english')
stop_word_set = set(stop_words)  # set membership is O(1) per token


def remove_stop_words(tokens):
    """Return the tokens as a list, in order, with English stop words removed."""
    return [token for token in tokens if token not in stop_word_set]


train_df['text'] = train_df['text'].apply(remove_stop_words)
val_df['text'] = val_df['text'].apply(remove_stop_words)

In [8]:
# Discard rows whose token collection is empty after the previous steps.
# A boolean mask is used instead of a temporary 'len' column, so nothing has
# to be added to and then dropped from the frames.
train_has_tokens = train_df['text'].map(len) > 0
train_df = train_df[train_has_tokens]

val_has_tokens = val_df['text'].map(len) > 0
val_df = val_df[val_has_tokens]

In [9]:
# Reduce every token to its Porter stem.
ps = PorterStemmer()


def stem_tokens(tokens):
    """Return a list holding the Porter stem of each token, in order."""
    return [ps.stem(token) for token in tokens]


train_df['text'] = train_df['text'].apply(stem_tokens)
val_df['text'] = val_df['text'].apply(stem_tokens)

In [10]:
# Glue each token sequence back into one space-separated string, which is the
# form the Keras Tokenizer is fitted on in the next cell.
train_df['text'] = train_df['text'].apply(' '.join)
val_df['text'] = val_df['text'].apply(' '.join)

In [11]:
# Build the word-index vocabulary.
# `max_features` is passed as the tokenizer's `num_words` limit.
max_features = 4000
tokenizer = Tokenizer(
    num_words=max_features,
    split=' ',
)

# The vocabulary is fitted on the training texts only; val_df is not used here.
tokenizer.fit_on_texts(train_df['text'].to_numpy())

In [12]:
# Encode every text with the tokenizer fitted on the training data, replacing
# the string column by the resulting sequences.
train_sequences = tokenizer.texts_to_sequences(train_df['text'])
val_sequences = tokenizer.texts_to_sequences(val_df['text'])

train_df['text'] = train_sequences
val_df['text'] = val_sequences

In [13]:
# Persist the processed frames for later use.
# `val_df` holds the rows read from org_test.csv, hence the "test" file name.
train_df.to_pickle(path='../data/train_processed.pkl')
val_df.to_pickle(path='../data/test_processed.pkl')