In [None]:
# import all libraries
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import string
import random
import nltk
import re

In [None]:
# Input file and the names of the columns that hold the label and the tweet text.
train_name = 'Tweets.csv' # US Airlines dataset
label_column = 'airline_sentiment'
text_column = 'text'

# Replacement strings used by preprocessing() when it rewrites links, mentions
# and the leading "RT" marker. All of them are a single space.
# NOTE(review): hashtag_string is not read anywhere in the visible code --
# preprocessing() removes '#' with a literal '' instead. Confirm whether it is still needed.
hiperlink_string = ' '
hashtag_string = ' '
mention_string = ' '
retweet_string = ' '

In [None]:
# Read the raw tweets.
# The file is decoded as UTF-8: reading it as latin-1 turned every multi-byte
# character (curly apostrophes, emoji) into several garbage characters, which
# then leaked into the token lists as junk/empty tokens.
df_train = pd.read_csv('US Airlines Sentiment/' + train_name, encoding='utf-8')

# Stratified 80/20 split. A fixed random_state makes the split -- and therefore
# every CSV exported further down -- reproducible across kernel restarts.
df_train, df_test = train_test_split(
    df_train,
    test_size=0.2,
    shuffle=True,
    stratify=df_train[label_column],
    random_state=42,
)

# Keep only the text and label columns, under the names the rest of the notebook uses.
df_train = df_train[[text_column, label_column]].rename(columns={text_column: 'Text', label_column: 'Sentiment'})
df_test = df_test[[text_column, label_column]].rename(columns={text_column: 'Text', label_column: 'Sentiment'})

# Re-number the rows 0..n-1 after the shuffle.
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

df_train

Unnamed: 0,Text,Sentiment
0,"@AmericanAir no, you should do something about...",negative
1,"@JetBlue Hold for 15 min, a couple of rings, t...",negative
2,@VirginAmerica Iâm having trouble adding thi...,negative
3,@USAirways Why can't I check in online for a f...,negative
4,"@southwestair *any site*? gmail, facebook, etc.",neutral
...,...,...
11707,@jetblue who's running your tweeter using the ...,neutral
11708,@AmericanAir Phone just disconnects if you sta...,negative
11709,@AmericanAir I FOUND MY FOOTAGE!! :D I am so s...,positive
11710,@JetBlue in the sky on flight 833 from BOS to ...,positive


Stop words are removed using the English stop-word list from `nltk.corpus`. Examples of stop words:
> I, me, my, myself, we, our, you've, you'll, you'd, your, she's, her, hers, herself....

In [None]:
# Download the NLTK stop-word corpus (needed before stopwords.words() can be called)
nltk.download('stopwords')

# English stop-word list from NLTK; preprocessing() drops every token found in it
stopwords_english = stopwords.words('english')
stemmer = PorterStemmer() # Porter stemmer applied to every token that is kept

# Create the 'Tokens' column up front (as empty strings); preprocessing() fills it in
df_train["Tokens"] = "" # add additional column to dataframe
df_test["Tokens"] = ""

# Tweet-aware tokenizer, configured via its keyword options: do not preserve case,
# strip @handles, and reduce the length of repeated-character runs
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,reduce_len=True)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


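Preprocessing applied to every tweet:
1. Remove the old-style retweet marker "RT" at the start of the tweet.
2. Replace hyperlinks with the `hiperlink_string` placeholder defined in the configuration cell.
3. Remove hashtags by stripping only the hash sign `#` and keeping the word itself.

Every tweet in the data set is preprocessed this way.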



In [None]:
def preprocessing(df_train):
  """Clean, tokenize, filter and stem every tweet in ``df_train['Text']``.

  The frame is modified in place (``'Text'`` is cast to ``str`` and
  ``'Tokens'`` is written) and the same object is returned.

  Unlike a ``for i in range(len(df))`` / ``.at[i, ...]`` loop, this does not
  require the frame to carry a 0..n-1 index, nor a pre-created ``'Tokens'``
  column.

  Relies on the notebook-level names ``tokenizer``, ``stemmer``,
  ``stopwords_english`` and the ``*_string`` replacement constants.

  Args:
    df_train: DataFrame with a ``'Text'`` column.

  Returns:
    The same DataFrame, whose ``'Tokens'`` column holds the list of stemmed
    tokens for each tweet, or ``''`` when no token survives the filtering.
  """

  def _tweet_to_tokens(tweet):
    # Strip a leading old-style "RT", then links, '#' signs and @mentions.
    tweet = re.sub(r'^RT[\s]+', retweet_string, tweet)
    tweet = re.sub(r'https?://[^\s\n\r]+', hiperlink_string, tweet)
    tweet = re.sub(r'#', '', tweet)
    tweet = re.sub(r"@\w+", mention_string, tweet)

    tweets_clean = []
    for word in tokenizer.tokenize(tweet):
      if word == 'coronavirus':
        word = 'covid'  # coronavirus and covid are semantically the same
      # `in string.punctuation` is a substring test against the punctuation
      # string, so it matches single punctuation marks (and any run of
      # characters that happens to be contiguous in that string).
      if word not in stopwords_english and word not in string.punctuation:
        tweets_clean.append(stemmer.stem(word))

    # '' marks a tweet with no usable tokens; such rows are filtered out later.
    return tweets_clean if len(tweets_clean) > 0 else ''

  df_train['Text'] = df_train['Text'].apply(str)
  df_train['Tokens'] = df_train['Text'].apply(_tweet_to_tokens)

  return df_train

In [None]:
# Tokenize both splits. preprocessing() writes the 'Tokens' column on the frame
# it receives and returns that same frame.
df_train = preprocessing(df_train)
df_test = preprocessing(df_test)

Remove all tweets left with no tokens.

In [None]:
# Keep only the rows that still have tokens: preprocessing() stores '' in
# 'Tokens' for tweets where nothing survived the filtering.
df_train = df_train.loc[df_train['Tokens'] != '']
df_test = df_test.loc[df_test['Tokens'] != '']

In [None]:
df_train

Unnamed: 0,Text,Sentiment,Tokens
0,"@AmericanAir no, you should do something about...",negative,"[someth, everyth, happen, yesterday, ...]"
1,"@JetBlue Hold for 15 min, a couple of rings, t...",negative,"[hold, 15, min, coupl, ring, mailbox, set, yet..."
2,@VirginAmerica Iâm having trouble adding thi...,negative,"[iâ, , , troubl, ad, flight, wife, book, ele..."
3,@USAirways Why can't I check in online for a f...,negative,"[can't, check, onlin, flight, tomorrow]"
4,"@southwestair *any site*? gmail, facebook, etc.",neutral,"[site, gmail, facebook, etc]"
...,...,...,...
11707,@jetblue who's running your tweeter using the ...,neutral,"[who', run, tweeter, use, word, fleek]"
11708,@AmericanAir Phone just disconnects if you sta...,negative,"[phone, disconnect, stay, line, need, checkout..."
11709,@AmericanAir I FOUND MY FOOTAGE!! :D I am so s...,positive,"[found, footag, :d, happi]"
11710,@JetBlue in the sky on flight 833 from BOS to ...,positive,"[sky, flight, 833, bo, sfo, awesom, crew, hele..."


In [None]:
df_test

Unnamed: 0,Text,Sentiment,Tokens
0,@JetBlue is it your standard protocol to call ...,negative,"[standard, protocol, call, secur, onto, plane,..."
1,@SouthwestAir Is my friend lucky enough to see...,negative,"[friend, lucki, enough, see, destinationdragon..."
2,@SouthwestAir any spare tickets for Vegas? Wou...,neutral,"[spare, ticket, vega, would, forev, grate, des..."
3,@united i did but i got nothing from it. Just ...,negative,"[got, noth, dissapoint, =(]"
4,@southwestair watching planes do their thing h...,neutral,"[watch, plane, thing]"
...,...,...,...
2923,@SouthwestAir @Imaginedragons I tried. ð It...,neutral,"[tri, ð, , , , okay]"
2924,@USAirways YOU ARE THE BEST AIRWAYS!!!!!!!!!! ...,positive,"[best, airway, follow, back, pleas, ð, , , ..."
2925,@SouthwestAir yall still fly in the cold right?,neutral,"[yall, still, fli, cold, right]"
2926,@USAirways dealing w fam emergency. Was told w...,negative,"[deal, w, fam, emerg, told, rebook, aa, flight..."


In [None]:
# Export the labels of each split (rows match the filtered frames above).
df_train['Sentiment'].to_csv(
    'US Airlines Sentiment/train_sentiment.csv',
    index=False,
    encoding='latin-1',
)
df_test['Sentiment'].to_csv(
    'US Airlines Sentiment/test_sentiment.csv',
    index=False,
    encoding='latin-1',
)

In [None]:
def parse_data(data):
  """Collect the token lists and the vocabulary of a preprocessed frame.

  Args:
    data: DataFrame with a ``'Tokens'`` column whose cells are iterables of
      tokens (lists of strings in this notebook).

  Returns:
    A ``(sentence, vocabulary)`` tuple: ``sentence`` is the list of the
    ``'Tokens'`` cells in row order, ``vocabulary`` is the set of all
    distinct tokens found in them.
  """
  sentence = []
  vocabulary = set()

  # Iterate the column directly instead of data.iloc[i]['Tokens'], which
  # builds a whole row Series for every lookup.
  for list_of_words in data['Tokens']:
    sentence.append(list_of_words)
    # update() grows the set in place; union() re-created it on every row.
    vocabulary.update(list_of_words)

  return sentence, vocabulary

In [None]:
def fit_tokenizer(sentences, num_words=10000, oov_token='<OOV>'):
    """Build a Keras ``Tokenizer`` and fit it on ``sentences``.

    Args:
        sentences: Texts (here: lists of tokens) to fit the tokenizer on.
        num_words: Forwarded to ``Tokenizer(num_words=...)``; defaults to the
            previously hard-coded 10000.
        oov_token: Forwarded to ``Tokenizer(oov_token=...)``; defaults to the
            previously hard-coded ``'<OOV>'``.

    Returns:
        The fitted ``Tokenizer``.
    """
    # Local name chosen so it does not shadow the notebook-level `tokenizer`.
    keras_tokenizer = Tokenizer(oov_token=oov_token, num_words=num_words)
    keras_tokenizer.fit_on_texts(sentences)
    return keras_tokenizer

In [None]:
def get_padded_sequences(tokenizer, sentences, maxlen=39, padding='post'):
    """Encode ``sentences`` as integer sequences of a fixed length.

    Args:
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        sentences: Texts (here: lists of tokens) to encode.
        maxlen: Length of every returned sequence; defaults to the previously
            hard-coded 39.
        padding: Where ``pad_sequences`` adds padding; defaults to the
            previously hard-coded ``'post'``.

    Returns:
        The array returned by ``pad_sequences``, one row per sentence.
    """
    # NOTE(review): `truncating` is left at the pad_sequences default, so the
    # side on which over-long sequences are cut follows the library default --
    # confirm that is the intended side.
    sequences = tokenizer.texts_to_sequences(sentences)
    padded_sequences = pad_sequences(sequences, padding=padding, maxlen=maxlen)
    return padded_sequences

In [None]:
# Token lists and vocabularies of both splits.
sentence_train, vocabulary_train = parse_data(df_train)
sentence_test, vocabulary_test = parse_data(df_test)

# Fit the Keras tokenizer on the training sentences only; it is reused for the test split.
# NOTE(review): this rebinds the notebook-level name `tokenizer`, which preprocessing()
# reads as the NLTK TweetTokenizer. Re-running preprocessing() after this cell would
# call .tokenize() on the Keras Tokenizer instead.
tokenizer = fit_tokenizer(sentence_train)
word_index = tokenizer.word_index

# Integer-encode and pad both splits with the same fitted tokenizer.
padded_sequence_train = get_padded_sequences(tokenizer, sentence_train)
padded_sequence_test = get_padded_sequences(tokenizer, sentence_test)

In [None]:
# Export the padded integer sequences of each split, one row per tweet.
pd.DataFrame(padded_sequence_train).to_csv(
    'US Airlines Sentiment/train_encoded.csv',
    index=False,
    encoding='latin-1',
)
pd.DataFrame(padded_sequence_test).to_csv(
    'US Airlines Sentiment/test_encoded.csv',
    index=False,
    encoding='latin-1',
)