### Feature Engineering

Based on the findings from the EDA, I engineered a few date-time and numerical features in addition to the text data:
1. Create the hour, day, month, and weekday columns from the date column.
2. Extract the hashtags and hashtag counts from the text column.
3. Extract the mentions and mention counts from the text column.
4. Extract the tagged URLs and URL counts from the text column.

In [20]:
# Import relevant packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

import warnings
warnings.filterwarnings('ignore')

In [21]:
# Load the raw tweets; the CSV has no header row, so the column names are supplied here
train = pd.read_csv(
    'data/training.1600000.processed.noemoticon.csv',
    header=None,
    names=['sentiment', 'tweet_id', 'date', 'flag', 'user', 'text'],
    encoding="ISO-8859-1",
    engine='python',
)

In [22]:
# Conversion and removal of rows and columns explained in EDA
# Convert binary classification from {0,4} to {0,1}
def convert_sent(x):
    """Map a raw sentiment label to a binary one: 4 becomes 1, anything else 0."""
    return 1 if x == 4 else 0
train['sentiment'] = train['sentiment'].apply(convert_sent)
# Keep only rows whose tweet id occurs exactly once
# (every copy of a duplicated id is removed, not just the extras)
id_count = train['tweet_id'].value_counts()
unique_ids = id_count[id_count == 1].index
train = train[train['tweet_id'].isin(unique_ids)].reset_index(drop=True)
# Drop the date and flag columns
train = train.drop(['date', 'flag'], axis=1)
# Remove texts appearing 20 or more times (only texts with count < 20 are kept)
text_frequency = train.groupby('text')['text'].transform('count')
train = train[text_frequency < 20].reset_index(drop=True)

In [23]:
# Dataset shape as (rows, columns)
train.shape

(1592831, 4)

**From the `text` column, extract:**

  - hashtags
  - hashtag counts
  - tagged users
  - number of tagged users
  - websites
  - number of websites

Then remove this extra information from the main text.

***Hashtags & Hashtag Counts***

In [8]:
# Collect, per tweet, every whitespace-separated token that begins with '#'
hashtags = [
    [token for token in tweet.split() if token.startswith("#")]
    for tweet in train['text']
]

# Store the per-tweet hashtag lists as a new column
train['hashtags'] = hashtags

In [9]:
# Flatten the per-tweet hashtag lists to report total and distinct hashtag counts
hashtags_flat = [tag for tags in hashtags for tag in tags]

print(f'Total Hashtag Counts:{len(hashtags_flat)}')
print(f'Unique Hashtag Counts:{len(set(hashtags_flat))}')

Total Hashtag Counts:44109
Unique Hashtag Counts:15293


In [10]:
# Number of hashtags in each tweet, in row order
hashtag_counts = list(map(len, hashtags))

train['hashtag_counts'] = hashtag_counts

In [11]:
# The raw hashtag lists are no longer needed once the counts are stored
train.drop(columns=['hashtags'], inplace=True)

***Tagged Users & Counts***

In [12]:
# Collect, per tweet, every whitespace-separated token that begins with '@'
users = [
    [token for token in tweet.split() if token.startswith("@")]
    for tweet in train['text']
]

# Store the per-tweet mention lists as a new column
train['users'] = users

In [13]:
# Flatten the per-tweet mention lists to report total and distinct tagged users
users_flat = [name for names in users for name in names]

print(f'Total User Counts:{len(users_flat)}')
print(f'Unique User Counts:{len(set(users_flat))}')

Total User Counts:793951
Unique User Counts:364032


In [14]:
# Number of tagged users in each tweet, in row order
user_counts = list(map(len, users))

train['user_counts'] = user_counts

In [15]:
# Drop the author column and the raw mention lists in one call
train.drop(columns=['user', 'users'], inplace=True)

In [16]:
# Preview the first rows of the dataset
train.head()

Unnamed: 0,sentiment,tweet_id,flag,text,date_hour,date_day,date_month,date_weekday,hashtag_counts,user_counts
0,0,1467810369,NO_QUERY,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",22,6,4,1,0,1
1,0,1467810672,NO_QUERY,is upset that he can't update his Facebook by ...,22,6,4,1,0,0
2,0,1467810917,NO_QUERY,@Kenichan I dived many times for the ball. Man...,22,6,4,1,0,1
3,0,1467811184,NO_QUERY,my whole body feels itchy and like its on fire,22,6,4,1,0,0
4,0,1467811193,NO_QUERY,"@nationwideclass no, it's not behaving at all....",22,6,4,1,0,1


***Tagged Websites & Counts***

In [17]:
# Retrieve urls from tweets using URLExtract
# from urlextract import URLExtract

# extractor = URLExtract()
#urls = []

# for i in range(len(train)):
    #urls.append(extractor.find_urls(train['text'][i]))

In [18]:
# Save partial dataset
# train.to_csv(r'/Users/JennaFu/Desktop/DataScience/BrainStation/Capstone/data/partial_data.csv', index = False)

In [19]:
# Load the partial dataset from disk (presumably the file written by the commented-out
# save above; it is expected to already contain a `urls` column, which the next cell reads).
# NOTE(review): the commented-out save uses index=False while this read uses index_col=0,
# which would turn the first data column into the index; confirm how the file was written.
train = pd.read_csv('data/partial_data.csv',index_col = 0)

In [20]:
# Strip the first and last character of each stored value (the enclosing '[' and ']'),
# then split what remains on ', ' to get one list of URL strings per tweet
url_text = train['urls'].str[1:-1]
train['urls'] = url_text.str.split(", ")

In [21]:
# Count the URLs in each tweet; a leading empty string means the list was empty
url_counts = [0 if parts[0] == '' else len(parts) for parts in train['urls']]
train['url_counts'] = url_counts

***Remove excessive information from `text` Column***

In [25]:
import re
# clean() strips retweet markers, URLs, dotted tokens, '#' symbols and @mentions from a tweet.
# The patterns are compiled once because clean() is applied to every row.
_RT_PREFIX_RE = re.compile(r"^RT\s+")
# \S+ stops at the end of the link, so the text that follows a URL is kept
_URL_RE = re.compile(r"https?://\S+")
# Any space-delimited token with a dot inside it (e.g. bare domains such as www.site.com)
_DOTTED_TOKEN_RE = re.compile(r"[^ ]+\.[^ ]+")
# Handles are letters, digits and underscores; the range uses an ASCII hyphen
_MENTION_RE = re.compile(r"@[A-Za-z0-9_]+")


def clean(x):
    """Remove retweet prefixes, links, '#' symbols and @mentions from a tweet.

    Args:
        x: The tweet text as a string.

    Returns:
        The text with a leading "RT " marker, http(s) URLs, dotted tokens and
        @mentions deleted. For hashtags only the '#' character is removed; the
        tag word itself stays in the text. Whitespace is left untouched, so
        removed tokens can leave double spaces behind.
    """
    x = _RT_PREFIX_RE.sub("", x)
    x = _URL_RE.sub("", x)
    x = _DOTTED_TOKEN_RE.sub("", x)
    x = x.replace("#", "")
    return _MENTION_RE.sub("", x)

In [26]:
# Run clean() over every tweet and overwrite the text column with the result
train['text'] = train['text'].apply(clean)

In [27]:
# Drop the 'user' and 'tweet_id' columns.
# Dropping 'hashtags', 'users' and 'urls' is left disabled below.
#train.drop(['hashtags'],axis=1,inplace=True)
#train.drop(['users'],axis=1,inplace=True)
#train.drop(['urls'],axis=1,inplace=True)
train.drop(columns=['user', 'tweet_id'], inplace=True)

In [None]:
# train.to_csv(r'/Users/JennaFu/Desktop/DataScience/BrainStation/Capstone/data/feature_engineered_2.csv', index = False)

***Preprocess the `text` column***

Before vectorizing, I have to pre-process the `text` column:

- Converting all letters to lower case.
- Turning the tweets into tokens. Tokens are words separated by spaces in a text.
- Eliminating unwanted characters, such as punctuation marks, special characters, white spaces etc.
- Removing stop words, as defined by the nltk library.
- Applying lemmatization, which returns a word to its base or dictionary form. Example: better -> good.

In [32]:
# Import relevant packages
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')

In [33]:
# Global Parameters
# English stop words from nltk, held in a set for fast membership tests during preprocessing
stop_words = set(stopwords.words('english'))

I have chosen to use POS tags because, by default, the lemmatizer treats every word it is given as a noun. To make the lemmatization more accurate and context-dependent, we need to find the POS tag of each word and pass it to the lemmatizer.

In [34]:
# Lemmatize with POS Tag
def get_wordnet_pos(word):
    """Return the WordNet POS constant for *word*, defaulting to noun.

    The word is tagged on its own with nltk.pos_tag; the upper-cased first
    letter of the resulting tag selects adjective, verb, adverb or noun.
    """
    tagged = nltk.pos_tag([word])
    initial = tagged[0][1][0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unmapped tag fall back to noun
    return wordnet.NOUN

In [35]:
# Create a function preprocess_tweet_text
def preprocess_tweet_text(tweet):
    """Normalise one tweet: lower-case, strip punctuation, drop stop words, lemmatize.

    Args:
        tweet: The tweet text as a string.

    Returns:
        The remaining tokens, lemmatized with their POS tag and joined by
        single spaces.
    """
    # str.lower() returns a new string, so the result has to be re-assigned;
    # lower-casing first also lets capitalised stop words ("The", "I") match
    tweet = tweet.lower()
    # Remove punctuation
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    # Tokenize and remove stop words
    tweet_tokens = word_tokenize(tweet)
    filtered_words = [w for w in tweet_tokens if w not in stop_words]
    # Lemmatize with the POS tag of each word; without it every word is treated as a noun.
    # NOTE: get_wordnet_pos tags one word per call, which is slow on large datasets.
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in filtered_words]
    return " ".join(lemmatized_words)

In [36]:
# Apply preprocess_tweet_text to every tweet and overwrite the text column with the result
train['text'] = train['text'].apply(preprocess_tweet_text)

In [39]:
# Save preprocessed dataset
# train.to_csv(r'/Users/JennaFu/Desktop/DataScience/BrainStation/howistwitterfeeling/jupyter notebook/new_data/preprocess_data.csv', index = False)