### Feature Engineering

Based on the information found in the EDA, I engineered a few additional date-time and numerical features to complement the text data:
1. Create the `hour`, `day`, `month` and `weekday` columns from the `date` column.
2. Extract the hashtags and hashtag counts from the text column
3. Extract the mentions and mention counts from the text column
4. Extract the tagged URL and URL counts from the text column

In [None]:
# Import relevant packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw training CSV. The file is read without a header row
# (header=None), so the six column names are supplied explicitly.
train = pd.read_csv(
    'data/training.1600000.processed.noemoticon.csv',
    header=None,
    names=['sentiment', 'tweet_id', 'date', 'flag', 'user', 'text'],
    encoding="ISO-8859-1",
    engine='python',
)

**Split `date` column into `hour`,`day`, `month`,`weekday` columns, using `train['date'].str[x:y]`**

In [None]:
# Cut the `date` string into components by character position.
# `.str.slice(start, stop)` is equivalent to `.str[start:stop]`.

# Hour: characters 11-12 (kept as a string)
train['date_hour'] = train['date'].str.slice(11, 13)
# Day of month: characters 8-9, cast to integer
train['date_day'] = train['date'].str.slice(8, 10).astype(int)
# Month abbreviation: characters 4-6
train['date_month'] = train['date'].str.slice(4, 7)
# Weekday: characters 0-3 (four characters, i.e. one more than a
# three-letter day abbreviation)
train['date_weekday'] = train['date'].str.slice(0, 4)

In [None]:
# Convert month abbreviations to their calendar numbers.
# The result is assigned back to the column rather than calling
# `train['date_month'].replace(..., inplace=True)`: the in-place form
# operates on an intermediate Series and is not guaranteed to update
# `train` (pandas chained-assignment / Copy-on-Write behaviour).
# Values that are not keys of the mapping are left unchanged.
month_numbers = {'Apr': 4, 'May': 5, 'Jun': 6}
train['date_month'] = train['date_month'].replace(month_numbers)

In [None]:
# Convert weekday labels to numbers (Monday = 1 ... Sunday = 7).
# The keys keep their trailing space because `date_weekday` holds the
# first four characters of the date string.
# The result is assigned back to the column rather than calling
# `train['date_weekday'].replace(..., inplace=True)`: the in-place form
# operates on an intermediate Series and is not guaranteed to update
# `train` (pandas chained-assignment / Copy-on-Write behaviour).
weekday_numbers = {
    'Mon ': 1,
    'Tue ': 2,
    'Wed ': 3,
    'Thu ': 4,
    'Fri ': 5,
    'Sat ': 6,
    'Sun ': 7,
}
train['date_weekday'] = train['date_weekday'].replace(weekday_numbers)

In [None]:
# The raw `date` column is no longer needed once its parts are extracted
train.drop(columns=['date'], inplace=True)

**From the `text` column, extract:**

  - hashtags
  - hashtag counts
  - tagged users
  - number of tagged users
  - websites
  - number of websites

Then, remove this extra information from the main text.

***Hashtags & Hashtag Counts***

In [None]:
# For every tweet, keep the whitespace-separated tokens that start with '#'
# and store the per-tweet lists as a new column
hashtags = [
    [token for token in tweet.split() if token.startswith("#")]
    for tweet in train['text']
]

train['hashtags'] = hashtags

In [None]:
# Flatten the per-tweet hashtag lists into one list, then report how many
# hashtags there are in total and how many distinct ones
hashtags_flat = [tag for per_tweet in hashtags for tag in per_tweet]

print(f'Total Hashtag Counts:{len(hashtags_flat)}')
print(f'Unique Hashtag Counts:{len(set(hashtags_flat))}')

In [None]:
# Number of hashtags per tweet = length of each per-tweet hashtag list
hashtag_counts = [len(per_tweet) for per_tweet in hashtags]

train['hashtag_counts'] = hashtag_counts

In [None]:
# Only the counts are kept as a feature; discard the list-valued column
train.drop(columns=['hashtags'], inplace=True)

***Tagged Users & Counts***

In [None]:
# For every tweet, keep the whitespace-separated tokens that start with '@'
# and store the per-tweet lists as a new column
users = [
    [token for token in tweet.split() if token.startswith("@")]
    for tweet in train['text']
]

train['users'] = users

In [None]:
# Flatten the per-tweet mention lists into one list, then report how many
# mentions there are in total and how many distinct ones
users_flat = [mention for per_tweet in users for mention in per_tweet]

print(f'Total User Counts:{len(users_flat)}')
print(f'Unique User Counts:{len(set(users_flat))}')

In [None]:
# Number of tagged users per tweet = length of each per-tweet mention list
user_counts = [len(per_tweet) for per_tweet in users]

train['user_counts'] = user_counts

In [None]:
# Drop the author column and the list of tagged users, one at a time
for column in ('user', 'users'):
    train.drop(columns=[column], inplace=True)

In [None]:
# Preview the first rows of `train` to inspect the columns built so far
train.head()

***Tagged Websites & Counts***

In [None]:
# Retrieve urls from tweets using URLExtract
# from urlextract import URLExtract

# extractor = URLExtract()
#urls = []

# for i in range(len(train)):
    #urls.append(extractor.find_urls(train['text'][i]))

In [None]:
# Save partial dataset
# train.to_csv(r'/Users/JennaFu/Desktop/DataScience/BrainStation/Capstone/data/partial_data.csv', index = False)

In [None]:
# Reload the intermediate dataset; the cells below expect it to carry a
# `urls` column.
# NOTE(review): index_col=0 turns the first CSV column into the index. The
# commented-out save of partial_data.csv above used index=False; if this
# file was written that way, the first data column is consumed as the index
# here and will not be available as a regular column — confirm.
train = pd.read_csv('data/partial_data.csv',index_col = 0)

In [None]:
# Each `urls` cell is the text form of a list: strip the first and last
# character (the surrounding brackets), then split the remainder on ", "
train['urls'] = train['urls'].str[1:-1].str.split(", ")

In [None]:
# Count the urls in each tweet. A cell whose first element is the empty
# string came from an empty list and therefore counts as zero.
url_counts = [
    0 if parts[0] == '' else len(parts)
    for parts in train['urls']
]
train['url_counts'] = url_counts

***Remove excessive information from `text` Column***

In [None]:
import re
# Creating a function called clean, that removes hyperlinks, hashtag symbols and mentions
def clean(x):
    """Strip retweet markers, links, hashtag symbols and mentions from a tweet.

    Args:
        x: Tweet text as a string.

    Returns:
        The text after, in order: removing a leading "RT" marker, removing
        ``http://`` / ``https://`` links, removing any space-delimited token
        with an interior "." (catches scheme-less links such as
        ``www.site.com``), removing "#" characters (the tag word itself is
        kept), and removing ``@handle`` mentions.
    """
    x = re.sub(r"^RT[\s]+", "", x)
    # Stop at the first whitespace so the words after a link are preserved
    # (a greedy `.*` would delete the rest of the line).
    x = re.sub(r"https?://\S+", "", x)
    x = re.sub(r"[^ ]+\.[^ ]+", "", x)
    x = re.sub(r"#", "", x)
    # The digit range needs an ASCII hyphen; with an en dash the class only
    # matches the literal characters '0', '–' and '9'. Underscore is included
    # so handles such as @some_user are removed in full.
    x = re.sub(r"@[A-Za-z0-9_]+", "", x)
    return x

In [None]:
# Run every tweet through `clean` and write the results back to `text`
train['text'] = [clean(tweet) for tweet in train['text']]

In [None]:
# Remove the helper and identifier columns that are not used as features.
# errors='ignore' skips columns that are already absent, so the cell can be
# re-run (or run after the earlier drop cells) without raising KeyError.
train.drop(
    columns=['hashtags', 'user', 'users', 'urls', 'tweet_id'],
    inplace=True,
    errors='ignore',
)

In [None]:
# train.to_csv(r'/Users/JennaFu/Desktop/DataScience/BrainStation/Capstone/data/feature_engineered_2.csv', index = False)

***Preprocess the `text` column***

Before vectorizing, I have to pre-process the `text` column:

- Converting all letters to lower case.
- Turning the tweets into tokens. Tokens are words separated by spaces in a text.
- Eliminating unwanted characters, such as punctuation marks, special characters, white spaces etc.
- Removing stop words, as defined by the nltk library.
- Applying lemmatization, which reduces a word to its base or dictionary form. Example: Better -> Good.

In [None]:
# Import relevant packages
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')

In [None]:
# Global Parameters
# English stop-word list from nltk, held as a set; preprocess_tweet_text
# tests each token for membership in it.
stop_words = set(stopwords.words('english'))

I have chosen to pass POS tags to the lemmatizer. By default, the lemmatizer treats every input word as a noun when it tries to lemmatize it. Hence, to make the lemmatization better and context dependent, we need to find the POS tag of each word and pass it on to the lemmatizer.

In [None]:
# Lemmatize with POS Tag
def get_wordnet_pos(word):
    """Return the WordNet POS constant to use when lemmatizing `word`.

    The word is tagged on its own with ``nltk.pos_tag``; the upper-cased
    first letter of the tag selects adjective (J), noun (N), verb (V) or
    adverb (R). Any other tag falls back to noun.
    """
    _, pos_label = nltk.pos_tag([word])[0]
    initial = pos_label[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unmapped tag both resolve to noun
    return wordnet.NOUN

In [None]:
# Create a function preprocess_tweet_text
def preprocess_tweet_text(tweet):
    """Lower-case, de-punctuate, remove stop words from and lemmatize a tweet.

    Args:
        tweet: Tweet text as a string.

    Returns:
        A single string made of the lemmatized, non-stop-word tokens joined
        by single spaces.
    """
    # Convert all characters to lower case. str.lower() returns a new
    # string, so the result has to be assigned back.
    tweet = tweet.lower()
    # Remove punctuations
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords
    tweet_tokens = word_tokenize(tweet)
    filtered_words = [w for w in tweet_tokens if w not in stop_words]
    # Lemmatization, using the POS looked up for each token
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [
        lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in filtered_words
    ]
    # Return the lemmatized tokens (not the un-lemmatized filtered_words)
    return ' '.join(lemmatized_words)

In [None]:
# Apply the preprocess_tweet_text to text column
train['text'] = train['text'].apply(preprocess_tweet_text)

In [None]:
# Save preprocessed dataset.
# Written to the same relative `data/` folder the notebook reads its inputs
# from, so the cell does not depend on one user's absolute home directory.
train.to_csv('data/preprocess_data.csv', index=False)