<a href="https://colab.research.google.com/github/kvamleik/NLP-project/blob/main/Data_preprocessing_EasyJet_ORIGINAL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import re
import nltk
import spacy 
import string

In [None]:
df = pd.read_csv("/Users/nikolajfrandsen/Desktop/Master Thesis Data/Twitter/EasyJet/EasyJet_Twitter_CLEAN.csv")

In [None]:
df.head()

Unnamed: 0,Date,Country,Hit Sentence
0,01-Aug-2019 01:00AM,United Kingdom,=@bell_allie @SouthendAirport @easyJet Probabl...
1,01-Aug-2019 01:01AM,Unknown,@RGrosjean @easyJet Go Pietro Go!!!____
2,01-Aug-2019 01:01PM,Unknown,@easyJet why does your submission form on your...
3,01-Aug-2019 01:01PM,Italy,"@Gatwick_Airport Hi, my flight to Pisa, with E..."
4,01-Aug-2019 01:02AM,Unknown,=@brothers_beyond @easyJet Hi @brothers_beyond...


# Lower casing

In [None]:
df['text_lower'] = df['Hit Sentence'].str.lower()
df['text_lower'].head()

0    =@bell_allie @southendairport @easyjet probabl...
1              @rgrosjean @easyjet go pietro go!!!____
2    @easyjet why does your submission form on your...
3    @gatwick_airport hi, my flight to pisa, with e...
4    =@brothers_beyond @easyjet hi @brothers_beyond...
Name: text_lower, dtype: object

# Removal of punctuation

In [None]:
df['text_punct'] = df['text_lower'].str.replace('[^\w\s]','')
df['text_punct'].head()

0    bell_allie southendairport easyjet probably to...
1                   rgrosjean easyjet go pietro go____
2    easyjet why does your submission form on your ...
3    gatwick_airport hi my flight to pisa with easy...
4    brothers_beyond easyjet hi brothers_beyond and...
Name: text_punct, dtype: object

# Stop-word removal

In [None]:
# Importing stopwords from nltk library (Sometimes it is necessart to use another import - system will show so)
from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words('english'))

In [None]:
# Function to remove the stopwords
def stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])


In [None]:
# Applying the stopwords to 'text_punct' and store into 'text_stop'
df['text_stop'] = df['text_punct'].apply(stopwords)
df['text_stop'].head()

0    bell_allie southendairport easyjet probably fl...
1                   rgrosjean easyjet go pietro go____
2    easyjet submission form website work sent emai...
3    gatwick_airport hi flight pisa easyjet delayed...
4    brothers_beyond easyjet hi brothers_beyond eas...
Name: text_stop, dtype: object

# Common word removal

In [None]:
# Checking the first 10 most frequent words
from collections import Counter
cnt = Counter()
for text in df["text_stop"].values:
    for word in text.split():
        cnt[word] += 1
        
cnt.most_common(10)

[('easyjet', 128450),
 ('mattiasharris', 50212),
 ('rt', 49720),
 ('flight', 43365),
 ('ryanair', 38283),
 ('easyjet_press', 37440),
 ('easa', 36960),
 ('iata', 36955),
 ('geneveaeroport', 36864),
 ('seats', 35999)]

In [None]:
# Removing the frequent words

freq = set([w for (w, wc) in cnt.most_common(10)])

# Function to remove the frequent words

def freqwords(text):
    return " ".join([word for word in str(text).split() if word not in freq])

# Passing the function to freqwords

df["text_common"] = df["text_stop"].apply(freqwords)
df["text_common"].head()


0    bell_allie southendairport probably fly schedu...
1                           rgrosjean go pietro go____
2    submission form website work sent email receiv...
3    gatwick_airport hi pisa delayed 2 hours kind s...
4    brothers_beyond hi brothers_beyond ask ref lut...
Name: text_common, dtype: object

# Rare word removal

In [None]:
# Removal of 10 rare words in store into new coloumn called 'text_rare'
freq = pd.Series(' '.join(df['text_common']).split()).value_counts()[-10:] #10 rare words
freq = list(freq.index)
df['text_rare'] = df['text_common'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))

df['text_rare'].head()

0    bell_allie southendairport probably fly schedu...
1                           rgrosjean go pietro go____
2    submission form website work sent email receiv...
3    gatwick_airport hi pisa delayed 2 hours kind s...
4    brothers_beyond hi brothers_beyond ask ref lut...
Name: text_rare, dtype: object

# Spelling correction

In [None]:
pip install textblob



In [None]:
# Spell check using text blob for the first 5 records
from textblob import TextBlob
df['text_rare'][:5].apply(lambda x: str(TextBlob(x).correct()))

NameError: ignored

# Emoji removal

In [1]:
# Function to remove emoji.
def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)
emoji = ("Hi, I am Emoji  😜")

#passing the emoji function to 'text_rare'
df['text_rare'] = df['text_rare'].apply(remove_emoji)

NameError: ignored

In [None]:
df.head()

# Emoticon removal

In [None]:
pip install emot

Note: you may need to restart the kernel to use updated packages.


In [None]:
from emot.emo_unicode import UNICODE_EMO, EMOTICONS

# Function for removing emoticons
def remove_emoticons(text):
    emoticon_pattern = re.compile(u'(' + u'|'.join(k for k in EMOTICONS) + u')')
    return emoticon_pattern.sub(r'', text)
remove_emoticons("Hello :-)")

# applying remove_emoticons to 'text_rare'
df['text_rare'] = df['text_rare'].apply(remove_emoticons)

In [None]:
df['text_rare'].head()

0    bell_allie southendairport probably fly schedu...
1                           rgrosjean go pietro go____
2    submission form website work sent email receiv...
3    gatwick_airport hi pisa delayed 2 hours kind s...
4    brothers_beyond hi brothers_beyond ask ref lut...
Name: text_rare, dtype: object

# Converting emoji and emoticons to words

In [None]:
from emot.emo_unicode import UNICODE_EMO, EMOTICONS

# Converting emojis to words
def convert_emojis(text):
    for emot in UNICODE_EMO:
        text = text.replace(emot, "_".join(UNICODE_EMO[emot].replace(",","").replace(":","").split()))
        return text
    
# Converting emoticons to words    
def convert_emoticons(text):
    for emot in EMOTICONS:
        text = re.sub(u'('+emot+')', "_".join(EMOTICONS[emot].replace(",","").split()), text)
        return text
    
# Example
text = "Hello :-) :-)"
convert_emoticons(text)
text1 = "Hilarious 😂"
convert_emojis(text1)

# Passing both functions to 'text_rare'
df['text_rare'] = df['text_rare'].apply(convert_emoticons)
df['text_rare'] = df['text_rare'].apply(convert_emojis)

# Removal of URL's

In [None]:
# Function for url's
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    
# Examples
text = "this is my website, https://www.abc.com"
remove_urls(text)

# Passing the function to 'text_rare'
df['text_urls'] = df['text_rare'].apply(remove_urls)
df['text_urls'].head()

0    None
1    None
2    None
3    None
4    None
Name: text_urls, dtype: object

In [None]:
df.head()

Unnamed: 0,Date,Country,Hit Sentence,text_lower,text_punct,text_stop,text_common,text_rare,text_urls
0,01-Aug-2019 01:00AM,United Kingdom,=@bell_allie @SouthendAirport @easyJet Probabl...,=@bell_allie @southendairport @easyjet probabl...,bell_allie southendairport easyjet probably to...,bell_allie southendairport easyjet probably fl...,bell_allie southendairport probably fly schedu...,bell_allie southendairport probably fly schedu...,
1,01-Aug-2019 01:01AM,Unknown,@RGrosjean @easyJet Go Pietro Go!!!____,@rgrosjean @easyjet go pietro go!!!____,rgrosjean easyjet go pietro go____,rgrosjean easyjet go pietro go____,rgrosjean go pietro go____,rgrosjean go pietro go____,
2,01-Aug-2019 01:01PM,Unknown,@easyJet why does your submission form on your...,@easyjet why does your submission form on your...,easyjet why does your submission form on your ...,easyjet submission form website work sent emai...,submission form website work sent email receiv...,submission form website work sent email receiv...,
3,01-Aug-2019 01:01PM,Italy,"@Gatwick_Airport Hi, my flight to Pisa, with E...","@gatwick_airport hi, my flight to pisa, with e...",gatwick_airport hi my flight to pisa with easy...,gatwick_airport hi flight pisa easyjet delayed...,gatwick_airport hi pisa delayed 2 hours kind s...,gatwick_airport hi pisa delayed 2 hours kind s...,
4,01-Aug-2019 01:02AM,Unknown,=@brothers_beyond @easyJet Hi @brothers_beyond...,=@brothers_beyond @easyjet hi @brothers_beyond...,brothers_beyond easyjet hi brothers_beyond and...,brothers_beyond easyjet hi brothers_beyond eas...,brothers_beyond hi brothers_beyond ask ref lut...,brothers_beyond hi brothers_beyond ask ref lut...,


# Tokenization

In [None]:
# Creating function for tokenization 
def tokenization(text):
    text = re.split('\W+', text)
    return text

In [None]:
# Passing the function for tokenization 
df['text_token'] = df['text_rare'].apply(lambda x: tokenization(x.lower()))
df[['text_token']].head()

Unnamed: 0,text_token
0,"[bell_allie, southendairport, probably, fly, s..."
1,"[rgrosjean, go, pietro, go____]"
2,"[submission, form, website, work, sent, email,..."
3,"[gatwick_airport, hi, pisa, delayed, 2, hours,..."
4,"[brothers_beyond, hi, brothers_beyond, ask, re..."


# Stemming and lemmatization

In [None]:
import nltk

In [None]:
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nikolajfrandsen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/nikolajfrandsen/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

In [None]:
lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV} 
# Pos tag, used Noun, Verb, Adjective, and Adverb

In [None]:
#Function for lemmatization using POS
def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN))  for word, pos in pos_tagged_text])


In [None]:
#Passing the function to 'text_rare' and store in 'text_lemma'
df["text_lemma"] = df["text_rare"].apply(lemmatize_words)

In [None]:
df["text_lemma"].head()

0    bell_allie southendairport probably fly schedu...
1                           rgrosjean go pietro go____
2    submission form website work send email receiv...
3    gatwick_airport hi pisa delay 2 hour kind seri...
4    brothers_beyond hi brothers_beyond ask ref lut...
Name: text_lemma, dtype: object

In [None]:
df.count()

Date            89544
Country         89544
Hit Sentence    89544
text_lower      89544
text_punct      89544
text_stop       89544
text_common     89544
text_rare       89544
text_urls           0
text_token      89544
text_lemma      89544
dtype: int64

In [None]:
df.to_csv('/Users/nikolajfrandsen/Desktop/EasyJet_PreProcessed_ORIGINAL.csv')