In [1]:
import ast
import warnings
import textwrap

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.auto import tqdm

# NLTK packages
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Spacy
import spacy

warnings.filterwarnings("ignore")

# For better visualizations
sns.set(style="ticks", palette="muted", color_codes=True)

plt.rcParams['axes.labelsize'] = 15
plt.rcParams['axes.titlesize'] = 20
plt.rcParams['font.weight'] = 'bold'

# downloading corpus
nltk.download('stopwords')
nltk.download('punkt')

stopwords = set(stopwords.words("english"))

# Remove 'not' from stopword list
stopwords.remove('not')

# Load the spaCy English model
# https://github.com/explosion/spaCy/issues/6498
nlp = spacy.load("en_core_web_lg", disable=["parser", 'ner'])
# nlp.add_pipe('sentencizer')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def wrap_text(text):
    return textwrap.fill(text, width=85)

In [3]:
df=pd.read_csv("qatarairways_tweets_sentiments.csv", parse_dates=['date'])

In [4]:
df.head()

Unnamed: 0,link,text,date,Likes,Comments,hashtags,sentiment
0,https://twitter.com/juliet_gough/status/174158...,It was fantastic service onboard. I'm so impre...,2023-12-31 22:23:00+00:00,0,0,[],"{'label': 'positive', 'score': 0.9895235}"
1,https://twitter.com/theamaeestales/status/1741...,@qrsupport is there a problem with your app? I...,2023-12-31 19:51:00+00:00,0,0,[],"{'label': 'negative', 'score': 0.7817413}"
2,https://twitter.com/rvvaradan/status/174143019...,I have reported the incident. Hoping to get a ...,2023-12-31 12:04:00+00:00,1,3,[],"{'label': 'neutral', 'score': 0.6894269}"
3,https://twitter.com/ManojKa15016293/status/174...,Not settling dues for more than 3 years . Appr...,2023-12-31 10:19:00+00:00,2,2,[],"{'label': 'positive', 'score': 0.83495665}"
4,https://twitter.com/nkonialidis/status/1741377...,Kindly communicate better about the upcoming r...,2023-12-31 08:34:00+00:00,1,3,[],"{'label': 'neutral', 'score': 0.82786304}"


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65793 entries, 0 to 65792
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype              
---  ------     --------------  -----              
 0   link       65793 non-null  object             
 1   text       65793 non-null  object             
 2   date       65793 non-null  datetime64[ns, UTC]
 3   Likes      65793 non-null  int64              
 4   Comments   65793 non-null  int64              
 5   hashtags   65793 non-null  object             
 6   sentiment  65793 non-null  object             
dtypes: datetime64[ns, UTC](1), int64(2), object(4)
memory usage: 3.5+ MB


# Preprocess Text for EDA

In [6]:
# https://github.com/kootenpv/contractions
import contractions

eg_str = df.loc[199, 'text']
print(wrap_text(eg_str))
print()
print(wrap_text(contractions.fix(eg_str)))

@qrsupport @qatarairways I’m genuinely disappointed that my exist seat that I booked
for my 16hrs outward flight tomorrow to LAX was given to another passenger when I
only changed the date of my return flight. I’ve been a privilege member for 10+
years. Wasn’t offered a solution.

@qrsupport @qatarairways I am genuinely disappointed that my exist seat that I booked
for my 16hrs outward flight tomorrow to LAX was given to another passenger when I
only changed the date of my return flight. I have been a privilege member for 10+
years. Was not offered a solution.


In [7]:
# Remove stopwords function
def remove_sw(text):
    tokenized_text = word_tokenize(text)
    # Remove stopwords and those words that is of length<=2
    processed_text = (word for word in tokenized_text if not word in stopwords and len(word) > 2)

    return " ".join(processed_text)

print(wrap_text(eg_str))
print()
print(wrap_text(remove_sw(contractions.fix(eg_str))))

@qrsupport @qatarairways I’m genuinely disappointed that my exist seat that I booked
for my 16hrs outward flight tomorrow to LAX was given to another passenger when I
only changed the date of my return flight. I’ve been a privilege member for 10+
years. Wasn’t offered a solution.

qrsupport qatarairways genuinely disappointed exist seat booked 16hrs outward flight
tomorrow LAX given another passenger changed date return flight privilege member 10+
years Was not offered solution


In [8]:
# Lemmatization
# https://stackoverflow.com/a/75215495/15937542
def lemmatize_pipe(text_col):
    
    docs = nlp.pipe(text_col)
    lemmatized_col = [lemmatize_text_spacy(doc) for doc in docs]

    return lemmatized_col


def lemmatize_text_spacy(doc):

    # Extract lemmatized words
    # The word refunded does not get lemmatized, hence adding this special condition
    lemmatized_words = (token.lemma_ if token.text != "refunded" else "refund" for token in doc)

    # Join the lemmatized words to form the lemmatized text
    lemmatized_text = ' '.join(lemmatized_words)

    return lemmatized_text

print(wrap_text(eg_str))
print()

doc=nlp(remove_sw(contractions.fix(eg_str)))
print(wrap_text(lemmatize_text_spacy(doc)))

@qrsupport @qatarairways I’m genuinely disappointed that my exist seat that I booked
for my 16hrs outward flight tomorrow to LAX was given to another passenger when I
only changed the date of my return flight. I’ve been a privilege member for 10+
years. Wasn’t offered a solution.

qrsupport qatarairways genuinely disappointed exist seat book 16hrs outward flight
tomorrow LAX give another passenger change date return flight privilege member 10 +
year be not offer solution


In [9]:
df = (
    df.drop_duplicates(subset='text', ignore_index=True)
    .assign(
        
        # Extract usernames
        users=lambda x: x['link'].str.extract(r'twitter\.com/(.*?)/status', expand=False),

        # For the `cleaned_text` column, first the `text` undergoes all preprocessing steps such as
        # removing user hanldes, urls, remove empty spaces, stopwords, etc.
        # Finally, the whole proprocessed `text` column is then inputted into the 
        # "lemmatize_pipe" function to process lemmatization quickly.
        cleaned_text=lambda x: lemmatize_pipe(
            # lower case
            x['text'].str.lower()
            
            # Remove all userhandles
            .str.replace(r'@\w+', ' ', regex=True)
            
            # Remove all url links
            .str.replace(r'https*.*? *|www\..*? *|bit\..*? *', " ", 
                         regex=True)

            # Apply Contractions function
            .apply(contractions.fix)

            # Remove everything other characters extept alphabets
            .str.replace(r'[^a-z]', ' ', regex=True)

            # Remove all extra spaces
            .str.replace('\s+', " ", regex=True)

            # Remove leading and trailing spaces
            .str.strip()

            # Remove stopwords
            .apply(remove_sw)
            ),
        
        # Create new column that contain sentiment label
        sntmnt_lbl=lambda x: (x['sentiment']
                        .apply(ast.literal_eval)
                        .apply(lambda x: x['label'])),

        # Create new column that contain score for sentiment label
        score=lambda x: (x['sentiment']
                        .apply(ast.literal_eval)
                        .apply(lambda x: x['score'])),
    )
    # drop sentiment column
    .drop('sentiment', axis=1)

    # remove those rows with just empty text after preprocessing
    .loc[lambda x:x['cleaned_text'].ne('')]
    .reset_index(drop=True)
)

df.head()

Unnamed: 0,link,text,date,Likes,Comments,hashtags,users,cleaned_text,sntmnt_lbl,score
0,https://twitter.com/juliet_gough/status/174158...,It was fantastic service onboard. I'm so impre...,2023-12-31 22:23:00+00:00,0,0,[],juliet_gough,fantastic service onboard impressed thank,positive,0.989524
1,https://twitter.com/theamaeestales/status/1741...,@qrsupport is there a problem with your app? I...,2023-12-31 19:51:00+00:00,0,0,[],theamaeestales,problem app not login account,negative,0.781741
2,https://twitter.com/rvvaradan/status/174143019...,I have reported the incident. Hoping to get a ...,2023-12-31 12:04:00+00:00,1,3,[],rvvaradan,report incident hope get resolution soon,neutral,0.689427
3,https://twitter.com/ManojKa15016293/status/174...,Not settling dues for more than 3 years . Appr...,2023-12-31 10:19:00+00:00,2,2,[],ManojKa15016293,not settle due year appreciate emirates airway...,positive,0.834957
4,https://twitter.com/nkonialidis/status/1741377...,Kindly communicate better about the upcoming r...,2023-12-31 08:34:00+00:00,1,3,[],nkonialidis,kindly communicate well upcoming rebooking,neutral,0.827863


In [10]:
# Compare original vs final preprocessed text
print(wrap_text(eg_str))
print()
print(wrap_text(df.loc[193, 'cleaned_text']))

@qrsupport @qatarairways I’m genuinely disappointed that my exist seat that I booked
for my 16hrs outward flight tomorrow to LAX was given to another passenger when I
only changed the date of my return flight. I’ve been a privilege member for 10+
years. Wasn’t offered a solution.

genuinely disappointed exist seat book hrs outward flight tomorrow lax give another
passenger change date return flight privilege member year not offer solution


## Keyword/phrase Extraction

### KeyBert

In [11]:
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
vectorizer = CountVectorizer(max_df=0.7, stop_words='english', ngram_range=(1, 2))


model = KeyBERT(model=sentence_model)

def extract_keywords(texts):

    kw_list = model.extract_keywords(
        texts,
        vectorizer=vectorizer,
        top_n=3,
        use_mmr=True,
        diversity=0.7
    )

    keywords=[",".join((k[0] for k in kw_tup if "qatar" not in k)) for kw_tup in kw_list]
    
    return keywords




In [13]:
batch_size=64
nbr_of_batches=int(np.ceil(len(df)/batch_size))

df_batches = np.array_split(df['cleaned_text'].to_list(), nbr_of_batches)

keyword_list=[]

for batch in tqdm(df_batches):
    keywords_from_batch = extract_keywords(texts=batch)
    keyword_list.extend(keywords_from_batch)

  0%|          | 0/991 [00:00<?, ?it/s]

In [14]:
keyword_list

['service onboard,fantastic service,impressed thank',
 'app login,account,problem',
 'report incident,hope resolution,soon',
 'appreciate emirates,settle year,airways respect',
 'upcoming rebooking,kindly communicate,communicate upcoming',
 'cancel flight,live chat,believe slowness',
 'unable log,flight,require character',
 'airline hope,trip advantage,new customer',
 'file ref,send file,thank send',
 'flight kathmandu,assistance message,miss',
 'baggage final,doh continue,member check',
 'seat pay,toddler jan,bulk row',
 'check airport,service cape,long srop',
 'yes response,till day,yes',
 'kindly respond,kindly,respond',
 'flight doha,compensation delay,right html',
 'seat airport,month advance,exit row',
 'help qatarairway,check mother,center number',
 'share',
 'booking qatar,remark club,add baby',
 'qatar airway,jan twice,confirm mean',
 'qatar airway,jan twice,confirm mean',
 'october bank,receive email,information twice',
 'stroller airport,infant,website say',
 'thailand trans

In [15]:
df['keybert_kw']=keyword_list

In [16]:
df.loc[:5, ['cleaned_text', 'keybert_kw']]

Unnamed: 0,cleaned_text,keybert_kw
0,fantastic service onboard impressed thank,"service onboard,fantastic service,impressed thank"
1,problem app not login account,"app login,account,problem"
2,report incident hope get resolution soon,"report incident,hope resolution,soon"
3,not settle due year appreciate emirates airway...,"appreciate emirates,settle year,airways respect"
4,kindly communicate well upcoming rebooking,"upcoming rebooking,kindly communicate,communic..."
5,not believe slowness cancel flight not inform ...,"cancel flight,live chat,believe slowness"


### Yake

In [17]:
from joblib import Parallel, delayed
import yake

# Initialize YAKE extractor outside the function for better performance
yake_kw = yake.KeywordExtractor(n=2, top=5, windowsSize=100)

def yake_extract_kw(text):
    try:
        # Extracting keywords 
        KeyWords = yake_kw.extract_keywords(text) 

        # Displaying top 5 keywords 
        keywords = ",".join((kw for kw, _ in KeyWords if "qatar" not in kw))
        return keywords
    
    except Exception as e:
        print(f"Error processing text: {text}")
        print(f"Error details: {e}")
        return ""

# https://stackoverflow.com/questions/42220458/what-does-the-delayed-function-do-when-used-with-joblib-in-python
def process_texts_parallel(texts, n_jobs=-1):
    results = Parallel(n_jobs=n_jobs)(delayed(yake_extract_kw)(text) for text in texts)
    return results

In [18]:
df['yake_kw']=process_texts_parallel(df['cleaned_text'].tolist())

In [19]:
df.to_csv("qatarairways_tweets_sentiments_with_keywords.csv", index=False)