# General workflow
from sklearn.decomposition import NMF 

# Template of the NMF workflow: factorize X into W (documents x topics) and H (topics x terms).
# NOTE(review): X is not defined in this snippet; presumably it stands for the document-term matrix built later in the notebook.
nmf_model =  NMF(n_components=2, random_state=42) # it is best practice to start with n_components=2 and increase by one at a time

W = nmf_model.fit_transform(X)

H = nmf_model.components_

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import text_preprocessing

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the airline reviews dataset and preview the first rows.
df = pd.read_csv('airlines_reviews.csv')
df.head()

Unnamed: 0,Title,Name,Review Date,Airline,Verified,Reviews,Type of Traveller,Month Flown,Route,Class,Seat Comfort,Staff Service,Food & Beverages,Inflight Entertainment,Value For Money,Overall Rating,Recommended
0,Flight was amazing,Alison Soetantyo,2024-03-01,Singapore Airlines,True,Flight was amazing. The crew onboard this fl...,Solo Leisure,December 2023,Jakarta to Singapore,Business Class,4,4,4,4,4,9,yes
1,seats on this aircraft are dreadful,Robert Watson,2024-02-21,Singapore Airlines,True,Booking an emergency exit seat still meant h...,Solo Leisure,February 2024,Phuket to Singapore,Economy Class,5,3,4,4,1,3,no
2,Food was plentiful and tasty,S Han,2024-02-20,Singapore Airlines,True,Excellent performance on all fronts. I would...,Family Leisure,February 2024,Siem Reap to Singapore,Economy Class,1,5,2,1,5,10,yes
3,“how much food was available,D Laynes,2024-02-19,Singapore Airlines,True,Pretty comfortable flight considering I was f...,Solo Leisure,February 2024,Singapore to London Heathrow,Economy Class,5,5,5,5,5,10,yes
4,“service was consistently good”,A Othman,2024-02-19,Singapore Airlines,True,The service was consistently good from start ...,Family Leisure,February 2024,Singapore to Phnom Penh,Economy Class,5,5,5,5,5,10,yes


In [4]:
# Clean the raw review text with the project-local text_preprocessing helper and keep the result in a new column.
# NOTE(review): judging by the preview below it drops stop words and reduces words to a base form - confirm against the helper.
df['clean_reviews'] = text_preprocessing.clean_normalize(df['Reviews'])

In [5]:
df.head(2)

Unnamed: 0,Title,Name,Review Date,Airline,Verified,Reviews,Type of Traveller,Month Flown,Route,Class,Seat Comfort,Staff Service,Food & Beverages,Inflight Entertainment,Value For Money,Overall Rating,Recommended,clean_reviews
0,Flight was amazing,Alison Soetantyo,2024-03-01,Singapore Airlines,True,Flight was amazing. The crew onboard this fl...,Solo Leisure,December 2023,Jakarta to Singapore,Business Class,4,4,4,4,4,9,yes,Flight amazing crew onboard flight welcomin...
1,seats on this aircraft are dreadful,Robert Watson,2024-02-21,Singapore Airlines,True,Booking an emergency exit seat still meant h...,Solo Leisure,February 2024,Phuket to Singapore,Economy Class,5,3,4,4,1,3,no,book emergency exit seat mean huge discomfo...


In [6]:
df.shape

(8100, 18)

In [7]:
# Use TfidfVectorizer to build the document-term matrix the topics are derived from.
# In topic modeling we want distinctive terms, so very rare and very common terms are filtered out:
# float min_df / max_df are document-frequency proportions (keep terms found in at least 5% and at most 20% of the reviews).
tv = TfidfVectorizer(stop_words='english', min_df=.05, max_df=.2) # tune params to get terms that would help to come up with topic names
Xt = tv.fit_transform(df.clean_reviews)
# Dense DataFrame view of the sparse matrix, one column per vocabulary term
Xt_df = pd.DataFrame(Xt.toarray(), columns = tv.get_feature_names_out())
Xt_df

Unnamed: 0,10,a380,able,air,aircraft,airlines,airways,allow,arrival,arrive,...,use,ve,wait,want,water,way,wifi,wine,work,year
0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.000000,0.0,0.000000,0.367415,0.0,0.0,0.000000,0.000000,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.0,0.000000,0.454537,0.0,0.0,0.000000,0.000000,0.0,...,0.509564,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.0,0.000000,0.167996,0.0,0.0,0.000000,0.000000,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.209513,0.000000,0.184545,0.000000
4,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8095,0.000000,0.000000,0.0,0.462304,0.000000,0.0,0.0,0.000000,0.000000,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000
8096,0.187139,0.000000,0.0,0.145380,0.000000,0.0,0.0,0.000000,0.000000,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.166391
8097,0.000000,0.000000,0.0,0.402975,0.000000,0.0,0.0,0.000000,0.000000,0.0,...,0.000000,0.0,0.0,0.152897,0.0,0.0,0.000000,0.000000,0.000000,0.000000
8098,0.000000,0.190034,0.0,0.280665,0.000000,0.0,0.0,0.191678,0.182549,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000


Now create the NMF model.

In [8]:
from sklearn.decomposition import NMF

In [9]:
# Fit a 2-topic NMF model.
# random_state is pinned so that re-running the cell reproduces the same factorization
# (same seed as the other seeded models in this notebook).
nmf = NMF(n_components=2, random_state=42)
W = nmf.fit_transform(Xt_df) # documents x topics
H = nmf.components_ # topics x terms
H.shape
# H has one column per term in Xt_df (177 here)



(2, 177)

In [10]:
# We would like to see the actual terms that are associated with each topic,
# so wrap the lookup in a reusable helper function.
def display_topics(H, num_words=10, feature_names=None):
    """Print the highest-weighted terms of every topic.

    Args:
        H: Topic-term weight matrix with one row per topic, e.g.
            ``nmf.components_``. Each row must support ``argsort()``.
        num_words: Number of top terms to print per topic.
        feature_names: Optional sequence mapping a column index of ``H`` to
            its term. When omitted, the vocabulary of the notebook-level
            vectorizer ``tv`` is used.
    """
    # Fetch the vocabulary once instead of once per printed word.
    if feature_names is None:
        feature_names = tv.get_feature_names_out()
    for topic_num, topic_array in enumerate(H):
        # argsort() returns the indices in ascending weight order rather than
        # the sorted values; reverse it and keep the first num_words indices.
        top_features = topic_array.argsort()[::-1][:num_words]
        top_words = [feature_names[i] for i in top_features]
        print('Topic', topic_num+1, ":", ", ".join(top_words))


In [11]:
display_topics(H)

Topic 1 : doha, qatar, economy, singapore, great, comfortable, excellent, airways, lounge, serve
Topic 2 : turkish, istanbul, ticket, customer, tell, airlines, day, pay, say, ask


# Tune an NMF model
The goal is to take the basic model, try out different numbers of components (topics), and inspect the results each time, so that we can keep tweaking the model until the topics make more sense.

In [12]:
# Refit with 3 topics; random_state is pinned so the printed topics are reproducible between runs.
nmf = NMF(n_components=3, random_state=42)
W = nmf.fit_transform(Xt_df)  # documents x topics
H = nmf.components_  # topics x terms

In [13]:
def display_topics(H, num_words=10, feature_names=None):
    """Print the highest-weighted terms of every topic.

    Args:
        H: Topic-term weight matrix with one row per topic, e.g.
            ``nmf.components_``. Each row must support ``argsort()``.
        num_words: Number of top terms to print per topic.
        feature_names: Optional sequence mapping a column index of ``H`` to
            its term. When omitted, the vocabulary of the notebook-level
            vectorizer ``tv`` is used.
    """
    # Fetch the vocabulary once instead of once per printed word.
    if feature_names is None:
        feature_names = tv.get_feature_names_out()
    for topic_num, topic_array in enumerate(H):
        # Indices of the largest weights first, truncated to num_words.
        top_features = topic_array.argsort()[::-1][:num_words]
        top_words = [feature_names[i] for i in top_features]
        print("Topic", topic_num+1, ":", ', '.join(top_words))

In [14]:
display_topics(H)

Topic 1 : singapore, economy, great, serve, comfortable, drink, entertainment, excellent, hong, kong
Topic 2 : turkish, istanbul, ticket, customer, tell, airlines, day, say, pay, luggage
Topic 3 : qatar, doha, airways, lounge, thank, great, travel, comfortable, aircraft, excellent


In [15]:
# Refit with 4 topics; random_state is pinned so the printed topics are reproducible between runs.
nmf = NMF(n_components=4, random_state=42)
W = nmf.fit_transform(Xt_df)  # documents x topics
H = nmf.components_  # topics x terms

In [16]:
display_topics(H)

Topic 1 : singapore, economy, great, serve, comfortable, drink, hong, kong, entertainment, excellent
Topic 2 : turkish, istanbul, ticket, customer, airlines, tell, day, delay, luggage, say
Topic 3 : qatar, doha, airways, lounge, thank, great, travel, comfortable, excellent, aircraft
Topic 4 : emirates, dubai, a380, travel, ask, year, pay, bad, london, lounge


In [17]:
# Final model with 5 topics: the seed is fixed for reproducibility and max_iter is raised to 500
# (NOTE(review): presumably to give the solver more room to converge - confirm).
nmf = NMF(n_components=5, random_state=42, max_iter=500)
W = nmf.fit_transform(Xt_df)  # documents x topics
H = nmf.components_  # topics x terms


In [18]:
display_topics(H)

Topic 1 : singapore, economy, great, serve, comfortable, hong, kong, drink, entertainment, excellent
Topic 2 : turkish, istanbul, ticket, airlines, customer, tell, day, change, say, pay
Topic 3 : qatar, doha, airways, lounge, thank, great, travel, comfortable, aircraft, excellent
Topic 4 : emirates, dubai, travel, a380, ask, pay, bad, check, year, book
Topic 5 : air, france, paris, bag, luggage, delay, baggage, check, arrive, day


In [24]:
# Document-topic weights as a table: one row per review, one column per topic.
# The labels are the names chosen by hand for the five topics printed above.
doc_topics = pd.DataFrame(
    W,
    columns=['Singapore', 'Turkish_Airlines', 'Qatar_Airways', 'Emirates', 'Air_France'],
)


In [27]:
df.head(2)

Unnamed: 0,Title,Name,Review Date,Airline,Verified,Reviews,Type of Traveller,Month Flown,Route,Class,Seat Comfort,Staff Service,Food & Beverages,Inflight Entertainment,Value For Money,Overall Rating,Recommended,clean_reviews
0,Flight was amazing,Alison Soetantyo,2024-03-01,Singapore Airlines,True,Flight was amazing. The crew onboard this fl...,Solo Leisure,December 2023,Jakarta to Singapore,Business Class,4,4,4,4,4,9,yes,Flight amazing crew onboard flight welcomin...
1,seats on this aircraft are dreadful,Robert Watson,2024-02-21,Singapore Airlines,True,Booking an emergency exit seat still meant h...,Solo Leisure,February 2024,Phuket to Singapore,Economy Class,5,3,4,4,1,3,no,book emergency exit seat mean huge discomfo...


In [25]:
doc_topics

Unnamed: 0,Singapore,Turkish_Airlines,Qatar_Airways,Emirates,Air_France
0,0.031697,0.000000,0.004261,0.000000,0.002480
1,0.020325,0.014977,0.006685,0.010079,0.008766
2,0.026849,0.000000,0.014085,0.002641,0.000000
3,0.063159,0.000000,0.000000,0.000000,0.000000
4,0.030470,0.004610,0.011087,0.007673,0.004904
...,...,...,...,...,...
8095,0.022590,0.000000,0.008109,0.001569,0.102289
8096,0.053620,0.009736,0.017257,0.009723,0.042543
8097,0.032177,0.000000,0.000000,0.012636,0.086348
8098,0.023989,0.003890,0.017592,0.012830,0.118584


In [28]:
# attach the topic weights to the original review text
pd.concat([df.Reviews, doc_topics], axis =1) # see topics and reviews side by side (rows are matched on the index)


Unnamed: 0,Reviews,Singapore,Turkish_Airlines,Qatar_Airways,Emirates,Air_France
0,Flight was amazing. The crew onboard this fl...,0.031697,0.000000,0.004261,0.000000,0.002480
1,Booking an emergency exit seat still meant h...,0.020325,0.014977,0.006685,0.010079,0.008766
2,Excellent performance on all fronts. I would...,0.026849,0.000000,0.014085,0.002641,0.000000
3,Pretty comfortable flight considering I was f...,0.063159,0.000000,0.000000,0.000000,0.000000
4,The service was consistently good from start ...,0.030470,0.004610,0.011087,0.007673,0.004904
...,...,...,...,...,...,...
8095,"KE124, Brisbane to Incheon (A330) and KE867,...",0.022590,0.000000,0.008109,0.001569,0.102289
8096,Our recent flight was our fourth trip to the...,0.053620,0.009736,0.017257,0.009723,0.042543
8097,I flew Korean Air from Bali to Seoul in Pres...,0.032177,0.000000,0.000000,0.012636,0.086348
8098,Seoul to Paris with Korean Air. I am traveli...,0.023989,0.003890,0.017592,0.012830,0.118584


In [29]:
# Keep the combined table (review text + one weight column per topic) for the analysis below.
reviews_topics =pd.concat([df.Reviews, doc_topics], axis =1) 
reviews_topics

Unnamed: 0,Reviews,Singapore,Turkish_Airlines,Qatar_Airways,Emirates,Air_France
0,Flight was amazing. The crew onboard this fl...,0.031697,0.000000,0.004261,0.000000,0.002480
1,Booking an emergency exit seat still meant h...,0.020325,0.014977,0.006685,0.010079,0.008766
2,Excellent performance on all fronts. I would...,0.026849,0.000000,0.014085,0.002641,0.000000
3,Pretty comfortable flight considering I was f...,0.063159,0.000000,0.000000,0.000000,0.000000
4,The service was consistently good from start ...,0.030470,0.004610,0.011087,0.007673,0.004904
...,...,...,...,...,...,...
8095,"KE124, Brisbane to Incheon (A330) and KE867,...",0.022590,0.000000,0.008109,0.001569,0.102289
8096,Our recent flight was our fourth trip to the...,0.053620,0.009736,0.017257,0.009723,0.042543
8097,I flew Korean Air from Bali to Seoul in Pres...,0.032177,0.000000,0.000000,0.012636,0.086348
8098,Seoul to Paris with Korean Air. I am traveli...,0.023989,0.003890,0.017592,0.012830,0.118584


# Combine Machine Learning techniques
Topic Modelling, Sentiment Analysis and EDA


In [32]:
# add another column that assigns a single topic (the highest-weighted one) to each review.
# The topic columns are selected by name rather than by position, so re-running this cell
# after extra columns (e.g. 'top_topic', 'sentiment') exist still compares only topic weights.
reviews_topics['top_topic'] = reviews_topics[doc_topics.columns].idxmax(axis=1)
reviews_topics.head(3)

Unnamed: 0,Reviews,Singapore,Turkish_Airlines,Qatar_Airways,Emirates,Air_France,top_topic
0,Flight was amazing. The crew onboard this fl...,0.031697,0.0,0.004261,0.0,0.00248,Singapore
1,Booking an emergency exit seat still meant h...,0.020325,0.014977,0.006685,0.010079,0.008766,Singapore
2,Excellent performance on all fronts. I would...,0.026849,0.0,0.014085,0.002641,0.0,Singapore


In [33]:
# add sentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Shared analyzer, created on first use. Building a new SentimentIntensityAnalyzer
# for every review repeats its setup work once per row.
_sentiment_analyzer = None


def get_sentiment(text):
    """Return the VADER compound sentiment score of ``text``.

    Args:
        text: The text to score.

    Returns:
        The ``'compound'`` entry of the analyzer's polarity scores.
    """
    global _sentiment_analyzer
    if _sentiment_analyzer is None:
        _sentiment_analyzer = SentimentIntensityAnalyzer()
    return _sentiment_analyzer.polarity_scores(text)['compound']

In [34]:
# Score every review (the original Reviews text, not the cleaned version) and store the compound score.
reviews_topics['sentiment'] = reviews_topics.Reviews.apply(get_sentiment)
reviews_topics.head(2)

Unnamed: 0,Reviews,Singapore,Turkish_Airlines,Qatar_Airways,Emirates,Air_France,top_topic,sentiment
0,Flight was amazing. The crew onboard this fl...,0.031697,0.0,0.004261,0.0,0.00248,Singapore,0.9754
1,Booking an emergency exit seat still meant h...,0.020325,0.014977,0.006685,0.010079,0.008766,Singapore,-0.8957


# EDA
For every topic, find the average sentiment score so that we know how customers feel about that topic.


In [36]:
# Mean sentiment score of the reviews assigned to each top topic
reviews_topics.groupby('top_topic')['sentiment'].mean()


top_topic
Air_France          0.222661
Emirates            0.128099
Qatar_Airways       0.581642
Singapore           0.592219
Turkish_Airlines   -0.188684
Name: sentiment, dtype: float64