IMPORTING THE LIBRARIES

In [61]:
import pandas as pd
import spacy
import string
import re
import unicodedata
import contractions
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Load English language model for spaCy
nlp = spacy.load('en_core_web_sm')

# Define the list of stopwords
STOP_WORDS = set(stopwords.words('english'))
                 
custom_stopwords = set(['ryanair', 'flight', 'airport', 'us', 'one', 'airline' 'would','get','told', 'could', 'even', 'got', 'also', 'another', 'ever', 'like', 'way', 'go', 'asked', 'every', '2', 'plane'])
STOP_WORDS = STOP_WORDS.union(custom_stopwords)

# Define the function to expand contractions
def expand(x):
    """Expand contractions in a sentence."""
    return contractions.fix(x)

def cleaning(text):        
    # converting to lowercase, removing URL links, special characters, punctuations...
    text = text.lower() # converting to lowercase
    text = re.sub('https?://\S+|www\.\S+', '', text) # removing URL links
    text = re.sub(r"\b\d+\b", "", text) # removing number 
    text = re.sub('<.*?>+', '', text) # removing special characters, 
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text) # punctuations
    text = re.sub('\n', '', text)
    text = re.sub('[’“”…]', '', text)
   
    #removing emoji: 
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)   

   # removing short form: 
    
    text=re.sub("isn't",'is not',text)
    text=re.sub("he's",'he is',text)
    text=re.sub("wasn't",'was not',text)
    text=re.sub("there's",'there is',text)
    text=re.sub("couldn't",'could not',text)
    text=re.sub("won't",'will not',text)
    text=re.sub("they're",'they are',text)
    text=re.sub("she's",'she is',text)
    text=re.sub("There's",'there is',text)
    text=re.sub("wouldn't",'would not',text)
    text=re.sub("haven't",'have not',text)
    text=re.sub("That's",'That is',text)
    text=re.sub("you've",'you have',text)
    text=re.sub("He's",'He is',text)
    text=re.sub("what's",'what is',text)
    text=re.sub("weren't",'were not',text)
    text=re.sub("we're",'we are',text)
    text=re.sub("hasn't",'has not',text)
    text=re.sub("you'd",'you would',text)
    text=re.sub("shouldn't",'should not',text)
    text=re.sub("let's",'let us',text)
    text=re.sub("they've",'they have',text)
    text=re.sub("You'll",'You will',text)
    text=re.sub("i'm",'i am',text)
    text=re.sub("we've",'we have',text)
    text=re.sub("it's",'it is',text)
    text=re.sub("don't",'do not',text)
    text=re.sub("that´s",'that is',text)
    text=re.sub("I´m",'I am',text)
    text=re.sub("it’s",'it is',text)
    text=re.sub("she´s",'she is',text)
    text=re.sub("he’s'",'he is',text)
    text=re.sub('I’m','I am',text)
    text=re.sub('I’d','I did',text)
    text=re.sub("he’s'",'he is',text)
    text=re.sub('there’s','there is',text)
    
     
    return text


# Define the function to remove accented characters
def remove_accented_chars(x):
    """Remove accented characters from a sentence."""
    x = unicodedata.normalize('NFKD', x).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return x

# Define the function to lemmatize words
def make_to_base(x):
    """Lemmatize words in a sentence."""
    x_list = []
    doc = nlp(x)
    for token in doc:
        lemma = str(token.lemma_)
        # Retain original text for pronouns and 'be'
        if lemma == '-PRON-' or lemma == 'be':
            lemma = token.text
        x_list.append(lemma)
    return " ".join(x_list)

# # Define the preprocessing function
# def preprocess(df, d):
#     """Preprocess a given document."""
#     df[d] = df[d].apply(lambda x: x.lower())
#     df[d] = df[d].apply(expand)
#     df[d] = df[d].apply(lambda x: re.sub('[^A-Za-z0-9\s-]+', '', x))
#     df[d] = df[d].apply(lambda x: " ".join(x.split()))
#     df[d] = df[d].apply(remove_accented_chars)
#     df[d] = df[d].apply(make_to_base)
#     df[d] = df[d].apply(lambda x: " ".join([t for t in x.split() if t not in STOP_WORDS]))

#     # Count the frequency of each word
#     word_counts = Counter(" ".join(df[d]).split())
    
#     # Identify the top 5 most frequent words
#     top_5_words = [word for word, _ in word_counts.most_common(10)]
    
#     print(top_5_words)

#     # Remove the top 5 most frequent words
#     df[d] = df[d].apply(lambda x: " ".join([t for t in x.split() if t not in top_5_words]))


# Load the dataset
df = pd.read_csv('C:\\Users\\aashi\\Desktop\\GA Tech docs\\Seventh Sem Spring 24\\ISYE 6740 - Computational Data Analysis\\Project\\Our Project\\data\\data\\ryanair_reviews_sentiments_v1.csv')

# Selecting only columns 'A' and 'B'
df_selected_cols = df[['Comment title', 'Comment', 'Sentiment']]

# Combine two columns with a period in between
df_selected_cols['Review'] = df_selected_cols['Comment title'] + '. ' + df_selected_cols['Comment']

# Drop the 'first_part' and 'second_part' columns
df_selected_cols = df_selected_cols.drop(['Comment title', 'Comment'], axis=1)

# # Preprocess the comments
# preprocess(df_selected_cols, 'Review')

df_selected_cols['Review'] = df_selected_cols['Review'].apply(cleaning)

# remove stop word: 
df_selected_cols['Cust_review'] = df_selected_cols['Review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (STOP_WORDS)]))

df_selected_cols = df_selected_cols[df_selected_cols.Sentiment != 'Neutral'].drop(columns=['Review'])

print(df_selected_cols.head())




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


  Sentiment                                        Cust_review
0  Positive  bang time smooth flights flew back faro london...
1  Positive  good affordable good affordable time pleasant ...
2  Positive  really impressed really impressed pay cost £ s...
3  Positive  decent offering review faro liverpool booked s...
4  Positive  cabin crew welcoming friendly left gate ahead ...


In [62]:
# print(df_selected_cols[df_selected_cols['Sentiment']=='Negative'])

df_selected_cols['Sentiment'].value_counts()

Positive    1420
Negative     804
Name: Sentiment, dtype: int64

In [58]:
# #Working with the most Frequent Words: 
# from collections import Counter
# cnt = Counter()
# for text in df_selected_cols["no_sw"].values:
#     for word in text.split():
#         cnt[word] += 1
# cnt.most_common(10)
# temp = pd.DataFrame(cnt.most_common(10))
# temp.columns=['word', 'count']
# temp

Unnamed: 0,word,count
0,time,1362
1,customer,1208
2,boarding,1161
3,staff,991
4,airline,953
5,pay,929
6,check,916
7,service,885
8,would,880
9,crew,772


In [52]:
# # Remove the most frequent words:
# FREQWORDS = set([w for (w, wc) in cnt.most_common(10)])
# def remove_freqwords(text):
#     """custom function to remove the frequent words"""
#     return " ".join([word for word in str(text).split() if word not in FREQWORDS])
# df_selected_cols["wo_stopfreq"] = df_selected_cols["no_sw"].apply(lambda text: remove_freqwords(text))
# df_selected_cols.head()

Unnamed: 0,Sentiment,Review,no_sw,wo_stopfreq
0,Positive,bang on time and smooth flights flew back from...,bang time smooth flights flew back faro london...,bang smooth flights flew back faro london luto...
1,Positive,another good affordable flight another good af...,another good affordable flight another good af...,another good affordable another good affordabl...
2,Positive,really impressed really impressed you get what...,really impressed really impressed get pay flig...,really impressed really impressed get cost £ s...
3,Positive,a decent offering from ryanair i should like t...,decent offering ryanair like review flight far...,decent offering like review faro liverpool boo...
4,Positive,cabin crew were welcoming and friendly flight ...,cabin crew welcoming friendly flight left gate...,cabin crew welcoming friendly left gate ahead ...


In [63]:
# create the cleaned data for the train-test split:
df_selected_cols.columns=['Sentiment','Cust_review']
df_selected_cols.Sentiment = [0 if each == "Negative" else 1 for each in df_selected_cols.Sentiment]
df_selected_cols

Unnamed: 0,Sentiment,Cust_review
0,1,bang time smooth flights flew back faro london...
1,1,good affordable good affordable time pleasant ...
2,1,really impressed really impressed pay cost £ s...
3,1,decent offering review faro liverpool booked s...
4,1,cabin crew welcoming friendly left gate ahead ...
...,...,...
2244,1,customer review daughter took holiday kos neve...
2245,0,customer review stansted pula tried adhere rul...
2246,1,customer review printing boarding tickets outb...
2247,1,customer review budapest manchester back month...


In [64]:
tokenized_review=df_selected_cols['Cust_review'].apply(lambda x: x.split())
tokenized_review.head()

0    [bang, time, smooth, flights, flew, back, faro...
1    [good, affordable, good, affordable, time, ple...
2    [really, impressed, really, impressed, pay, co...
3    [decent, offering, review, faro, liverpool, bo...
4    [cabin, crew, welcoming, friendly, left, gate,...
Name: Cust_review, dtype: object

In [73]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
text_counts = cv.fit_transform(df_selected_cols['Cust_review'])


In [66]:
from sklearn.model_selection import train_test_split
import numpy as np

# Get the number of rows (observations) in the sparse matrix
num_rows = text_counts.shape[0]

# Create shuffled indices
shuffled_indices = np.random.permutation(num_rows)

# Use shuffled indices to shuffle the data
X_shuffled = text_counts[shuffled_indices]
y_shuffled = df_selected_cols['Sentiment'].iloc[shuffled_indices]

# Split the shuffled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_shuffled, y_shuffled, test_size=0.20, random_state=42)


In [68]:
y_train.value_counts()
y_test.value_counts()

1    286
0    159
Name: Sentiment, dtype: int64

In [69]:
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import classification_report, confusion_matrix
CNB = ComplementNB()
CNB.fit(X_train, y_train)

from sklearn import metrics
predicted = CNB.predict(X_test)
accuracy_score = metrics.accuracy_score(predicted, y_test)

print('ComplementNB model accuracy is',str('{:04.2f}'.format(accuracy_score*100))+'%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test, predicted))

ComplementNB model accuracy is 74.61%
------------------------------------------------
Confusion Matrix:
     0    1
0  103   56
1   57  229
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.65      0.65       159
           1       0.80      0.80      0.80       286

    accuracy                           0.75       445
   macro avg       0.72      0.72      0.72       445
weighted avg       0.75      0.75      0.75       445



In [71]:
from sklearn.naive_bayes import MultinomialNB

MNB = MultinomialNB()
MNB.fit(X_train, y_train)

predicted = MNB.predict(X_test)
accuracy_score = metrics.accuracy_score(predicted, y_test)

print('MultinominalNB model accuracy is',str('{:04.2f}'.format(accuracy_score*100))+'%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test, predicted))

MultinominalNB model accuracy is 73.71%
------------------------------------------------
Confusion Matrix:
    0    1
0  98   61
1  56  230
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.62      0.63       159
           1       0.79      0.80      0.80       286

    accuracy                           0.74       445
   macro avg       0.71      0.71      0.71       445
weighted avg       0.74      0.74      0.74       445



In [72]:
from sklearn.naive_bayes import BernoulliNB

BNB = BernoulliNB()
BNB.fit(X_train, y_train)

predicted = BNB.predict(X_test)
accuracy_score_bnb = metrics.accuracy_score(predicted,y_test)

print('BernoulliNB model accuracy = ' + str('{:4.2f}'.format(accuracy_score_bnb*100))+'%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test, predicted))

BernoulliNB model accuracy = 69.21%
------------------------------------------------
Confusion Matrix:
    0    1
0  74   85
1  52  234
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.59      0.47      0.52       159
           1       0.73      0.82      0.77       286

    accuracy                           0.69       445
   macro avg       0.66      0.64      0.65       445
weighted avg       0.68      0.69      0.68       445

