# Part 1: Load Data

In [1]:
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore",category=DeprecationWarning)
import numpy as np
import pandas as pd
import seaborn as sns
pd.set_option('display.max_columns', 20)


In [2]:
peers = pd.read_csv("peers.csv")
germanwings = peers[peers.airline_code=="germanwings"].drop("germanwings",axis=1)

In [3]:
germanwings.head(1)

Unnamed: 0,cabin_service,country,date_flown,date_published,ground_service,id,name,rating,recommended,review_count,...,seat_comfort,seat_type,text,text_header,traveller_type,value_for_money,airline_name,airline_code,days_after_crash,review
3626,3.0,Netherlands,2019-06-01 00:00:00,2019-07-01,3.0,anchor667978,Sander van Kan,7.0,1.0,8,...,4.0,Economy Class,Dusseldorf to Berlin. Eurowings flight operat...,Seat was fine with enough legroom.,Couple Leisure,5.0,Germanwings,germanwings,1560,Seat was fine with enough legroom. Dusseldorf...


## Training a model

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(germanwings.review, 
                                                    germanwings.recommended, 
                                                    random_state=0)

print('Number of rows in the total set: {}'.format(germanwings.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))

Number of rows in the total set: 146
Number of rows in the training set: 109
Number of rows in the test set: 37


In [5]:
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer()
# Fit the training data and then return the matrix
training_data = count_vector.fit_transform(X_train)

# Transform testing data and return the matrix. Note we are not fitting the testing data into the CountVectorizer()
testing_data = count_vector.transform(X_test)

In [6]:
from sklearn.naive_bayes import MultinomialNB
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)
predictions = naive_bayes.predict(testing_data)

In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy score: ', format(accuracy_score(predictions,y_test)))
print('Precision score: ', format(precision_score(predictions,y_test)))
print('Recall score: ', format(recall_score(predictions,y_test)))
print('F1 score: ', format(f1_score(predictions,y_test)))

Accuracy score:  0.7297297297297297
Precision score:  0.6
Recall score:  0.8571428571428571
F1 score:  0.7058823529411764


In [9]:
pos_class_prob_sorted = naive_bayes.feature_log_prob_[0, :].argsort()
neg_class_prob_sorted = naive_bayes.feature_log_prob_[1, :].argsort()

print("Top 5 words predicting to recommend:\n" , np.take(count_vector.get_feature_names(), pos_class_prob_sorted[:5]))
print("\nTop emotions = emotion_analyzer_documents(pressed_reviews)
emotions5 words predicting to not recommend:\n", np.take(count_vector.get_feature_names(), neg_class_prob_sorted[:5]))


Top 5 words predicting to recommend:
 ['compartments' 'satisfied' 'saturday' 'move' 'saved']

Top 5 words predicting to not recommend:
 ['00' 'landing' 'lane' 'language' 'lasts']
