In [64]:
import numpy as np
import pandas as pd
import nltk
import re
from re import sub
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import f1_score, recall_score, accuracy_score, make_scorer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier


In [36]:
full_df = pd.read_csv('../../data/travel_full.csv')
en_df = full_df.dropna(subset=['Comment', "Tags"])
en_df = en_df[en_df["Lang"]=='EN']

In [37]:
def text_to_word_list(text):
    ''' Pre process and convert texts to a list of words. 
    Adapted from the code of Rafał Wójcik: https://towardsdatascience.com/unsupervised-sentiment-analysis-a38bf1906483'''
    
    text = str(text)
    text = text.lower()

    # Clean the text
    text = sub(r"[^A-Za-z0-9^,!?.\/'+]", " ", text)
    text = sub(r"\+", " plus ", text)
    text = sub(r",", " ", text)
    text = sub(r"\.", " ", text)
    text = sub(r"!", " ! ", text)
    text = sub(r"\?", " ? ", text)
    text = sub(r"'", " ", text)
    text = sub(r":", " : ", text)
    text = sub(r"\s{2,}", " ", text)

    text = text.split()

    return text

In [38]:
cleaned_df = en_df.copy()
cleaned_df.Comment = cleaned_df.Comment.apply(lambda x: text_to_word_list(x))
cleaned_df.Comment = cleaned_df.Comment.str.join(' ')
cleaned_df.head()

Unnamed: 0,Unique ID,Date,URL,Page title,Comment,Tags,Refining details,Status,What's wrong,Lang,Tags confirmed
0,60a063c82b6da5147c289783,"16 May, 2021",travel.gc.ca/travel-covid/travel-restrictions/...,Flying to Canada: COVID-19 testing for travell...,quero viajar de portugal para a inglaterra,Restrictions or Requirements,Travel outside Canada,New,Other reason,EN,checked
1,60a067a62b6da5147c28978d,"16 May, 2021",travel.gc.ca/travel-covid,"""COVID-19: Travel, testing, quarantine and bor...",if one is cleared of all tests and given the o...,Quarantine,Have been vaccinated,New,,EN,checked
2,60a06bde2b6da5147c289790,"16 May, 2021",travel.gc.ca/travel-covid/travel-restrictions/...,Mandatory quarantine or isolation – Travel res...,why if fully vaccinated would you need to quar...,Quarantine,Have been vaccinated,New,The information is hard to understand,EN,checked
3,60a06bfc2b6da5147c289791,"16 May, 2021",travel.gc.ca/travel-covid/travel-restrictions/...,Find out if you can travel to Canada - Citizen...,can i land in the us and cross border by car,Restrictions or Requirements,Driving - By land or water,New,,EN,checked
4,60a06c382b6da5147c289792,"16 May, 2021",travel.gc.ca/travel-covid/travel-restrictions/...,Mandatory quarantine or isolation – Travel res...,i m trying to figure out if i can quarantine o...,Quarantine,Just main topic,New,I can't find the information,EN,checked


In [39]:
#cleaned_df = cleaned_df[['Comment', 'Tags']]

In [40]:
train_df, test_df = train_test_split(cleaned_df, test_size = 0.15, random_state=123)

In [41]:
## Adapted from https://github.com/alpha-canada-ca/feedback-classification-retroaction/blob/master/process_feedback.py
#function to clean the word of any punctuation or special characters
def cleanPunc(sentence):
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace("\n"," ")
    return cleaned

#function to convert to lowercase
def keepAlpha(sentence):
    alpha_sent = ""
    for word in sentence.split():
        alpha_word = re.sub('[^a-z A-Z]+', ' ', word)
        alpha_sent += alpha_word
        alpha_sent += " "
    alpha_sent = alpha_sent.strip()
    return alpha_sent


#function to stem feedbck (English)
stemmer_en = SnowballStemmer("english")
def stemming_en(sentence):
    stemSentence = ""
    for word in sentence.split():
        stem = stemmer_en.stem(word)
        stemSentence += stem
        stemSentence += " "
    stemSentence = stemSentence.strip()
    return stemSentence

#apply pre-process functions to English
train_df.Comment = train_df.Comment.str.lower()
train_df.Comment = train_df.Comment.apply(cleanPunc)
train_df.Comment = train_df.Comment.apply(keepAlpha)
train_df.Comment = train_df.Comment.apply(stemming_en)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [42]:
def prepare_for_classification(train,dev,max_n=3):
    '''convert lists of reviews train and dev to spare feature matrices X_train and X_test,
      and lists of polarity classifications train_class and dev_class'''
    vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,max_n), norm='l2')
    train_texts = train.Comment.values
    train_class = train.Tags.values
    dev_texts = dev.Comment.values
    dev_class = dev.Tags.values
    X_train = vectorizer.fit_transform(train_texts)
    X_dev = vectorizer.transform(dev_texts)
    
    return X_train,train_class, X_dev,dev_class

In [72]:
def evaluate(train, test, n = 3):
    """Calculate the the f1 score from given train test data set, and n grams"""
    
    X_train,train_class, X_test,test_class = prepare_for_classification(train,test,max_n=n)
    clf = OneVsRestClassifier(MultinomialNB(alpha=0.3, fit_prior=True, class_prior=None))
    clf.fit(X_train,train_class)
    
    scores = cross_validate(clf, X_train, train_class, return_train_score=True)
    fscore = f1_score(test_class, clf.predict(X_test), average='macro')
    accuracy = accuracy_score(test_class, clf.predict(X_test))
    
    print(f"The shape of training set is {X_train.shape}, the fscore is {round(fscore, 4)}, the accuracy is {round(accuracy, 4)}")
    return pd.DataFrame(scores)

In [73]:
evaluate(train_df, test_df, 3)



The shape of training set is (10591, 168755), the fscore is 0.4306, the accuracy is 0.549


Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.124001,0.026998,0.636621,0.95314
1,0.152999,0.032039,0.626062,0.954326
2,0.148049,0.029988,0.633617,0.951375
3,0.144035,0.030015,0.620869,0.954562
4,0.124983,0.028016,0.640699,0.952083


In [45]:
X_train,train_class, X_test,test_class = prepare_for_classification(train_df,test_df,max_n=3)
clf = OneVsRestClassifier(MultinomialNB(alpha=0.3, fit_prior=True, class_prior=None))
clf.fit(X_train,train_class)

OneVsRestClassifier(estimator=MultinomialNB(alpha=0.3))

In [46]:
sample_comments = test_df[['Comment', 'Tags']][:100]
sample_comments['Preds'] = list(clf.predict(X_test)[:100])

In [47]:
sample_comments.to_csv('../data/travel_error_analysis_NB.csv')

In [None]:
accuracy_score(test_class, clf.predict(X_test))