### Importing data

In [1]:
from google.colab import drive
# drive.mount._DEBUG = True
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

In [3]:
train = pd.read_csv('/content/drive/My Drive/UTD Assignments/NLP/Assignment 1/train.csv',encoding = 'latin1')
test = pd.read_csv('/content/drive/My Drive/UTD Assignments/NLP/Assignment 1/test.csv',encoding = 'latin1')

### Tokenization

In [4]:
import re
import nltk
from nltk.tokenize import (BlanklineTokenizer, RegexpTokenizer, 
                           WordPunctTokenizer, TreebankWordTokenizer, 
                           TweetTokenizer)
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
def tokenize(text, type_tokenize=0, strip_handles=False, reduce_len = False, preserve_case = False):

  if type_tokenize == 0:
    # tokenize on \n
    tokenizer = BlanklineTokenizer()
    
  if type_tokenize == 1:
    # tokenize on \n, but better than 1
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    
  if type_tokenize == 2:  
    # remooving punctuations??
    tokenizer = RegexpTokenizer('\w+\.?\w?|\$[\d\.]+|\S+')
  
  if type_tokenize == 3:
    # Tokenize a text into a sequence of alphabetic and non-alphabetic characters, using the regexp \w+|[^\w\s]+.
    tokenizer = WordPunctTokenizer()

  if type_tokenize == 4:
    # The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank. This is the method 
    # that is invoked by word_tokenize(). It assumes that the text has already been segmented into sentences, 
    # e.g. using sent_tokenize().
    tokenizer = TreebankWordTokenizer()

  if type_tokenize == 5:
    # Return a tokenized copy of *text*, using NLTK's recommended word tokenizer (currently an improved :class:.
    # TreebankWordTokenizer along with :class:.PunktSentenceTokenizer for the specified language).
    return nltk.word_tokenize(text, language="english")

  if type_tokenize == 6:
    # specially for tweets, keeps emoticons, hashtags, urls, @mentions intact
    tokenizer = TweetTokenizer(strip_handles=strip_handles, reduce_len=reduce_len, preserve_case = preserve_case)

  if type_tokenize == 7:
    # exactly same as 6
    return nltk.casual_tokenize(text, strip_handles=strip_handles, reduce_len=reduce_len)

  if type_tokenize == 8:
    
    regexes=(
        # Keep usernames together (any token starting with @, followed by A-Z, a-z, 0-9)
        r"(?:@[\w_]+)",

        # Keep hashtags together (any token starting with #, followed by A-Z, a-z, 0-9, _, or -)
        r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",

        # Keep words with apostrophes, hyphens and underscores together
        r"(?:[a-z][a-z’'\-_]+[a-z])",

        # Keep all other sequences of A-Z, a-z, 0-9, _ together
        r"(?:[\w_]+)",

        # Everything else that's not whitespace
        # r"(?:\S)"
    )

    big_regex="|".join(regexes)

    my_extensible_tokenizer = re.compile(big_regex, re.VERBOSE | re.I | re.UNICODE)

    def my_extensible_tokenize(text):
        for f in re.findall("([A-Z]+)", text):
          text = text.replace(f, f.lower())
        return my_extensible_tokenizer.findall(text)

    return my_extensible_tokenize(text)

  return tokenizer.tokenize(text)

### Stemming

In [6]:
from nltk.stem.porter import PorterStemmer 
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

In [7]:
def stemmer(text, type_stemmer, blank):
  if type_stemmer == 'porter':
    stemmer = PorterStemmer()
    porter_token = [stemmer.stem(token) for token in text]
    return porter_token
  
  if type_stemmer == 'snowball':
    stemmer = SnowballStemmer(language='english')
    snowball_token = [stemmer.stem(token) for token in text]
    return snowball_token

  if type_stemmer == 'lancaster':
    stemmer = LancasterStemmer()
    lancaster_token = [stemmer.stem(token) for token in text]
    return lancaster_token

### Lemmatization

In [8]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer 
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [9]:
def lemma(text, type_lemma, pos):
  if type_lemma == 'wordnet':
    lemmatizer = WordNetLemmatizer()
    lemmatized_output = [lemmatizer.lemmatize(token, pos = pos) for token in text]
    return lemmatized_output

### N-grams

In [10]:
from nltk.util import ngrams

In [11]:
def n_grams(text, n_ngrams, blank):
  ngrams_text = list(ngrams(text, n_ngrams))
  return ngrams_text

### Stop words

In [12]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [13]:
def remove_stopwords(text):
  non_stop = [token for token in text if token not in stop]
  return non_stop
stop = set(stopwords.words('english'))

### Extract atmentions and hashtags


In [14]:
def extract_special(text, extract_type):
    
    if isinstance(train['text1'][0], str):
      return [token.replace(extract_type, "").lower() for token in set(text.split()) if extract_type in token]
    elif isinstance(train['text1'][0], list):
      return [token.replace(extract_type, "").lower() for token in set(text) if extract_type in token]

### Data Preprocessing

Repeat for test

In [15]:
train['text1'] = train['text'].apply(tokenize, args = (8,True, True))
train['atmentions'] = train['text1'].apply(extract_special, args = ('@'))
train['hashtags'] = train['text1'].apply(extract_special, args = ('#'))
train['atmentions_count'] = train['atmentions'].apply(len)
train['hashtags_count'] = train['hashtags'].apply(len)

In [16]:
test['text1'] = test['text'].apply(tokenize, args = (8,True, True))
test['atmentions'] = test['text1'].apply(extract_special, args = ('@'))
test['hashtags'] = test['text1'].apply(extract_special, args = ('#'))
test['atmentions_count'] = test['atmentions'].apply(len)
test['hashtags_count'] = test['hashtags'].apply(len)

In [17]:
# min(train.atmentions.apply(len)), min(train.hashtags.apply(len))
# train[train['atmentions'].apply(len)>1][['atmentions']]
# unique_atmentions = set([inner for outer in train['atmentions'] for inner in outer])
unique_atmentions = {}
for outer in train['atmentions']:
  for inner in outer:
    if inner.lower() in unique_atmentions.keys():
      unique_atmentions[inner.lower()] +=1
    else:
      unique_atmentions[inner.lower()] = 1

unique_hashtags = {}
for outer in train['hashtags']:
  for inner in outer:
    if inner.lower() in unique_hashtags.keys():
      unique_hashtags[inner.lower()] +=1
    else:
      unique_hashtags[inner.lower()] = 1

In [18]:
# " ".join(sorted(unique_atmentions, key=unique_atmentions.get, reverse = True)[:30])
airlines = ['united', 'americanair', 'usairways', 'southwestair', 'jetblue', 'virginamerica', 'delta', 'aircanada', 'staralliance', 'virginatlantic', 'spiritairlines', 'silverairways','lufthansa']

In [19]:
# " ".join(sorted(unique_hashtags, key=unique_hashtags.get, reverse = True)[:150])
sad_words = ['fail', 'customerservice', 'usairwaysfail', 'badservice', 'bad', 'help', 'disappointed', 'neveragain', 'badcustomerservice', 'ripoff', 'nothappy', 'poorservice', 'delayed', 'wtf', 'ridiculous',
             'notcool', 'unitedsucks', 'unitedfail','pathetic', 'worstairlineever', 'worst','frustrat','shameful','rude','sarcasm','epicfail','unacceptable','worstcustomerservice','poorcustomerservice',
             'terribleservice','incompetent','customerservicefail','usairwayssucks','stranded', 'furious','scam','notgoodenough','terrible','annoyed','delay','bademployeeproblem','problem','bad']

In [20]:
train['career'] = train['atmentions'].apply(lambda atmention: "".join([x for x in atmention if x in airlines]) if len([x for x in atmention if x in airlines]) <=1 else "")
test['career'] = test['atmentions'].apply(lambda atmention: "".join([x for x in atmention if x in airlines]) if len([x for x in atmention if x in airlines]) <=1 else "")

In [21]:
test

Unnamed: 0,id,text,text1,atmentions,hashtags,atmentions_count,hashtags_count,career
0,7322,@AmericanAir In car gng to DFW. Pulled over 1h...,"[@americanair, in, car, gng, to, dfw, pulled, ...",[americanair],[],1,0,americanair
1,7323,"@AmericanAir after all, the plane didnÂÃÂªt ...","[@americanair, after, all, the, plane, didn, Â...",[americanair],[],1,0,americanair
2,7324,@SouthwestAir can't believe how many paying cu...,"[@southwestair, can't, believe, how, many, pay...",[southwestair],[],1,0,southwestair
3,7325,@USAirways I can legitimately say that I would...,"[@usairways, i, can, legitimately, say, that, ...",[usairways],[],1,0,usairways
4,7326,@AmericanAir still no response from AA. great ...,"[@americanair, still, no, response, from, aa, ...",[americanair],[],1,0,americanair
...,...,...,...,...,...,...,...,...
7315,14637,@JetBlue Traveling with two kids tomorrow (age...,"[@jetblue, traveling, with, two, kids, tomorro...",[jetblue],[],1,0,jetblue
7316,14638,@JetBlue Tx for the info. Just don't understan...,"[@jetblue, tx, for, the, info, just, don't, un...",[jetblue],[],1,0,jetblue
7317,14639,@AmericanAir I understand. But why is this the...,"[@americanair, i, understand, but, why, is, th...",[americanair],[],1,0,americanair
7318,14640,@USAirways really!??,"[@usairways, really]",[usairways],[],1,0,usairways


In [22]:
# train['sad_tweet'] = train['text1'].apply(lambda text: len([x for x in text if y in x for y in sad_words]))
# train[train['sad_tweet']>1]
train['sad_tweet']=0
row=0
for i in train['text1']:
  for j in sad_words:
    for k in i:
      if j in k or k in j:
        train.iloc[row,9] +=1
      else:
        pass
  row +=1


test['sad_tweet']=0
row=0
for i in test['text1']:
  for j in sad_words:
    for k in i:
      if j in k or k in j:
        test.iloc[row,8] +=1
      else:
        pass
  row +=1

In [23]:
# Tokenizing the data
train['text2'] = train['text'].apply(tokenize, args = (8,True, True)).str.join(" ")
train['text2'] = train['text2'].apply(tokenize, args = (6,True, True))
train['text2'] = train['text2'].apply(remove_stopwords)

test['text2'] = test['text'].apply(tokenize, args = (8,True, True)).str.join(" ")
test['text2'] = test['text2'].apply(tokenize, args = (6,True, True))
test['text2'] = test['text2'].apply(remove_stopwords)

In [24]:

# train['text_token_1'] = train['text_token_0'].apply(stemmer, args = ('lancaster',''))#, args=(0,True,True))
# train[['text_token_1','text_token_0']]

In [25]:
train['text2'] = train['text2'].apply(lemma, args = ('wordnet','v'))
train['text2'] = train['text2'].apply(stemmer, args = ('porter',''))

test['text2'] = test['text2'].apply(lemma, args = ('wordnet','v'))
test['text2'] = test['text2'].apply(stemmer, args = ('porter',''))

In [26]:
train['text3'] = train['text2'].apply(n_grams, args = (2,''))

test['text3'] = test['text2'].apply(n_grams, args = (2,''))

  


In [27]:
train['text2'] = train['text2'].str.join(" ")
test['text2'] = test['text2'].str.join(" ")

### Count Vectorizer

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

In [29]:
count_vectorizer = CountVectorizer(analyzer = 'word', stop_words = None, ngram_range = (1,2), max_df = 0.95, min_df = 2, max_features = 9999)
x_text_train = count_vectorizer.fit_transform(train['text2']).todense()
x_text_test = count_vectorizer.transform(test['text2']).todense()

In [30]:
dictionary = count_vectorizer.vocabulary_.items()  
words = []
count = []
for key, value in dictionary:
    words.append(key)
    count.append(value)
vocab_bef_stem = pd.DataFrame({"words":words, 'count_before_actions':count})

In [31]:
x_train = pd.concat([pd.DataFrame(x_text_train), train[['atmentions_count', 'hashtags_count', 'career', 'sad_tweet']]], axis = 1)
x_test = pd.concat([pd.DataFrame(x_text_test), test[['atmentions_count', 'hashtags_count', 'career', 'sad_tweet']]], axis = 1)

In [32]:
x_train = pd.concat([x_train, pd.get_dummies(x_train['career'], prefix='career_')], axis = 1)
x_train.drop(columns = 'career', inplace = True)

x_test = pd.concat([x_test, pd.get_dummies(x_test['career'], prefix='career_')], axis = 1)
x_test.drop(columns = 'career', inplace = True)

### Model

### Grid Search

In [34]:
from sklearn.naive_bayes import MultinomialNB
from xgboost.sklearn import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [None]:
parameter_grid_nb = {"alpha":[0.1,0.5,1,2,10]}
classifier = MultinomialNB()
classifier_name = classifier.__class__.__name__
gridsearch = GridSearchCV(classifier, parameter_grid_nb, scoring = 'f1_macro', cv = 9, verbose = 1, n_jobs = -1)
grid_result = gridsearch.fit(x_train, train['Target'])
print(grid_result.best_score_)

Fitting 9 folds for each of 5 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   38.7s finished


0.6743172338681247


In [None]:
parameter_grid_xgb = {'learning_rate': [0.05, 0.02, 0.03], 
                   'max_depth': [3,5,7], 'min_child_weight': [1,5,10], 'subsample': [0.6, 0.8, 1], 
                   'colsample_bytree': [0.5, 0.7, 0.9], 'n_estimators': [600],  'missing':[-999], 'seed': [42]}

classifier = XGBClassifier()
classifier_name = classifier.__class__.__name__
gridsearch_xgb = GridSearchCV(classifier, parameter_grid_xgb, scoring = 'f1_macro', cv = 4, verbose = 1, n_jobs = -1)
grid_result_xgb = gridsearch_xgb.fit(x_train, train['Target'])
print(grid_result_xgb.best_score_)

Fitting 4 folds for each of 243 candidates, totalling 972 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


In [None]:
parameter_grid_log = {"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}
classifier = LogisticRegression(random_state = 42)
classifier_name = classifier.__class__.__name__
gridsearch_log = GridSearchCV(classifier, parameter_grid_log, scoring = 'f1_macro', cv = 9, verbose = 1, n_jobs = -1)
grid_result_log = gridsearch_log.fit(x_train, train['Target'])
print(grid_result_log.best_score_)

Fitting 9 folds for each of 14 candidates, totalling 126 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
