In [11]:
import re
from collections import Counter

In [12]:
# This function helps eliminate tokens that are punctuation or whitespace.
def punct_space(token):
    '''
    Filter predicate: returns a truthy value for tokens that should be discarded.

    Background:
    Unlike verbs and common nouns, a personal pronoun has no clear base form
    (should the lemma of "me" be "I"? is a person "it", "she" or "he"?).
    spaCy therefore uses the placeholder lemma -PRON- for all personal pronouns.

    Returns:
        - the lower-cased token text (``token.lower_``) when the lemma is '-PRON-';
        - otherwise ``token.is_punct or token.is_space``.

    NOTE(review): the pronoun branch returns a string, not a bool. Any non-empty
    string is truthy, so a caller that filters with ``not punct_space(token)``
    drops pronouns together with punctuation and whitespace. Confirm that this
    is the intended behaviour.
    '''
    # Ordinary tokens: plain punctuation / whitespace test.
    if token.lemma_ != '-PRON-':
        return token.is_punct or token.is_space

    # Personal pronoun: hand back its lower-cased surface form.
    return token.lower_
    

# This function returns TRUE for alphanumeric characters 
def get_alphanumeric(token):
    """
    Tell whether a token is alphabetic or numeric.

    ``token.is_alpha`` is checked first; ``token.is_digit`` is only consulted
    when the alphabetic check is falsy.
    """
    alphabetic = token.is_alpha
    if alphabetic:
        return alphabetic
    return token.is_digit

# A generator function that reads headlines from "headline_text_all.txt" (the text file that contains all headline text)
def get_line_headline(filename, encoding='utf-8'):
    """
    Lazily yield the lines of a text file, one at a time.

    Parameters:
        filename: path of the text file to read (one headline per line).
        encoding: text encoding used to decode the file. Defaults to UTF-8 so
            the result no longer depends on the platform's locale encoding;
            pass another codec name for files stored differently.

    Yields:
        Each line exactly as read, including its trailing newline when present.
    """
    # The file stays open only while the generator is being consumed.
    with open(filename, 'r', encoding=encoding) as f:
        yield from f
            
# This function lemmatizes/normalizes a token (in lowercase)            
def lemm(token):
    """Return the token's lemma (``token.lemma_``) converted to lowercase."""
    base_form = token.lemma_
    return base_form.lower()

In [13]:
# This function outputs a DataFrame that contains the counts of the top_n most frequent words in the corpus
def word_freq(clean_text_list, top_n):
    """
    Tabulate the most frequent tokens across a collection of token lists.

    Parameters:
        clean_text_list: iterable of token sequences (one sequence per document).
        top_n: number of most frequent tokens to keep, passed to
            ``Counter.most_common``.

    Returns:
        DataFrame with columns "words" and "counts", ordered from most to
        least frequent.
    """
    # Count every token of every document in a single pass.
    token_counts = Counter(token for doc in clean_text_list for token in doc)

    # most_common already yields (word, count) pairs in the order we want.
    most_frequent = token_counts.most_common(top_n)
    return pd.DataFrame(most_frequent, columns=["words", "counts"])

In [15]:
# This function computes the word count of a list of tokens
def word_count(text):
    """
    Count the whitespace-separated words in the string form of ``text``.

    ``text`` is converted with ``str()`` first, so non-string inputs are
    counted on their textual representation.
    NOTE(review): for a list of tokens this counts the pieces of the list's
    repr, which only matches the number of tokens when no token contains
    whitespace and the list is non-empty - confirm what callers pass in.

    Returns:
        int: number of words; 0 for an empty or whitespace-only string.
    """
    # split() with no argument collapses runs of whitespace and ignores
    # leading/trailing whitespace, so empty pieces are never counted as words.
    return len(str(text).split())

In [17]:
# Get bigrams and their frequency in each class
def ngrams_freq(clean_text_list, top_n):
    """
    Tabulate the most frequent phrase tokens across a collection of token lists.

    A token is treated as a phrase when it contains an underscore
    (for example "happy_hour").

    Parameters:
        clean_text_list: iterable of token sequences (one sequence per document).
        top_n: number of most frequent phrases to keep, passed to
            ``Counter.most_common``.

    Returns:
        DataFrame with two unnamed columns: column 0 holds the phrase and
        column 1 its count, ordered from most to least frequent.
    """
    # Count only the underscore-joined tokens, scanning all documents once.
    phrase_counts = Counter(
        tok
        for doc in clean_text_list
        for tok in doc
        if '_' in tok
    )
    ranked = phrase_counts.most_common(top_n)

    phrases = [pair[0] for pair in ranked]
    frequencies = [pair[1] for pair in ranked]

    # Two rows (phrases, counts) transposed into two columns.
    return pd.DataFrame([phrases, frequencies]).T

In [19]:
# This function computes a sorted list of the top topics in the LDA representation
def lda_description(headline_text, min_topic_freq=0.05):
    """
    Print the dominant LDA topics of a single headline.

    Steps:
    1) parse the original headline text with spaCy
    2) lemmatize it, keeping only alphanumeric tokens that pass the
       punctuation/whitespace filter
    3) apply the bigram phrase model, then the trigram phrase model on the
       bigram output
    4) remove stopwords
    5) create a bag-of-words representation of the document
    6) create an LDA representation of the document
    7) print the topics, strongest first, stopping at the first topic whose
       weight is below ``min_topic_freq``

    Relies on notebook-level objects: ``nlp``, ``bigram_phrases_model``,
    ``trigram_phrases_model``, ``dictionary_trigrams`` and ``lda``, plus the
    helpers ``lemm``, ``punct_space`` and ``get_alphanumeric``.

    Parameters:
        headline_text: raw headline string.
        min_topic_freq: minimum topic weight for a topic to be printed.

    Returns:
        None. The result is printed.
    """
    topic_names = {0: "Business", 1: "World", 2: "SciTech", 3:"Sports"}

    # parse the headline text with spaCy
    parsed_headline = nlp(headline_text)

    # lemmatize the text and remove punctuation and whitespace
    headline_unigrams = [
        lemm(token)
        for token in parsed_headline
        if (not punct_space(token) and get_alphanumeric(token))
        ]

    # Apply the first-order and second-order phrase models in sequence.
    # The trigram model must see the bigram output; previously it was applied
    # to the raw unigrams and the bigram result was silently discarded.
    # Assumes trigram_phrases_model was trained on bigram-transformed text -
    # verify against the cell that builds the phrase models.
    headline_bigrams = bigram_phrases_model[headline_unigrams]
    headline_trigrams = trigram_phrases_model[headline_bigrams]

    # remove any remaining stopwords
    stop_words = nlp.Defaults.stop_words
    headline_trigrams = [t for t in headline_trigrams if t not in stop_words]

    # create a bag-of-words representation
    headline_bow = dictionary_trigrams.doc2bow(headline_trigrams)

    # create an LDA representation
    headline_lda = lda[headline_bow]

    # sort with the most highly related topics first
    headline_lda = sorted(headline_lda, key=lambda topic_number_freq: -topic_number_freq[-1])

    for topic_number, freq in headline_lda:
        # The list is sorted, so every remaining topic is below the threshold too.
        if freq < min_topic_freq:
            break

        # print the most highly related topic names and frequencies
        print(f'{topic_names[topic_number]:25} {round(freq, 3):.3f}')

In [22]:
# Function to get the scores for each model in a df
def model_score(model_dict):
    """
    Fit each model and collect its test-set metrics in one DataFrame.

    Every estimator in ``model_dict`` is fitted on the notebook-level
    ``X_train`` / ``y_train`` and evaluated on ``X_test`` / ``y_test``.
    The estimators are fitted in place, so the objects in ``model_dict`` are
    trained after this call.

    Parameters:
        model_dict: mapping of model name -> estimator exposing ``fit`` and
            ``predict``.

    Returns:
        DataFrame with columns 'model_name', 'accuracy_score',
        'precision_score', 'recall_score' and 'f1_score' (precision, recall
        and F1 are macro-averaged over classes), sorted by 'f1_score' in
        descending order. An empty ``model_dict`` yields an empty DataFrame
        with the same columns.
    """
    score_columns = ['model_name', 'accuracy_score', 'precision_score',
                     'recall_score', 'f1_score']
    rows = []

    for name, model in model_dict.items():
        # Train model and make predictions
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Model performance; 'macro' computes the metric per class and
        # takes the unweighted average.
        rows.append((
            name,
            accuracy_score(y_test, y_pred),
            precision_score(y_test, y_pred, average='macro'),
            recall_score(y_test, y_pred, average='macro'),
            f1_score(y_test, y_pred, average='macro'),
        ))

    # Build and sort the table once, after all models are scored. Doing it
    # after the loop also keeps the result defined when model_dict is empty.
    model_comparison_df = pd.DataFrame(rows, columns=score_columns)
    model_comparison_df = model_comparison_df.sort_values(by='f1_score', ascending=False)

    return model_comparison_df

In [24]:
# Function to perform GridSearchCV to find the model with the "best" parameters. 
# The best model has the highest CV accuracy.

def fit_assess(X_train, y_train):
    """
    Run a grid search for every estimator in the notebook-level ``models`` dict.

    Each estimator is paired with the parameter grid stored under the same key
    in the notebook-level ``params`` dict and tuned with ``GridSearchCV``
    (cv=5, verbose=2, n_jobs=-1). No ``scoring`` argument is passed, so
    candidates are ranked by GridSearchCV's default for the estimator.

    Parameters:
        X_train: training features handed to ``GridSearchCV.fit``.
        y_train: training targets handed to ``GridSearchCV.fit``.

    Returns:
        list of fitted ``GridSearchCV`` objects, one per entry of ``models``,
        in the dict's iteration order. The best parameters of each search are
        printed as the search finishes.
    """
    fitted_searches = []

    for name, est in models.items():
        # Look up the grid that belongs to this estimator.
        grid = params[name]

        gscv = GridSearchCV(
            estimator=est,
            param_grid=grid,
            cv=5,
            verbose=2,
            n_jobs=-1,
        )
        gscv.fit(X_train, y_train)
        print("The best parameters are: {}".format(gscv.best_params_))

        # Keep the whole fitted search object, not just its best estimator.
        fitted_searches.append(gscv)

    return fitted_searches