In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


#Importing training data
df = pd.read_csv('./train.csv')
#Drop rows with missing data. dropna() returns a new frame instead of modifying
#the original, so the result has to be assigned back for the rows to be removed.
df = df.dropna(axis=0)
#Use the 'id' column as the row index.
df = df.set_index('id')
df.head()

In [None]:
#Packages required for pos_tag to work.
# import nltk
# nltk.download('averaged_perceptron_tagger')
# nltk.download('universal_tagset')

In [None]:
#ADD 3 NEW FEATURES: NUMBER OF ADJECTIVES, NUMBER OF NOUNS, NUMBER OF VERBS
import re #Regexp
from nltk.corpus import stopwords
from nltk import pos_tag, word_tokenize

#English stop words held in a set; processing() tests every token for membership in it.
stopWords = set(stopwords.words('english'))

#Helper for the preprocessing function: counts how many adjectives, nouns and verbs
#occur in a piece of text.
def count_tags(text):
    """Return ``(adjectives, nouns, verbs)`` counts for ``text``.

    The text is tokenized with ``word_tokenize`` and tagged with ``pos_tag``
    using the universal tagset; a token is counted when its tag starts with
    'ADJ', 'NOUN' or 'VERB' respectively.
    """
    tagged = pos_tag(word_tokenize(text), tagset='universal')
    totals = {'ADJ': 0, 'NOUN': 0, 'VERB': 0}
    for _, tag in tagged:
        for prefix in totals:
            if tag.startswith(prefix):
                totals[prefix] += 1

    return totals['ADJ'], totals['NOUN'], totals['VERB']

#Preprocessing function, kept as a function so the same steps can be replayed on the submission data.
def processing(df):
    """Add a cleaned text column and engineered numeric features to ``df``.

    The frame is modified in place and also returned. It must contain a
    'text' column of strings. Columns added:

    - processed: lower-cased text with punctuation removed
    - length: number of characters in ``processed``
    - words: number of whitespace-separated tokens in ``processed``
    - words_not_stopword: number of tokens not in the module-level ``stopWords``
    - avg_word_length: mean length of the non-stop-word tokens (0 if none)
    - commas: number of ',' characters in the original text
    - num_adjs, num_nouns, num_verbs: counts returned by ``count_tags``
    """
    #lowering and removing punctuation (anything that is neither a word character nor whitespace)
    df['processed'] = df['text'].apply(lambda x: re.sub(r'[^\w\s]', '', x.lower()))

    #Tokenize once and reuse. split() with no argument splits on any run of
    #whitespace and never yields empty strings; split(' ') produced an empty
    #token for every doubled space left behind by punctuation removal, which
    #inflated the word counts and dragged the average word length down.
    tokens = df['processed'].apply(lambda x: x.split())
    content_tokens = tokens.apply(lambda ts: [t for t in ts if t not in stopWords])

    #numerical feature engineering
    #total length of sentence
    df['length'] = df['processed'].apply(len)
    #get number of words including stop words
    df['words'] = tokens.apply(len)
    #get number of words that aren't stopwords
    df['words_not_stopword'] = content_tokens.apply(len)
    #get the average word length (0 when every token is a stop word or the text is empty)
    df['avg_word_length'] = content_tokens.apply(
        lambda ts: np.mean([len(t) for t in ts]) if ts else 0)
    #get the number of commas
    df['commas'] = df['text'].apply(lambda x: x.count(','))
    #get the number of adjectives, nouns, and verbs. Built column by column so
    #that an empty frame yields empty columns instead of failing to unpack.
    tag_counts = [count_tags(processed_text) for processed_text in df['processed']]
    for position, column in enumerate(('num_adjs', 'num_nouns', 'num_verbs')):
        df[column] = [counts[position] for counts in tag_counts]

    return df

#Run the preprocessing on the training frame; processing() adds the feature columns and returns the frame.
df = processing(df)

df.head()

In [None]:
from sklearn.model_selection import train_test_split

def _columns_except(excluded):
    """Return df's column names that are not listed in ``excluded``, in column order."""
    return [name for name in df.columns.values if name not in excluded]

#Columns that are never model inputs, and the three part-of-speech count columns.
non_feature_columns = ['id', 'text', 'author']
pos_columns = ['num_adjs', 'num_nouns', 'num_verbs']

#Features without the part-of-speech counts
not_pos_features = _columns_except(non_feature_columns + pos_columns)
#Every feature column the models may use
features = _columns_except(non_feature_columns)
#A sublist of features that excludes the processed text column
numeric_features = _columns_except(non_feature_columns + ['processed'])
#The class labels
target = 'author'

#Hold out 33% of the rows as a test set for measuring classifier performance;
#the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=0.33,
    random_state=42,
)
X_train.head()

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class TextSelector(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that picks a single column out of the input by key.

    ``transform`` indexes with the bare key (``X[key]``), so a DataFrame input
    yields a one-dimensional Series. Intended for text columns.
    """
    def __init__(self, key):
        # Name of the column to select.
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[self.key]``."""
        selected = X[self.key]
        return selected
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that picks a single column out of the input by key.

    ``transform`` indexes with a one-element list (``X[[key]]``), so a DataFrame
    input stays two-dimensional. Intended for numeric columns.
    """
    def __init__(self, key):
        # Name of the column to select.
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[[self.key]]``."""
        wanted = [self.key]
        return X[wanted]

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

#TF-IDF stands for Term Frequency - Inverse Document Frequency. A term's weight grows with how often
#it occurs in a document (TF) and is scaled down by how many documents contain it (IDF), so words
#that are common everywhere contribute little to the classification of the text.
text_steps = [
    ('selector', TextSelector(key='processed')),
    ('tfidf', TfidfVectorizer(stop_words='english')),
]
text = Pipeline(text_steps)

text.fit_transform(X_train)

In [None]:
from sklearn.preprocessing import StandardScaler

#Every numeric feature gets the same two-step pipeline: NumberSelector picks the column (kept
#two-dimensional) and StandardScaler standardizes it. Each column is processed on its own
#before the results are concatenated in the FeatureUnion.
def _scaled_feature(column):
    """Return a Pipeline that selects ``column`` and standardizes it."""
    return Pipeline([
        ('selector', NumberSelector(key=column)),
        ('standard', StandardScaler()),
    ])

length = _scaled_feature('length')

length.fit_transform(X_train)

words = _scaled_feature('words')
words_not_stopword = _scaled_feature('words_not_stopword')
avg_word_length = _scaled_feature('avg_word_length')
commas = _scaled_feature('commas')
num_adjs = _scaled_feature('num_adjs')
num_nouns = _scaled_feature('num_nouns')
num_verbs = _scaled_feature('num_verbs')

from sklearn.pipeline import FeatureUnion

#Branches shared by both unions, in the order their outputs are concatenated.
base_branches = [
    ('text', text),
    ('length', length),
    ('words', words),
    ('words_not_stopword', words_not_stopword),
    ('avg_word_length', avg_word_length),
    ('commas', commas),
]
#Additional branches: number of adjectives, number of nouns, number of verbs.
pos_branches = [
    ('num_adjs', num_adjs),
    ('num_nouns', num_nouns),
    ('num_verbs', num_verbs),
]

#Joins all of the pipelines, merging the feature sets from the different transformers.
#Each union receives its own list object; the pipeline instances themselves are shared.
feats = FeatureUnion(list(base_branches))
#Same as above plus the part-of-speech count features.
pos_feats = FeatureUnion(base_branches + pos_branches)

#showing that the entire union can also be handled as one giant pipeline.
feature_processing = Pipeline([('feats', feats)])
feature_processing.fit_transform(X_train)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
#Chain the feature union (without part-of-speech counts) with a logistic regression classifier.
baseline_steps = [
    ('features', feats),
    ('logistic', LogisticRegression(random_state=42, max_iter=1000)),
]
pipeline = Pipeline(baseline_steps)
#Train on the training split, restricted to the non-POS feature columns.
X_train_baseline = X_train[not_pos_features]
pipeline.fit(X_train_baseline, y_train)
#Predict the held-out split, which the model has not seen during training.
X_test_baseline = X_test[not_pos_features]
preds = pipeline.predict(X_test_baseline)
#Accuracy: fraction of held-out rows whose predicted label equals the true label.
np.mean(preds == y_test)


In [None]:
#Per-class performance report for the baseline classifier:
#precision, recall, F1-score and support for each class.
#Precision: share of class-x predictions that really are class x
#Recall: share of true class-x items that were predicted as class x
#F1-score: harmonic mean of precision and recall, summarising the trade-off between them
print(classification_report(y_test, preds))

In [None]:
#Chain the feature union that includes the part-of-speech counts with a logistic regression classifier.
pos_steps = [
    ('features', pos_feats),
    ('logistic', LogisticRegression(random_state=42, max_iter=1000)),
]
pos_pipeline = Pipeline(pos_steps)
#Train on the full set of feature columns of the training split.
pos_pipeline.fit(X_train, y_train)
#Predict the held-out split, which the model has not seen during training.
pos_preds = pos_pipeline.predict(X_test)
#Accuracy on the held-out split. Author's note from the original run: the accuracy was lower than
#without the new features, which may mean the feature weights need adjusting or that the new
#features are not correlated with the class labels.
np.mean(pos_preds == y_test)

In [None]:
#Per-class performance report for the classifier with the added part-of-speech features:
#precision, recall, F1-score and support for each class.
#Precision: share of class-x predictions that really are class x
#Recall: share of true class-x items that were predicted as class x
#F1-score: harmonic mean of precision and recall, summarising the trade-off between them
print(classification_report(y_test, pos_preds))

In [None]:
#You may skip the CV & fine tuning part, if you like.
#Names of the parameters that can be tuned on the pipeline (e.g. 'logistic__C').
pipeline.get_params().keys()

from sklearn.model_selection import GridSearchCV
#Hyperparameters that will be tested to see which help the model perform better on unseen data
hyperparameters = {
    'logistic__C': [0.001, 0.01, 0.1, 1, 10],  # Specify other hyperparameters here
    'logistic__class_weight': [None, 'balanced'],
}
#GridSearchCV tries every combination above with 5-fold cross-validation.
#refit=True (the default, spelled out here) retrains the best combination on all of the data
#passed to fit(), so no separate refit step is needed. The former bare `clf.refit` line only
#read that attribute and did nothing.
clf = GridSearchCV(pipeline, hyperparameters, cv=5, refit=True)

# Fit and tune model
clf.fit(X_train[not_pos_features], y_train)

#Predict the held-out split with the refitted best estimator.
preds = clf.predict(X_test[not_pos_features])
probs = clf.predict_proba(X_test[not_pos_features])
#Accuracy on the held-out split. Author's note from the original run: the accuracy increased,
#suggesting the tuned hyperparameters were useful.
np.mean(preds == y_test)

In [None]:
#Per-class precision, recall, F1-score and support for the predictions currently stored in `preds`.
print(classification_report(y_test, preds))

In [None]:
#GridSearchCV tries every combination in `hyperparameters` with 5-fold cross-validation on the
#pipeline that includes the part-of-speech features.
#refit=True (the default, spelled out here) retrains the best combination on all of the data
#passed to fit(); the former bare `pos_clf.refit` line only read that attribute and did nothing.
pos_clf = GridSearchCV(pos_pipeline, hyperparameters, cv=5, refit=True)
# Fit and tune model
pos_clf.fit(X_train, y_train)

#Predict the held-out split with the refitted best estimator.
pos_preds = pos_clf.predict(X_test)
pos_probs = pos_clf.predict_proba(X_test)
#Accuracy on the held-out split. Author's note from the original run: the accuracy increased after
#tuning but stayed lower than without the part-of-speech feature additions.
np.mean(pos_preds == y_test)

In [None]:
#Per-class precision, recall, F1-score and support for the predictions currently stored in `pos_preds`.
print(classification_report(y_test, pos_preds))

In [None]:
#Reads in a test file for additional model testing
submission = pd.read_csv('./test.csv')

#Preprocessing step: extract the same features from the text as for the training data.
submission = processing(submission)
#Class probabilities for the unseen rows from the tuned model without part-of-speech features.
submission_inputs = submission[not_pos_features]
predictions = clf.predict_proba(submission_inputs)
#Label the probability columns with the classifier's class names.
class_labels = clf.best_estimator_.named_steps['logistic'].classes_
preds = pd.DataFrame(data=predictions, columns=class_labels)

#generating a submission file: the ids side by side with the probabilities, indexed by id
ids_and_probabilities = pd.concat([submission[['id']], preds], axis=1)
result = ids_and_probabilities.set_index('id')
result.head()

In [None]:
#pos tagged version: class probabilities for the unseen rows from the tuned model that includes
#the part-of-speech features. Author's note from the original run: overall the added features
#decreased the model's performance.
pos_predictions = pos_clf.predict_proba(submission)
#Take the column labels from the model that produced the probabilities (pos_clf, not clf), so the
#labels are guaranteed to match the column order of pos_predictions.
pos_class_labels = pos_clf.best_estimator_.named_steps['logistic'].classes_
pos_preds = pd.DataFrame(data=pos_predictions, columns=pos_class_labels)

#generating a submission file
pos_result = pd.concat([submission[['id']], pos_preds], axis=1)
pos_result.set_index('id', inplace = True)
pos_result.head()

### Share three observations about how model performance changed after the addition of the three new features and why that might be.

(E.g. Did precision decrease or increase, and why?)  

The accuracy is lower: I think this is a result of the new features skewing the model's predictions. The classes are also likely to be unbalanced.

The precision for MWS decreased: since there was little to no correlation between the added features and the class labels, they increased the chance of false positives, which appear in the denominator of precision. The model was less strict.

Recall for HPL increased: the model is more lenient in classifying text as HPL. The new features may be highly relevant to that class.

The F1 score for HPL increased by 0.01 and decreased for MWS by 0.01: this follows from the changes in recall and precision for those classes.

# Homework 10

In [None]:
# 2 - Fold
#GridSearchCV tries every combination in `hyperparameters` with 2-fold cross-validation.
#refit=True (the default, spelled out here) retrains the best combination on all of the data
#passed to fit(); the former bare `cv2.refit` line only read that attribute and did nothing.
cv2 = GridSearchCV(pos_pipeline, hyperparameters, cv=2, refit=True)
# Fit and tune model
cv2.fit(X_train, y_train)

#Predict the held-out split with the refitted best estimator.
cv2_preds = cv2.predict(X_test)
cv2_probs = cv2.predict_proba(X_test)
#Accuracy on the held-out split. Author's note from the original run: the accuracy increased after
#tuning but stayed lower than without the part-of-speech feature additions.
np.mean(cv2_preds == y_test)

In [None]:
#10 - Fold
#GridSearchCV tries every combination in `hyperparameters` with 10-fold cross-validation.
#refit=True (the default, spelled out here) retrains the best combination on all of the data
#passed to fit(); the former bare `cv10.refit` line only read that attribute and did nothing.
cv10 = GridSearchCV(pos_pipeline, hyperparameters, cv=10, refit=True)
# Fit and tune model
cv10.fit(X_train, y_train)

#Predict the held-out split with the refitted best estimator.
cv10_preds = cv10.predict(X_test)
cv10_probs = cv10.predict_proba(X_test)
#Accuracy on the held-out split. Author's note from the original run: the accuracy increased after
#tuning but stayed lower than without the part-of-speech feature additions.
np.mean(cv10_preds == y_test)

In [None]:
#20 - Fold
#GridSearchCV tries every combination in `hyperparameters` with 20-fold cross-validation.
#refit=True (the default, spelled out here) retrains the best combination on all of the data
#passed to fit(); the former bare `cv20.refit` line only read that attribute and did nothing.
cv20 = GridSearchCV(pos_pipeline, hyperparameters, cv=20, refit=True)
# Fit and tune model
cv20.fit(X_train, y_train)

#Predict the held-out split with the refitted best estimator.
cv20_preds = cv20.predict(X_test)
cv20_probs = cv20.predict_proba(X_test)
#Accuracy on the held-out split. Author's note from the original run: the accuracy increased after
#tuning but stayed lower than without the part-of-speech feature additions.
np.mean(cv20_preds == y_test)

In [None]:
#Print 10 most important and least important features for each class along with weights for prev added features
cv2_best_model = cv2.best_estimator_
cv2_union = cv2_best_model.named_steps['features']
cv2_logistic = cv2_best_model.named_steps['logistic']
cv2_coef = cv2_logistic.coef_ #Feature weights, one row per class and one column per feature

#Vocabulary of the TF-IDF step inside the fitted best model itself, so the words are guaranteed
#to belong to the model whose coefficients are being labelled.
replacement_words = dict(cv2_union.transformer_list)['text'].named_steps['tfidf'].get_feature_names_out()

#Name every coefficient column. FeatureUnion concatenates its transformers' outputs in
#transformer_list order: the 'text' branch contributes one column per vocabulary word and every
#other branch contributes exactly one column, named after the branch.
feature_names = []
is_engineered = []
for branch_name, _ in cv2_union.transformer_list:
    if branch_name == 'text':
        feature_names.extend(replacement_words)
        is_engineered.extend([False] * len(replacement_words))
    else:
        feature_names.append(branch_name)
        is_engineered.append(True)
assert len(feature_names) == cv2_coef.shape[1], "feature names do not line up with the coefficients"

#One row per feature, one weight column per class. The column labels come from the classifier
#(expected to be EAP, HPL, MWS) so they follow the row order of coef_.
coefficients = pd.DataFrame(np.transpose(cv2_coef), columns=list(cv2_logistic.classes_))
coefficients.insert(0, 'feature', feature_names)

#Put the hand-engineered features first and the vocabulary words after them, so the top of the
#table shows the engineered features (including the part-of-speech counts).
engineered_mask = np.array(is_engineered)
coefficients = pd.concat([coefficients[engineered_mask], coefficients[~engineered_mask]], ignore_index=True)

#The first table lists the largest weights for the class; the second lists the smallest (most negative) weights.
print("Top 10 Most important and Least important features for EAP")
print(coefficients.sort_values(by=['EAP'], ascending=False).head(10),"\n\n",coefficients.sort_values(by=['EAP'], ascending=True).head(10))

In [None]:
#Largest and smallest (most negative) HPL weights, ten rows each.
hpl_largest = coefficients.sort_values(by=['HPL'], ascending=False).head(10)
hpl_smallest = coefficients.sort_values(by=['HPL'], ascending=True).head(10)
print("Top 10 Most important and Least important features for HPL")
print(hpl_largest, "\n\n", hpl_smallest)

In [None]:
#Largest and smallest (most negative) MWS weights, ten rows each.
mws_largest = coefficients.sort_values(by=['MWS'], ascending=False).head(10)
mws_smallest = coefficients.sort_values(by=['MWS'], ascending=True).head(10)
print("Top 10 Most important and Least important features for MWS")
print(mws_largest, "\n\n", mws_smallest)

In [None]:
#Weights learned for Num adjs, num verbs, and num nouns.
#Selected by feature name rather than by row position, so only these three rows are shown
#regardless of where they sit in the table.
coefficients[coefficients['feature'].isin(['num_adjs', 'num_nouns', 'num_verbs'])]

In [None]:
#Attach a predicted author label to a copy of the submission rows.
#The original cell referenced `test_data`, which is never defined in this notebook, and assigned
#`predictions` (a two-dimensional array of class probabilities) to a single column; both fail at run time.
# NOTE(review): assumes the labels should come from the part-of-speech model (pos_clf), since the
# cell originally opened with `pos_preds` - confirm the intended model.
results = submission.copy()
results['predicted_label'] = pos_clf.predict(submission)
results.head()