##### This file is to determine if outlier detection leads to better or worse results. On one hand, removing outliers makes the model more generalized. But on the other hand, the classes are imbalanced in this dataset, and some classes have rare symbols that are unique to them. This could lead to rows that are important for training on these smaller categories being identified as outliers, or rows with infrequent symbols being identified as outliers. 

##### I am also going to continue experimenting with min_df to reduce dimensions. Finally, I will determine if I am getting better results with CountVectorization of TFIDFVectorization, since in  "Cuneiform Identification Dimension Reduction and Model Exploration.ipynb" the CountVectorized data was very competitive with, and sometimes performed better, than TFIDF.

In [13]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer # tfidf weighting
from sklearn.feature_extraction.text import CountVectorizer #frequency distribution
import re
import os
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.covariance import EllipticEnvelope # Mahalanobis Distance
import numpy as np
import matplotlib.pyplot as plt
import pickle
import plotly.express as px
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score, precision_score,f1_score, make_scorer
import seaborn as sns
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier
'''nltk.download('stopwords')

nltk.download('wordnet')

nltk.download('punkt')'''

%cd "/Users/keeganmoseley/Desktop/Roux/CS6140 - Machine Learning/Final Project"
os.listdir()
cuneiform_whole = pd.read_csv("train.csv")
cuneiform_df = cuneiform_whole.sample(frac=0.25, random_state=251)
cuneiform_df.reset_index(drop=True, inplace = True)

[WinError 3] The system cannot find the path specified: '/Users/keeganmoseley/Desktop/Roux/CS6140 - Machine Learning/Final Project'
c:\Users\keega\Desktop\Coding\ML Final


In [14]:

def preprocess_data_remove_outliers(cuneiform_df, vectorizer):
    #split the data into training and testing sets
    X = cuneiform_df['cuneiform']
    y = cuneiform_df['lang']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=251)

    train_df = pd.DataFrame({'cuneiform': X_train, 'lang': y_train})
    test_df = pd.DataFrame({'cuneiform': X_test, 'lang': y_test})
    train_df.sort_values(by='lang', inplace=True)
    test_df.sort_values(by='lang', inplace=True)

    train_df.reset_index(drop=True, inplace = True)
    test_df.reset_index(drop=True, inplace = True)

    print("Data has been split!")

    #-----------------------Create a DTM---------------------------------------------

    #train the model
    tfidf_model = vectorizer.fit(train_df["cuneiform"])

    #apply it to the training data
    tfidf_matrix_train = vectorizer.transform(train_df["cuneiform"])

    #turn results into a document term matrix
    dense_matrix_train = tfidf_matrix_train.todense()   

    # Retrieve feature names (tokens)
    feature_names = vectorizer.get_feature_names_out() 

    # Create a DataFrame with the dense matrix and feature names as columns
    tfidf_df_train = pd.DataFrame(dense_matrix_train, columns=feature_names)

    #Create document term matrix of Test data
    tfidf_matrix_test = vectorizer.transform(test_df["cuneiform"])

    #turn results into a document term matrix
    dense_matrix_test = tfidf_matrix_test.todense()   

    # Retrieve feature names (tokens)
    feature_names = vectorizer.get_feature_names_out() 

    # Create a DataFrame with the dense matrix and feature names as columns
    tfidf_df_test = pd.DataFrame(dense_matrix_test, columns=feature_names)

    #----------------------- Preprocess and Standardize the Data ----------------------

    #Standardize data
    scaler = StandardScaler()
    scaler.fit(tfidf_df_train)
    training_df_standardized = pd.DataFrame(scaler.transform(tfidf_df_train), columns=tfidf_df_train.columns) #dataframe of standardized data
    test_df_standardized = pd.DataFrame(scaler.transform(tfidf_df_test), columns=tfidf_df_test.columns)

    #find local outliers with LOF
    lof = LocalOutlierFactor(n_neighbors=20, contamination=0.01)
    # predict the labels for each data point (as Outlier or inlier)
    pred_lof = lof.fit_predict(training_df_standardized)

    #find outliers with isolation forrest
    iforest = IsolationForest(n_estimators=100,  contamination=0.01)
    # predict the labels for each data point (as Outlier or inlier)
    pred_if = iforest.fit_predict(training_df_standardized)

    #Add columns of the outlier results
    training_df_standardized["pred_lof"] = pred_lof
    training_df_standardized["pred_if"] = pred_if

    '''#find how many outliers there are
    lof_count = (training_df_standardized["pred_lof"] == -1).sum()
    if_count = (training_df_standardized["pred_if"] == -1).sum()
    total_count = ((training_df_standardized["pred_lof"] == -1) & (training_df_standardized["pred_if"] == -1)).sum()

    print("Outliers found by Local Outlier Factor :", lof_count)
    print("Outliers found by Isolation Forrest : ",if_count)
    print("Outliers Tagged by Both : ",total_count)

    percentage_outliers = (lof_count + if_count)/(training_df_standardized.shape[0])
    print(percentage_outliers * 100,'% of the training data are outliers')'''

    #remove the outliers
    outliers = training_df_standardized[(training_df_standardized["pred_lof"] == -1) | (training_df_standardized["pred_if"] == -1)][["pred_lof", "pred_if"]]
    #print("Outliers Found By Either Algorithm :")
    #display(outliers)

    #remove outliers from dataframe
    outlier_indices = outliers.index
    training_df_standardized.drop(outlier_indices, axis=0)
    train_df.drop(outlier_indices, axis= 0)


    x_subset_train = training_df_standardized.drop(['pred_lof', 'pred_if'], axis=1).values
    y_subset_train = train_df["lang"].values

    x_subset_test = test_df_standardized.values
    y_subset_test = test_df["lang"].values

    return [x_subset_train, x_subset_test, y_subset_train, y_subset_test]

def preprocess_data_keep_outliers(cuneiform_df, vectorizer):
    #split the data into training and testing sets
    X = cuneiform_df['cuneiform']
    y = cuneiform_df['lang']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=251)

    train_df = pd.DataFrame({'cuneiform': X_train, 'lang': y_train})
    test_df = pd.DataFrame({'cuneiform': X_test, 'lang': y_test})
    train_df.sort_values(by='lang', inplace=True)
    test_df.sort_values(by='lang', inplace=True)

    train_df.reset_index(drop=True, inplace = True)
    test_df.reset_index(drop=True, inplace = True)

    print("Data has been split!")

    #-----------------------Create a DTM---------------------------------------------

    #train the model
    tfidf_model = vectorizer.fit(train_df["cuneiform"])

    #apply it to the training data
    tfidf_matrix_train = vectorizer.transform(train_df["cuneiform"])

    #turn results into a document term matrix
    dense_matrix_train = tfidf_matrix_train.todense()   

    # Retrieve feature names (tokens)
    feature_names = vectorizer.get_feature_names_out() 

    # Create a DataFrame with the dense matrix and feature names as columns
    tfidf_df_train = pd.DataFrame(dense_matrix_train, columns=feature_names)

    #Create document term matrix of Test data
    tfidf_matrix_test = vectorizer.transform(test_df["cuneiform"])

    #turn results into a document term matrix
    dense_matrix_test = tfidf_matrix_test.todense()   

    # Retrieve feature names (tokens)
    feature_names = vectorizer.get_feature_names_out() 

    # Create a DataFrame with the dense matrix and feature names as columns
    tfidf_df_test = pd.DataFrame(dense_matrix_test, columns=feature_names)

    #----------------------- Preprocess and Standardize the Data ----------------------

    #Standardize data
    scaler = StandardScaler()
    scaler.fit(tfidf_df_train)
    training_df_standardized = pd.DataFrame(scaler.transform(tfidf_df_train), columns=tfidf_df_train.columns) #dataframe of standardized data
    test_df_standardized = pd.DataFrame(scaler.transform(tfidf_df_test), columns=tfidf_df_test.columns)

    x_subset_train = training_df_standardized.values
    y_subset_train = train_df["lang"].values

    x_subset_test = test_df_standardized.values
    y_subset_test = test_df["lang"].values

    return [x_subset_train, x_subset_test, y_subset_train, y_subset_test]


'''stock_tfidf_result = preprocess_data(cuneiform_df, TfidfVectorizer(analyzer='char'))
stock_count_result = preprocess_data(cuneiform_df, CountVectorizer(analyzer='char'))'''

"stock_tfidf_result = preprocess_data(cuneiform_df, TfidfVectorizer(analyzer='char'))\nstock_count_result = preprocess_data(cuneiform_df, CountVectorizer(analyzer='char'))"

In [34]:
#experiment with min/max df too 
vectorizer = TfidfVectorizer(analyzer='char')

#train the model
vectorizer_results = vectorizer.fit_transform(cuneiform_whole["cuneiform"])

#turn results into a document term matrix
dense_matrix = vectorizer_results.todense()   

# Retrieve feature names (tokens)
feature_names = vectorizer.get_feature_names_out() 

# Create a DataFrame with the dense matrix and feature names as columns
tfidf_df = pd.DataFrame(dense_matrix, columns=feature_names)

# Display the DataFrame
display(tfidf_df)

Unnamed: 0,Unnamed: 1,𒀀,𒀁,𒀂,𒀅,𒀆,𒀈,𒀉,𒀊,𒀋,...,𒓙,𒓚,𒓥,𒓭,𒓯,𒔃,𒔊,𒔋,𒔎,𒔚
0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.174883,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.268276,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.268276,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.202143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139416,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
139417,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
139418,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
139419,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
#experiment with min/max df too 
vectorizer = TfidfVectorizer(analyzer='char', min_df=25)

#train the model
vectorizer_results = vectorizer.fit_transform(cuneiform_whole["cuneiform"])

#turn results into a document term matrix
dense_matrix = vectorizer_results.todense()   

# Retrieve feature names (tokens)
feature_names = vectorizer.get_feature_names_out() 

# Create a DataFrame with the dense matrix and feature names as columns
tfidf_df = pd.DataFrame(dense_matrix, columns=feature_names)

# Display the DataFrame
display(tfidf_df)

Unnamed: 0,𒀀,𒀉,𒀊,𒀏,𒀕,𒀖,𒀚,𒀜,𒀝,𒀞,...,𒐊,𒐐,𒐕,𒐴,𒐼,𒑏,𒑚,𒑛,𒓊,𒔃
0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.174883,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.268276,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.268276,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.202143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139416,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
139417,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
139418,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
139419,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
#experiment with min/max df too 
vectorizer = TfidfVectorizer(analyzer='char', min_df=50)

#train the model
vectorizer_results = vectorizer.fit_transform(cuneiform_whole["cuneiform"])

#turn results into a document term matrix
dense_matrix = vectorizer_results.todense()   

# Retrieve feature names (tokens)
feature_names = vectorizer.get_feature_names_out() 

# Create a DataFrame with the dense matrix and feature names as columns
tfidf_df = pd.DataFrame(dense_matrix, columns=feature_names)

# Display the DataFrame
display(tfidf_df)

Unnamed: 0,𒀀,𒀉,𒀊,𒀏,𒀕,𒀖,𒀚,𒀜,𒀝,𒀞,...,𒍪,𒍮,𒐊,𒐐,𒐕,𒐴,𒑏,𒑚,𒑛,𒓊
0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.174883,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.268276,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.268276,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.202143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139416,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
139417,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
139418,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
139419,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
outliers_removed_all_tokens = preprocess_data_remove_outliers(cuneiform_df, TfidfVectorizer(analyzer='char' ))
outliers_removed_pruned_tokens = preprocess_data_remove_outliers(cuneiform_df, TfidfVectorizer(analyzer='char', min_df=25))

outliers_kept_all_tokens = preprocess_data_keep_outliers(cuneiform_df, TfidfVectorizer(analyzer='char' ))
outliers_kept_pruned_tokens = preprocess_data_keep_outliers(cuneiform_df, TfidfVectorizer(analyzer='char', min_df=25))

Data has been split!
Data has been split!
Data has been split!
Data has been split!


In [17]:
def grid_search_func(x_subset, y_subset, model, param_grid, iterations):
    #grid = RandomizedSearchCV(model, param_grid, random_state=251, scoring='f1_samples' ,n_iter=iterations, n_jobs=-1, return_train_score=True)

    grid = RandomizedSearchCV(model, param_grid, random_state=251, scoring = 'f1_weighted', n_iter=iterations, n_jobs=-1, return_train_score=True)

    #fit the model
    grid.fit(x_subset, y_subset)

    #best parameters
    best_params = grid.best_params_

    #df of parameters and their r2 scores
    param_results = pd.DataFrame(grid.cv_results_)
    
    return [best_params, param_results]

### Random Forrest

In [18]:
random_forrest_grid = {
    'n_jobs' : [-1],
    'random_state' : [251],
    'criterion' : ['log_loss'],
    'max_depth' : [200, 300, 400],
    'min_samples_split' : [2, 4, 8],
    'min_samples_leaf' : [1, 2, 4],
    'max_features' : ['sqrt', 0.2, 0.5]
}

#Best Results from Dimension Reduction and Model Exploration file
'''Grid Results of Random Forrest Model, with TFIDF and No Dimension Reduction
params	mean_test_score	rank_test_score
0	{'random_state': 251, 'n_jobs': -1, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 200, 'criterion': 'log_loss'}	0.771084	1
6	{'random_state': 251, 'n_jobs': -1, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 0.2, 'max_depth': 200, 'criterion': 'log_loss'}	0.761744	2
7	{'random_state': 251, 'n_jobs': -1, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_features': 0.2, 'max_depth': 200, 'criterion': 'log_loss'}	0.757923	3
'''
#-----------------Scores with outlier removal, no min/max df -------------------
outliers_removed_no_prune_result_rf = grid_search_func(outliers_removed_all_tokens[0], outliers_removed_all_tokens[2] , RandomForestClassifier(), random_forrest_grid, 10)
outliers_removed_params_no_prune_rf = outliers_removed_no_prune_result_rf[1][['params','mean_test_score', 'rank_test_score']].sort_values(by = 'rank_test_score', kind = 'mergesort')

print("\nGrid Results of Random Forrest Model, With Outlier Removal and No min/max df")
display(outliers_removed_params_no_prune_rf)
print("Best Parameters : ", outliers_removed_no_prune_result_rf[0])


Grid Results of Random Forrest Model, With Outlier Removal and No min/max df


Unnamed: 0,params,mean_test_score,rank_test_score
0,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.771953,1
6,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.761745,2
7,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.757923,3
4,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.755876,4
2,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.752705,5
1,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.738806,6
3,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.738806,6
9,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.738806,6
5,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.718809,9
8,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.718809,9


Best Parameters :  {'random_state': 251, 'n_jobs': -1, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 400, 'criterion': 'log_loss'}


In [19]:
#-----------------Outliers removed, and Vectorized Pruning -------------------
outliers_removed_pruned_result_rf = grid_search_func(outliers_removed_pruned_tokens[0], outliers_removed_pruned_tokens[2] , RandomForestClassifier(), random_forrest_grid, 10)
outliers_removed_params_pruned_rf = outliers_removed_pruned_result_rf[1][['params','mean_test_score', 'rank_test_score']].sort_values(by = 'rank_test_score', kind = 'mergesort')

print("\nGrid Results of Random Forrest Model, With Outlier Removal, min_df = 25")
display(outliers_removed_params_pruned_rf)
print("Best Parameters : ", outliers_removed_pruned_result_rf[0])


Grid Results of Random Forrest Model, With Outlier Removal, min_df = 25


Unnamed: 0,params,mean_test_score,rank_test_score
0,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.768242,1
7,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.758944,2
6,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.758851,3
4,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.755668,4
2,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.752016,5
1,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.739126,6
3,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.739126,6
9,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.739126,6
5,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.725703,9
8,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.725703,9


Best Parameters :  {'random_state': 251, 'n_jobs': -1, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 400, 'criterion': 'log_loss'}


In [20]:
#-----------------Scores without outlier removal, no min/max df -------------------
outliers_kept_no_prune_result_rf = grid_search_func(outliers_kept_all_tokens[0], outliers_kept_all_tokens[2] , RandomForestClassifier(), random_forrest_grid, 10)
outliers_kept_params_no_prune_rf = outliers_kept_no_prune_result_rf[1][['params','mean_test_score', 'rank_test_score']].sort_values(by = 'rank_test_score', kind = 'mergesort')

print("\nGrid Results of Random Forrest Model, Without Outlier Removal and No min/max df")
display(outliers_kept_params_no_prune_rf)
print("Best Parameters : ", outliers_kept_no_prune_result_rf[0])


Grid Results of Random Forrest Model, Without Outlier Removal and No min/max df


Unnamed: 0,params,mean_test_score,rank_test_score
0,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.771953,1
6,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.761745,2
7,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.757923,3
4,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.755876,4
2,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.752705,5
1,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.738806,6
3,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.738806,6
9,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.738806,6
5,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.718809,9
8,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.718809,9


Best Parameters :  {'random_state': 251, 'n_jobs': -1, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 400, 'criterion': 'log_loss'}


In [21]:
#-----------------Outliers kept, and Vectorized Pruning -------------------
outliers_kept_pruned_result_rf = grid_search_func(outliers_kept_pruned_tokens[0], outliers_kept_pruned_tokens[2] , RandomForestClassifier(), random_forrest_grid, 10)
outliers_kept_params_pruned_rf = outliers_kept_pruned_result_rf[1][['params','mean_test_score', 'rank_test_score']].sort_values(by = 'rank_test_score', kind = 'mergesort')

print("\nGrid Results of Random Forrest Model, Without Outlier Removal, min_df = 25")
display(outliers_kept_params_pruned_rf)
print("Best Parameters : ", outliers_kept_pruned_result_rf[0])


Grid Results of Random Forrest Model, Without Outlier Removal, min_df = 25


Unnamed: 0,params,mean_test_score,rank_test_score
0,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.768242,1
7,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.758944,2
6,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.758851,3
4,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.755668,4
2,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.752016,5
1,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.739126,6
3,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.739126,6
9,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.739126,6
5,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.725703,9
8,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.725703,9


Best Parameters :  {'random_state': 251, 'n_jobs': -1, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 400, 'criterion': 'log_loss'}


### Gradient Boost

In [24]:
gradient_boost_grid = {
    'random_state' : [251],
    'loss' : ['log_loss'],
    'learning_rate' : [0.1, 0.2],
    'max_iter' : [200, 300],
    'max_depth' : [200, 300],
}

'''Grid Results of Gradient Boost Model, with TFIDF and No Dimension Reduction
params	mean_test_score	rank_test_score
0	{'random_state': 251, 'max_iter': 200, 'max_depth': 100, 'loss': 'log_loss', 'learning_rate': 0.1}	0.794158	1
8	{'random_state': 251, 'max_iter': 200, 'max_depth': 200, 'loss': 'log_loss', 'learning_rate': 0.1}	0.794158	1
1	{'random_state': 251, 'max_iter': 200, 'max_depth': 100, 'loss': 'log_loss', 'learning_rate': 0.2}	0.790683	3'''


"Grid Results of Gradient Boost Model, with TFIDF and No Dimension Reduction\nparams\tmean_test_score\trank_test_score\n0\t{'random_state': 251, 'max_iter': 200, 'max_depth': 100, 'loss': 'log_loss', 'learning_rate': 0.1}\t0.794158\t1\n8\t{'random_state': 251, 'max_iter': 200, 'max_depth': 200, 'loss': 'log_loss', 'learning_rate': 0.1}\t0.794158\t1\n1\t{'random_state': 251, 'max_iter': 200, 'max_depth': 100, 'loss': 'log_loss', 'learning_rate': 0.2}\t0.790683\t3"

In [25]:
#-----------------Scores with outlier removal, no min/max df -------------------
outliers_removed_no_prune_result_gb = grid_search_func(outliers_removed_all_tokens[0], outliers_removed_all_tokens[2] , HistGradientBoostingClassifier(), gradient_boost_grid, 10)
outliers_removed_params_no_prune_gb = outliers_removed_no_prune_result_gb[1][['params','mean_test_score', 'rank_test_score']].sort_values(by = 'rank_test_score', kind = 'mergesort')

print("\nGrid Results of Gradient Boost Model, With Outlier Removal and No min/max df")
display(outliers_removed_params_no_prune_gb)
print("Best Parameters : ", outliers_removed_no_prune_result_gb[0])




Grid Results of Gradient Boost Model, With Outlier Removal and No min/max df


Unnamed: 0,params,mean_test_score,rank_test_score
0,"{'random_state': 251, 'max_iter': 200, 'max_de...",0.794158,1
1,"{'random_state': 251, 'max_iter': 300, 'max_de...",0.794158,1
2,"{'random_state': 251, 'max_iter': 200, 'max_de...",0.794158,1
3,"{'random_state': 251, 'max_iter': 300, 'max_de...",0.794158,1
4,"{'random_state': 251, 'max_iter': 200, 'max_de...",0.790683,5
5,"{'random_state': 251, 'max_iter': 300, 'max_de...",0.790683,5
6,"{'random_state': 251, 'max_iter': 200, 'max_de...",0.790683,5
7,"{'random_state': 251, 'max_iter': 300, 'max_de...",0.790683,5


Best Parameters :  {'random_state': 251, 'max_iter': 200, 'max_depth': 200, 'loss': 'log_loss', 'learning_rate': 0.1}


In [26]:
#-----------------Scores with outlier removal, with Vectorized Pruning -------------------
outliers_removed_pruned_result_gb = grid_search_func(outliers_removed_pruned_tokens[0], outliers_removed_pruned_tokens[2] , HistGradientBoostingClassifier(), gradient_boost_grid, 10)
outliers_removed_params_pruned_gb = outliers_removed_pruned_result_gb[1][['params','mean_test_score', 'rank_test_score']].sort_values(by = 'rank_test_score', kind = 'mergesort')

print("\nGrid Results of Gradient Boost Model, With Outlier Removal, min_df = 25")
display(outliers_removed_params_pruned_gb)
print("Best Parameters : ", outliers_removed_pruned_result_gb[0])




Grid Results of Gradient Boost Model, With Outlier Removal, min_df = 25


Unnamed: 0,params,mean_test_score,rank_test_score
0,"{'random_state': 251, 'max_iter': 200, 'max_de...",0.79322,1
1,"{'random_state': 251, 'max_iter': 300, 'max_de...",0.79322,1
2,"{'random_state': 251, 'max_iter': 200, 'max_de...",0.79322,1
3,"{'random_state': 251, 'max_iter': 300, 'max_de...",0.79322,1
4,"{'random_state': 251, 'max_iter': 200, 'max_de...",0.789649,5
5,"{'random_state': 251, 'max_iter': 300, 'max_de...",0.789649,5
6,"{'random_state': 251, 'max_iter': 200, 'max_de...",0.789649,5
7,"{'random_state': 251, 'max_iter': 300, 'max_de...",0.789649,5


Best Parameters :  {'random_state': 251, 'max_iter': 200, 'max_depth': 200, 'loss': 'log_loss', 'learning_rate': 0.1}


In [27]:
#-----------------Scores without outlier removal, without Vectorized Pruning -------------------
outliers_kept_no_prune_result_gb = grid_search_func(outliers_kept_all_tokens[0], outliers_kept_all_tokens[2] , HistGradientBoostingClassifier(), gradient_boost_grid, 10)
outliers_kept_no_prune_params_gb = outliers_kept_no_prune_result_gb[1][['params','mean_test_score', 'rank_test_score']].sort_values(by = 'rank_test_score', kind = 'mergesort')

print("\nGrid Results of Gradient Boost Model, Without Outlier Removal and no min/max df")
display(outliers_kept_no_prune_params_gb)
print("Best Parameters : ", outliers_kept_no_prune_result_gb[0])




Grid Results of Gradient Boost Model, Without Outlier Removal and no min/max df


Unnamed: 0,params,mean_test_score,rank_test_score
0,"{'random_state': 251, 'max_iter': 200, 'max_de...",0.794158,1
1,"{'random_state': 251, 'max_iter': 300, 'max_de...",0.794158,1
2,"{'random_state': 251, 'max_iter': 200, 'max_de...",0.794158,1
3,"{'random_state': 251, 'max_iter': 300, 'max_de...",0.794158,1
4,"{'random_state': 251, 'max_iter': 200, 'max_de...",0.790683,5
5,"{'random_state': 251, 'max_iter': 300, 'max_de...",0.790683,5
6,"{'random_state': 251, 'max_iter': 200, 'max_de...",0.790683,5
7,"{'random_state': 251, 'max_iter': 300, 'max_de...",0.790683,5


Best Parameters :  {'random_state': 251, 'max_iter': 200, 'max_depth': 200, 'loss': 'log_loss', 'learning_rate': 0.1}


In [28]:
#-----------------Scores without outlier removal, with Vectorized Pruning -------------------
outliers_kept_pruned_result_gb = grid_search_func(outliers_kept_pruned_tokens[0], outliers_kept_pruned_tokens[2] , HistGradientBoostingClassifier(), gradient_boost_grid, 10)
outliers_kept_pruned_params_gb = outliers_kept_pruned_result_gb[1][['params','mean_test_score', 'rank_test_score']].sort_values(by = 'rank_test_score', kind = 'mergesort')

print("\nGrid Results of Gradient Boost Model, Without Outlier Removal, min_df = 25")
display(outliers_kept_pruned_params_gb)
print("Best Parameters : ", outliers_kept_pruned_result_gb[0])




Grid Results of Gradient Boost Model, Without Outlier Removal, min_df = 25


Unnamed: 0,params,mean_test_score,rank_test_score
0,"{'random_state': 251, 'max_iter': 200, 'max_de...",0.79322,1
1,"{'random_state': 251, 'max_iter': 300, 'max_de...",0.79322,1
2,"{'random_state': 251, 'max_iter': 200, 'max_de...",0.79322,1
3,"{'random_state': 251, 'max_iter': 300, 'max_de...",0.79322,1
4,"{'random_state': 251, 'max_iter': 200, 'max_de...",0.789649,5
5,"{'random_state': 251, 'max_iter': 300, 'max_de...",0.789649,5
6,"{'random_state': 251, 'max_iter': 200, 'max_de...",0.789649,5
7,"{'random_state': 251, 'max_iter': 300, 'max_de...",0.789649,5


Best Parameters :  {'random_state': 251, 'max_iter': 200, 'max_depth': 200, 'loss': 'log_loss', 'learning_rate': 0.1}


It seems like 2% outlier detection is not causing much of a difference. I'll remove outliers going forward. Min_df = 25 removes a bit under 200 columns but it only marginally decreased the best test score. I think this tradeoff is worth it, since the test scores are nearly the same but the curse of dimensionality is reduced. I am hoping that with the correct hyperparameters of the model, I will be able to get better performance with these settings. 

### CountVectorization vs TFIDFVectorization

In [39]:
tfidf_result = preprocess_data_keep_outliers(cuneiform_df, TfidfVectorizer(analyzer='char', min_df=25))
count_result = preprocess_data_keep_outliers(cuneiform_df, CountVectorizer(analyzer='char', min_df=25))

Data has been split!
Data has been split!


In [40]:
#Random Forrest and TFIDF
tfidf_grid_result_rf = grid_search_func(tfidf_result[0], tfidf_result[2] , RandomForestClassifier(), random_forrest_grid, 10)
tfidf_grid_params_rf = tfidf_grid_result_rf[1][['params','mean_test_score', 'rank_test_score']].sort_values(by = 'rank_test_score', kind = 'mergesort')

print("\nRandom Forrest Grid Results of TFIDF, With Outlier Removal and min_df = 25")
display(tfidf_grid_params_rf)
print("Best Parameters : ", tfidf_grid_result_rf[0])


Random Forrest Grid Results of TFIDF, With Outlier Removal and min_df = 25


Unnamed: 0,params,mean_test_score,rank_test_score
0,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.768242,1
7,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.758944,2
6,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.758851,3
4,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.755668,4
2,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.752016,5
1,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.739126,6
3,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.739126,6
9,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.739126,6
5,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.725703,9
8,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.725703,9


Best Parameters :  {'random_state': 251, 'n_jobs': -1, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 400, 'criterion': 'log_loss'}


In [41]:
#random forrest and CountVec
count_grid_result_rf = grid_search_func(count_result[0], count_result[2] , RandomForestClassifier(), random_forrest_grid, 10)
count_grid_params_rf = count_grid_result_rf[1][['params','mean_test_score', 'rank_test_score']].sort_values(by = 'rank_test_score', kind = 'mergesort')

print("\nRandom Forrest Grid Results of CountVectorization, With Outlier Removal and min_df = 25")
display(count_grid_params_rf)
print("Best Parameters : ", count_grid_result_rf[0])


Random Forrest Grid Results of CountVectorization, With Outlier Removal and min_df = 25


Unnamed: 0,params,mean_test_score,rank_test_score
0,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.756958,1
6,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.756326,2
7,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.754615,3
2,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.747753,4
4,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.745073,5
1,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.725056,6
3,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.725056,6
9,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.725056,6
5,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.705896,9
8,"{'random_state': 251, 'n_jobs': -1, 'min_sampl...",0.705896,9


Best Parameters :  {'random_state': 251, 'n_jobs': -1, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 400, 'criterion': 'log_loss'}


In [43]:
#Gradient Boost and TFIDF
tfidf_grid_result_gb = grid_search_func(tfidf_result[0], tfidf_result[2] , HistGradientBoostingClassifier(), gradient_boost_grid, 10)
tfidf_grid_params_gb = tfidf_grid_result_gb[1][['params','mean_test_score', 'rank_test_score']].sort_values(by = 'rank_test_score', kind = 'mergesort')

print("\nGradient Boost Grid Results of TFIDF, With Outlier Removal and min_df = 25")
display(tfidf_grid_params_gb)
print("Best Parameters : ", tfidf_grid_result_gb[0])




Gradient Boost Grid Results of TFIDF, With Outlier Removal and min_df = 25


Unnamed: 0,params,mean_test_score,rank_test_score
0,"{'random_state': 251, 'max_iter': 200, 'max_de...",0.79322,1
1,"{'random_state': 251, 'max_iter': 300, 'max_de...",0.79322,1
2,"{'random_state': 251, 'max_iter': 200, 'max_de...",0.79322,1
3,"{'random_state': 251, 'max_iter': 300, 'max_de...",0.79322,1
4,"{'random_state': 251, 'max_iter': 200, 'max_de...",0.789649,5
5,"{'random_state': 251, 'max_iter': 300, 'max_de...",0.789649,5
6,"{'random_state': 251, 'max_iter': 200, 'max_de...",0.789649,5
7,"{'random_state': 251, 'max_iter': 300, 'max_de...",0.789649,5


Best Parameters :  {'random_state': 251, 'max_iter': 200, 'max_depth': 200, 'loss': 'log_loss', 'learning_rate': 0.1}


In [44]:
#Gradient Boost and CountVec
count_grid_result_gb = grid_search_func(count_result[0], count_result[2] , HistGradientBoostingClassifier(), gradient_boost_grid, 10)
count_grid_params_gb = count_grid_result_gb[1][['params','mean_test_score', 'rank_test_score']].sort_values(by = 'rank_test_score', kind = 'mergesort')

print("\nGradient Boost Grid Results of CountVectorization, With Outlier Removal and min_df = 25")
display(count_grid_params_gb)
print("Best Parameters : ", count_grid_result_gb[0])




Gradient Boost Grid Results of CountVectorization, With Outlier Removal and min_df = 25


Unnamed: 0,params,mean_test_score,rank_test_score
1,"{'random_state': 251, 'max_iter': 300, 'max_de...",0.797764,1
3,"{'random_state': 251, 'max_iter': 300, 'max_de...",0.797764,1
0,"{'random_state': 251, 'max_iter': 200, 'max_de...",0.797623,3
2,"{'random_state': 251, 'max_iter': 200, 'max_de...",0.797623,3
4,"{'random_state': 251, 'max_iter': 200, 'max_de...",0.795926,5
5,"{'random_state': 251, 'max_iter': 300, 'max_de...",0.795926,5
6,"{'random_state': 251, 'max_iter': 200, 'max_de...",0.795926,5
7,"{'random_state': 251, 'max_iter': 300, 'max_de...",0.795926,5


Best Parameters :  {'random_state': 251, 'max_iter': 300, 'max_depth': 200, 'loss': 'log_loss', 'learning_rate': 0.1}


With Gradient Boost, Count Vectorization performed slightly better. However there is only a ~0.4% difference. TFIDF penalizes common symbols. I know from my experiments with max_df that there aren't any symbols that are all that common. So, it could be that the penalization of TFIDF isn't adding that much meaning. I didn't record these max_df tests, so I'll demonstrate around how low I need to make this value in order for any columns to be removed.

In [45]:
#experiment with min/max df too 
vectorizer = TfidfVectorizer(analyzer='char')

#train the model
vectorizer_results = vectorizer.fit_transform(cuneiform_whole["cuneiform"])

#turn results into a document term matrix
dense_matrix = vectorizer_results.todense()   

# Retrieve feature names (tokens)
feature_names = vectorizer.get_feature_names_out() 

# Create a DataFrame with the dense matrix and feature names as columns
tfidf_df = pd.DataFrame(dense_matrix, columns=feature_names)

# Display the DataFrame
print(tfidf_df.shape)

(139421, 550)


In [46]:
#experiment with min/max df too 
vectorizer = TfidfVectorizer(analyzer='char', max_df=0.3)

#train the model
vectorizer_results = vectorizer.fit_transform(cuneiform_whole["cuneiform"])

#turn results into a document term matrix
dense_matrix = vectorizer_results.todense()   

# Retrieve feature names (tokens)
feature_names = vectorizer.get_feature_names_out() 

# Create a DataFrame with the dense matrix and feature names as columns
tfidf_df = pd.DataFrame(dense_matrix, columns=feature_names)

# Display the DataFrame
print(tfidf_df.shape)

(139421, 550)


In [47]:
#experiment with min/max df too 
vectorizer = TfidfVectorizer(analyzer='char', max_df=0.25)

#train the model
vectorizer_results = vectorizer.fit_transform(cuneiform_whole["cuneiform"])

#turn results into a document term matrix
dense_matrix = vectorizer_results.todense()   

# Retrieve feature names (tokens)
feature_names = vectorizer.get_feature_names_out() 

# Create a DataFrame with the dense matrix and feature names as columns
tfidf_df = pd.DataFrame(dense_matrix, columns=feature_names)

# Display the DataFrame
print(tfidf_df.shape)

(139421, 549)


In [50]:
#experiment with min/max df too 
vectorizer = TfidfVectorizer(analyzer='char', max_df=0.15)

#train the model
vectorizer_results = vectorizer.fit_transform(cuneiform_whole["cuneiform"])

#turn results into a document term matrix
dense_matrix = vectorizer_results.todense()   

# Retrieve feature names (tokens)
feature_names = vectorizer.get_feature_names_out() 

# Create a DataFrame with the dense matrix and feature names as columns
tfidf_df = pd.DataFrame(dense_matrix, columns=feature_names)

# Display the DataFrame
print(tfidf_df.shape)

(139421, 547)


It is clear from these that no individual cuneiform symbol is appearing that frequently. That could be why the extra context that TFIDF adds in order to de-emphasize words such as "the" "a" ect. is actually harming a dataset with these attributes. 