In [10]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer # tfidf weighting
import re
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest, RandomForestClassifier
import numpy as np
import matplotlib.pyplot as plt
import pickle
import plotly.express as px
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score, precision_score,f1_score, make_scorer
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import LocallyLinearEmbedding, TSNE
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import HistGradientBoostingClassifier
'''nltk.download('stopwords')

nltk.download('wordnet')

nltk.download('punkt')'''

'''%cd "/Users/keeganmoseley/Desktop/Roux/CS6140 - Machine Learning/Final Project"
os.listdir()'''
cuneiform_whole = pd.read_csv("train.csv")
cuneiform_df = cuneiform_whole.sample(frac=0.25, random_state=251)
cuneiform_df.reset_index(drop=True, inplace = True)

In [11]:
def pca_assessment(cuneiform_df, vectorizer, pca):
    #split the data into training and testing sets
    X = cuneiform_df['cuneiform']
    y = cuneiform_df['lang']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=251)

    train_df = pd.DataFrame({'cuneiform': X_train, 'lang': y_train})
    test_df = pd.DataFrame({'cuneiform': X_test, 'lang': y_test})
    train_df.sort_values(by='lang', inplace=True)
    test_df.sort_values(by='lang', inplace=True)

    train_df.reset_index(drop=True, inplace = True)
    test_df.reset_index(drop=True, inplace = True)

    print("Data has been split!")

    #-----------------------Create a DTM---------------------------------------------

    #train the model
    tfidf_model = vectorizer.fit(train_df["cuneiform"])

    #apply it to the training data
    tfidf_matrix_train = vectorizer.transform(train_df["cuneiform"])

    #turn results into a document term matrix
    dense_matrix_train = tfidf_matrix_train.todense()   

    # Retrieve feature names (tokens)
    feature_names = vectorizer.get_feature_names_out() 

    # Create a DataFrame with the dense matrix and feature names as columns
    tfidf_df_train = pd.DataFrame(dense_matrix_train, columns=feature_names)

    #Create document term matrix of Test data
    tfidf_matrix_test = vectorizer.transform(test_df["cuneiform"])

    #turn results into a document term matrix
    dense_matrix_test = tfidf_matrix_test.todense()   

    # Retrieve feature names (tokens)
    feature_names = vectorizer.get_feature_names_out() 

    # Create a DataFrame with the dense matrix and feature names as columns
    tfidf_df_test = pd.DataFrame(dense_matrix_test, columns=feature_names)

    #----------------------- Preprocess and Standardize the Data ----------------------

    #Standardize data
    scaler = StandardScaler()
    scaler.fit(tfidf_df_train)
    training_df_standardized = pd.DataFrame(scaler.transform(tfidf_df_train), columns=tfidf_df_train.columns) #dataframe of standardized data
    test_df_standardized = pd.DataFrame(scaler.transform(tfidf_df_test), columns=tfidf_df_test.columns)

    #find local outliers with LOF
    lof = LocalOutlierFactor(n_neighbors=20, contamination=0.01)
    # predict the labels for each data point (as Outlier or inlier)
    pred_lof = lof.fit_predict(training_df_standardized)

    #find outliers with isolation forrest
    iforest = IsolationForest(n_estimators=100,  contamination=0.01)
    # predict the labels for each data point (as Outlier or inlier)
    pred_if = iforest.fit_predict(training_df_standardized)

    #Add columns of the outlier results
    training_df_standardized["pred_lof"] = pred_lof
    training_df_standardized["pred_if"] = pred_if

    '''#find how many outliers there are
    lof_count = (training_df_standardized["pred_lof"] == -1).sum()
    if_count = (training_df_standardized["pred_if"] == -1).sum()
    total_count = ((training_df_standardized["pred_lof"] == -1) & (training_df_standardized["pred_if"] == -1)).sum()

    print("Outliers found by Local Outlier Factor :", lof_count)
    print("Outliers found by Isolation Forrest : ",if_count)
    print("Outliers Tagged by Both : ",total_count)

    percentage_outliers = (lof_count + if_count)/(training_df_standardized.shape[0])
    print(percentage_outliers * 100,'% of the training data are outliers')'''

    #remove the outliers
    outliers = training_df_standardized[(training_df_standardized["pred_lof"] == -1) | (training_df_standardized["pred_if"] == -1)][["pred_lof", "pred_if"]]
    #print("Outliers Found By Either Algorithm :")
    #display(outliers)

    #remove outliers from dataframe
    outlier_indices = outliers.index
    training_df_standardized.drop(outlier_indices, axis=0)
    train_df.drop(outlier_indices, axis= 0)


    # ---------------------------- Dimension Reduction -------------------------
    x_subset_train = training_df_standardized.drop(['pred_lof', 'pred_if'], axis=1).values
    y_subset_train = train_df["lang"].values

    x_subset_test = test_df_standardized.values
    y_subset_test = test_df["lang"].values

    pca_trained = pca.fit(x_subset_train)
    pca_train_results = pca_trained.transform(x_subset_train)
    pca_test_results = pca_trained.transform(x_subset_test)

    #evaluate the pc variance captured
    explained_variance_ratio = pca_trained.explained_variance_ratio_
    variance_df = pd.DataFrame(data={
        'Principal Component': [f'PC{i+1}' for i in range(len(explained_variance_ratio))],
        'Explained Variance' : explained_variance_ratio,
        'Cumulative Variance' : explained_variance_ratio.cumsum()
    })

    return [pca_train_results, pca_test_results, y_subset_train, y_subset_test, variance_df]

def pca_no_outliers_removed(cuneiform_df, vectorizer, pca):
    #split the data into training and testing sets
    X = cuneiform_df['cuneiform']
    y = cuneiform_df['lang']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=251)

    train_df = pd.DataFrame({'cuneiform': X_train, 'lang': y_train})
    test_df = pd.DataFrame({'cuneiform': X_test, 'lang': y_test})
    train_df.sort_values(by='lang', inplace=True)
    test_df.sort_values(by='lang', inplace=True)

    train_df.reset_index(drop=True, inplace = True)
    test_df.reset_index(drop=True, inplace = True)

    print("Data has been split!")

    #-----------------------Create a DTM---------------------------------------------

    #train the model
    tfidf_model = vectorizer.fit(train_df["cuneiform"])

    #apply it to the training data
    tfidf_matrix_train = vectorizer.transform(train_df["cuneiform"])

    #turn results into a document term matrix
    dense_matrix_train = tfidf_matrix_train.todense()   

    # Retrieve feature names (tokens)
    feature_names = vectorizer.get_feature_names_out() 

    # Create a DataFrame with the dense matrix and feature names as columns
    tfidf_df_train = pd.DataFrame(dense_matrix_train, columns=feature_names)

    #Create document term matrix of Test data
    tfidf_matrix_test = vectorizer.transform(test_df["cuneiform"])

    #turn results into a document term matrix
    dense_matrix_test = tfidf_matrix_test.todense()   

    # Retrieve feature names (tokens)
    feature_names = vectorizer.get_feature_names_out() 

    # Create a DataFrame with the dense matrix and feature names as columns
    tfidf_df_test = pd.DataFrame(dense_matrix_test, columns=feature_names)

    #----------------------- Preprocess and Standardize the Data ----------------------

    #Standardize data
    scaler = StandardScaler()
    scaler.fit(tfidf_df_train)
    training_df_standardized = pd.DataFrame(scaler.transform(tfidf_df_train), columns=tfidf_df_train.columns) #dataframe of standardized data
    test_df_standardized = pd.DataFrame(scaler.transform(tfidf_df_test), columns=tfidf_df_test.columns)

    # ---------------------------- Dimension Reduction -------------------------
    x_subset_train = training_df_standardized
    y_subset_train = train_df["lang"].values

    x_subset_test = test_df_standardized.values
    y_subset_test = test_df["lang"].values

    pca_trained = pca.fit(x_subset_train)
    pca_train_results = pca_trained.transform(x_subset_train)
    pca_test_results = pca_trained.transform(x_subset_test)

    #evaluate the pc variance captured
    explained_variance_ratio = pca_trained.explained_variance_ratio_
    variance_df = pd.DataFrame(data={
        'Principal Component': [f'PC{i+1}' for i in range(len(explained_variance_ratio))],
        'Explained Variance' : explained_variance_ratio,
        'Cumulative Variance' : explained_variance_ratio.cumsum()
    })

    return [pca_train_results, pca_test_results, y_subset_train, y_subset_test, variance_df]


In [12]:
svd_results_outliers_removed = pca_assessment(cuneiform_df, TfidfVectorizer(analyzer='char'), TruncatedSVD(n_components=400, random_state=251))

Data has been split!


In [13]:
svd_results_no_outliers_removed = pca_no_outliers_removed(cuneiform_df, TfidfVectorizer(analyzer='char'), TruncatedSVD(n_components=400, random_state=251))

Data has been split!




In [14]:
print("SVD With Outlier Removal :")
display(svd_results_outliers_removed[4])

print("\nSVD Without Outlier Removal :")
display(svd_results_no_outliers_removed[4])

SVD With Outlier Removal :


Unnamed: 0,Principal Component,Explained Variance,Cumulative Variance
0,PC1,0.005051,0.005051
1,PC2,0.004145,0.009196
2,PC3,0.004045,0.013242
3,PC4,0.003775,0.017016
4,PC5,0.003603,0.020620
...,...,...,...
395,PC396,0.001681,0.874949
396,PC397,0.001677,0.876625
397,PC398,0.001666,0.878292
398,PC399,0.001661,0.879953



SVD Without Outlier Removal :


Unnamed: 0,Principal Component,Explained Variance,Cumulative Variance
0,PC1,0.005051,0.005051
1,PC2,0.004145,0.009196
2,PC3,0.004045,0.013242
3,PC4,0.003775,0.017016
4,PC5,0.003603,0.020620
...,...,...,...
395,PC396,0.001681,0.874949
396,PC397,0.001677,0.876625
397,PC398,0.001666,0.878292
398,PC399,0.001661,0.879953


In [15]:
def grid_search_func(x_subset, y_subset, model, param_grid, iterations):
    #grid = RandomizedSearchCV(model, param_grid, random_state=251, scoring='f1_samples' ,n_iter=iterations, n_jobs=-1, return_train_score=True)

    grid = RandomizedSearchCV(model, param_grid, random_state=251, scoring = 'f1_weighted', n_iter=iterations, n_jobs=-1, return_train_score=True)

    #fit the model
    grid.fit(x_subset, y_subset)

    #best parameters
    best_params = grid.best_params_

    #df of parameters and their r2 scores
    param_results = pd.DataFrame(grid.cv_results_)
    
    return [best_params, param_results]

In [16]:

gradient_boost_grid = {
    'random_state' : [251],
    'loss' : ['log_loss'],
    'learning_rate' : [0.1],
    'max_iter' : [200, 500],
    'max_depth' : [100, 200, 500],
}
#{'random_state': 251, 'max_iter': 200, 'max_depth': 100, 'loss': 'log_loss', 'learning_rate': 0.1}	 best params

In [17]:
#remove outliers
gb_result_outliers_removed = grid_search_func(svd_results_outliers_removed[0], svd_results_outliers_removed[2], HistGradientBoostingClassifier(), gradient_boost_grid, 5)
gb_param_df_outliers_removed = gb_result_outliers_removed[1][['params','mean_test_score', 'rank_test_score']].sort_values(by = 'rank_test_score', kind = 'mergesort')

print("\nGrid Results of Gradient Boost Model using TFIDFVectorizer, Outlier Removal, then 400 Component Truncated SVD")
display(gb_param_df_outliers_removed)
print("Best Parameters : ", gb_result_outliers_removed[0])


Grid Results of Gradient Boost Model using TFIDFVectorizer, Outlier Removal, then 400 Component Truncated SVD


Unnamed: 0,params,mean_test_score,rank_test_score
0,"{'random_state': 251, 'max_iter': 200, 'max_de...",0.746006,1
1,"{'random_state': 251, 'max_iter': 200, 'max_de...",0.746006,1
2,"{'random_state': 251, 'max_iter': 500, 'max_de...",0.746006,1
3,"{'random_state': 251, 'max_iter': 200, 'max_de...",0.746006,1
4,"{'random_state': 251, 'max_iter': 500, 'max_de...",0.746006,1


Best Parameters :  {'random_state': 251, 'max_iter': 200, 'max_depth': 100, 'loss': 'log_loss', 'learning_rate': 0.1}


In [18]:
#keep outliers
gb_result_no_outliers_removed = grid_search_func(svd_results_no_outliers_removed[0], svd_results_no_outliers_removed[2], HistGradientBoostingClassifier(), gradient_boost_grid, 5)
gb_param_df_no_outliers_removed = gb_result_no_outliers_removed[1][['params','mean_test_score', 'rank_test_score']].sort_values(by = 'rank_test_score', kind = 'mergesort')

print("\nGrid Results of Gradient Boost Model using TFIDFVectorizer, No Outlier Removal, then 400 Component Truncated SVD")
display(gb_param_df_no_outliers_removed)
print("Best Parameters : ", gb_result_no_outliers_removed[0])


Grid Results of Gradient Boost Model using TFIDFVectorizer, No Outlier Removal, then 400 Component Truncated SVD


Unnamed: 0,params,mean_test_score,rank_test_score
0,"{'random_state': 251, 'max_iter': 200, 'max_de...",0.746006,1
1,"{'random_state': 251, 'max_iter': 200, 'max_de...",0.746006,1
2,"{'random_state': 251, 'max_iter': 500, 'max_de...",0.746006,1
3,"{'random_state': 251, 'max_iter': 200, 'max_de...",0.746006,1
4,"{'random_state': 251, 'max_iter': 500, 'max_de...",0.746006,1


Best Parameters :  {'random_state': 251, 'max_iter': 200, 'max_depth': 100, 'loss': 'log_loss', 'learning_rate': 0.1}
