In [1]:
import ast
import os
import pickle

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, make_scorer, precision_score, recall_score
from sklearn.model_selection import KFold, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm

# Switch the working directory to the project folder before importing the
# toolkit (presumably so the local `text_analysis_toolkit` module is
# importable - confirm). Relative paths used later in the notebook are
# resolved against this directory as well.
PROJECT_DIR = '/Users/ianduke/Desktop/for_tarak'
os.chdir(PROJECT_DIR)

from text_analysis_toolkit import TextAnalysisToolkit

# Single shared toolkit instance used by the cells below
toolkit = TextAnalysisToolkit()


## **S1**: Save transcript text to a dataframe. Clean + apply embedding function.

In [None]:
# Save all transcripts to one csv
#
# Fill in the three paths below before running this cell.
directory_path = None   # TODO: ADD PATH - directory scanned by collect_txt_files_data
output_path_csv = None  # TODO: ADD PATH - CSV file the collected transcripts are written to
labels_path = None      # TODO: ADD PATH - labels CSV (later cells read its 'File' and 'Search?' columns)

# Fail early with a clear message rather than an obscure error further down.
unset_paths = [
    name
    for name, value in (
        ('directory_path', directory_path),
        ('output_path_csv', output_path_csv),
        ('labels_path', labels_path),
    )
    if not value
]
if unset_paths:
    raise ValueError(f"Set the following path(s) before running this cell: {', '.join(unset_paths)}")

# Collect every transcript and write them all to a single CSV
filepaths, contents = toolkit.collect_txt_files_data(directory_path)
toolkit.save_data_to_csv(filepaths, contents, output_path_csv)

# Read the combined CSV back in; later cells use its 'filepath' and
# 'transcript' columns.
data = pd.read_csv(output_path_csv)

labels = pd.read_csv(labels_path)

In [3]:
# The filename is whatever follows the last '/' in the filepath
data['filename'] = data['filepath'].str.rsplit('/', n=1).str[-1]

# Keep only transcripts that have a row in the labels table, and renumber
# the remaining rows from 0
has_label = data['filename'].isin(labels['File'])
data = data[has_label].reset_index(drop=True)

In [4]:
# Run the toolkit's cleaning routine over every transcript and store the
# cleaned text back in the same column
cleaned_transcripts = data['transcript'].apply(toolkit.clean_transcript)
data['transcript'] = cleaned_transcripts

In [5]:
# Encode the sentences of every transcript with SBERT. Later cells read the
# 'embeddings' column from the returned dataframe.
data_encoded_sbert = toolkit.encode_sentences_sbert(
    data,
    'transcript',
)

In [8]:
# Clean transcription and embedding columns
def _parse_embedding(value):
    """Return an embedding as a numpy array.

    String values (e.g. embeddings that were serialised to text) are parsed
    with ``ast.literal_eval`` and wrapped in ``np.array``; any other value
    is returned unchanged.
    """
    if isinstance(value, str):
        return np.array(ast.literal_eval(value))
    return value


# Missing transcripts become empty strings; everything else is coerced to str
data_encoded_sbert['transcript'] = data_encoded_sbert['transcript'].apply(lambda x: str(x) if pd.notnull(x) else "")
data_encoded_sbert['embeddings'] = data_encoded_sbert['embeddings'].apply(_parse_embedding)

### **S2**: Create one dataframe containing merged video data and search labels

In [9]:
# Attach the labels to each encoded transcript (only files present in both
# tables are kept)
merged_df = data_encoded_sbert.merge(
    labels,
    how='inner',
    left_on='filename',
    right_on='File',
)

# Discard columns whose name starts with 'Unnamed' or ends with '_text'
unwanted_columns = (
    merged_df.columns.str.startswith('Unnamed')
    | merged_df.columns.str.endswith('_text')
)
merged_df = merged_df.loc[:, ~unwanted_columns]

### **S3**: Define reference sentences for cosine similarity features

In [12]:
# Reference sentences provided by the toolkit; each one is passed as the
# target sentence to find_closest_sentences_sbert in the next cell.
all_sentences = toolkit.related_sentences

In [None]:
# Load Embedding Model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Run custom model to classify transcripts
#
# Each call is fed the dataframe returned by the previous call, so the
# per-sentence features accumulate whether the helper returns a new
# dataframe or mutates its input in place. Starting from merged_df also
# keeps data_final defined when there are no reference sentences.
data_final = merged_df
for target_sentence in tqdm(all_sentences, desc="Processing sentences"):
    data_final = toolkit.find_closest_sentences_sbert(
        data_final, 'transcript', 'embeddings', target_sentence, model
    )

### **S4**: Engineer additional features not based on cosine similarity

In [None]:
# Run the toolkit's counter on every row, then store its two outputs as the
# 'num_questions' and 'num_sentences' columns
question_sentence_counts = data_final.apply(
    toolkit.count_questions_and_sentences,
    axis=1,
)
data_final[['num_questions', 'num_sentences']] = question_sentence_counts

In [None]:
# Calculate aggregated embeddings
#
# Each feature is computed per row with Series.apply and assigned as a whole
# column. This avoids chained assignment (df[col][i] = ...), which triggers
# SettingWithCopyWarning and does not write through under pandas
# Copy-on-Write, and it does not depend on the index being 0..n-1.
#
# NOTE(review): np.mean / np.sum are called without an axis, so each
# aggregate is a single scalar per transcript, and [-5:] / [:5] slice along
# the first axis of the stored embedding - confirm this is intended.
embeddings = data_final['embeddings']
data_final['mean_embedding'] = embeddings.apply(lambda emb: np.mean(emb))
data_final['sum_embedding'] = embeddings.apply(lambda emb: np.sum(emb))
data_final['mean_embeddings_final_5'] = embeddings.apply(lambda emb: np.mean(emb[-5:]))
data_final['mean_embeddings_first_5'] = embeddings.apply(lambda emb: np.mean(emb[:5]))

In [None]:
# Keywords whose occurrences are counted in each transcript (the toolkit
# helper used here is the case-insensitive variant)
keywords = [
    'confiscated',
    'confiscate',
    'search',
    'marijuana',
    'consent',
    'weed',
    'look',
    'open',
    'trunk',
]
data_final = toolkit.count_keywords_in_transcripts_case_insensitive(
    data_final,
    keywords,
)

### **S5**: Train model

In [19]:
# Treat missing values as 0 (this modifies data_final in place)
data_final.fillna(0, inplace=True)

# Target: the 'Search?' label
y = data_final['Search?']

# Features: everything except the label, the raw text/embedding columns,
# the file identifiers and any '*_similarity_text' columns. 'filepath' is
# kept through the split and removed afterwards.
non_feature_columns = ['Search?', 'transcript', 'embeddings', 'File', 'filename', 'sentences']
X = data_final.drop(columns=non_feature_columns)
similarity_text_columns = [name for name in X.columns if name.endswith('_similarity_text')]
X = X.drop(columns=similarity_text_columns)

# 75/25 split with a fixed seed
X_train_files, X_test_files, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=13,
)

# Model inputs without the filepath column
X_train = X_train_files.drop(columns=['filepath'])
X_test = X_test_files.drop(columns=['filepath'])

In [20]:
# Standardise the features. The scaler is fitted on the training split only
# and then applied to both splits.
scaler = StandardScaler().fit(X_train)
x_train_sc_array = scaler.transform(X_train)
x_test_sc_array = scaler.transform(X_test)

# Wrap the scaled arrays in DataFrames so the feature names are retained.
# Both frames get a fresh 0..n-1 index.
feature_names = X_train.columns
x_train_sc = pd.DataFrame(x_train_sc_array, columns=feature_names)
x_test_sc = pd.DataFrame(x_test_sc_array, columns=feature_names)

In [21]:
# Drop raw text from merged df data.
# errors='ignore' keeps this cell re-runnable: on a second run the column is
# already gone and a plain drop would raise KeyError.
merged_df = merged_df.drop(columns='transcript', errors='ignore')

# Reset y_train index so it lines up positionally with x_train_sc, which was
# rebuilt with a fresh 0..n-1 index after scaling.
y_train = y_train.reset_index(drop=True)

In [None]:
# Fit the toolkit's logistic-regression search on the scaled training data.
# Later cells read the 'feature_coefficients' and 'best_alpha' entries of
# the result.
best_model_info_L1 = toolkit.fit_logistic_and_find_best_score(x_train_sc, y_train)
# Display the result as the cell output
best_model_info_L1

In [23]:
# Names of the features whose fitted coefficient is not zero. Each entry of
# 'feature_coefficients' is indexed as (name, coefficient).
non_zero_coefficients = [
    entry[0]
    for entry in best_model_info_L1['feature_coefficients']
    if entry[1] != 0
]

In [25]:
# Restrict the unscaled training features to the selected columns
X_train_filtered = X_train.loc[:, non_zero_coefficients]

# Apply the same column selection to the scaled train and test frames
x_train_sc_filtered = x_train_sc.loc[:, non_zero_coefficients]
x_test_sc_filtered = x_test_sc.loc[:, non_zero_coefficients]

In [26]:
# Re-run the logistic-regression search on the selected features only;
# this overwrites the result of the first fit.
best_model_info_L1 = toolkit.fit_logistic_and_find_best_score(x_train_sc_filtered, y_train)

In [None]:
# Fit the final L1-penalised logistic regression. scikit-learn's C is the
# inverse of the regularisation strength, hence C = 1 / best_alpha.
# Note: this rebinds `model`, which previously held the sentence-embedding
# model.
best_alpha = best_model_info_L1['best_alpha']
model = LogisticRegression(penalty='l1', C=1/best_alpha, solver='liblinear', random_state=25, class_weight='balanced')
model.fit(x_train_sc_filtered, y_train)

# Get probability estimates for the training data
probabilities = model.predict_proba(x_train_sc_filtered)

# Apply the custom threshold of 0.4 to the positive class's probability estimates
custom_threshold = 0.4
predictions_custom_threshold = (probabilities[:, 1] > custom_threshold).astype(int)

# Evaluate on the training data. These are stored under train_* names so
# they cannot be mistaken for the held-out test metrics.
train_accuracy_custom = accuracy_score(y_train, predictions_custom_threshold)
train_precision_custom = precision_score(y_train, predictions_custom_threshold, zero_division=0)
train_recall_custom = recall_score(y_train, predictions_custom_threshold, zero_division=0)

# Print the evaluation metrics
print(f'Train Accuracy with custom threshold: {train_accuracy_custom}')
print(f'Train Precision with custom threshold: {train_precision_custom}')
print(f'Train Recall with custom threshold: {train_recall_custom}')

### **S6**: After selecting final model, run on unseen test set

In [None]:
# Decision threshold applied to the positive-class probability
custom_threshold = 0.4

# Class probabilities for the held-out test split
probabilities = model.predict_proba(x_test_sc_filtered)

# Predict 1 when the positive-class probability is strictly above the threshold
positive_class_probability = probabilities[:, 1]
predictions_custom_threshold = (positive_class_probability > custom_threshold).astype(int)

# Score the thresholded predictions against the true test labels
test_accuracy_custom = accuracy_score(y_test, predictions_custom_threshold)
test_precision_custom = precision_score(y_test, predictions_custom_threshold, zero_division=0)
test_recall_custom = recall_score(y_test, predictions_custom_threshold, zero_division=0)

# Report the test metrics
print(f'Test Accuracy with custom threshold: {test_accuracy_custom}')
print(f'Test Precision with custom threshold: {test_precision_custom}')
print(f'Test Recall with custom threshold: {test_recall_custom}')


### **S7**: Export Trained Model to Pickle!

In [47]:
# Serialise the fitted classifier to a pickle file. The filename is
# relative, so it is written to the current working directory.
with open('search_classifier_09_01.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)