# Import Packages

In [4]:
import ast
import numpy as np
import os
import pandas as pd
import re
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, make_scorer, precision_score, recall_score
from sklearn.model_selection import KFold, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
import torch
from tqdm import tqdm

# Define functions to encode sentences & read in data

In [6]:
def encode_sentences_sbert(df, column_name):
    # Initialize SBERT model
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Initialize a column for document-level embeddings
    df['embeddings'] = None

    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        # Ensure the row value is a string before splitting
        text = str(row[column_name])
        
        # Split sentences in the document by both '.' and '?'
        sentences = re.split(r'[.?]+', text)
        
        # Filter out empty sentences after split
        sentences = [sentence.strip() for sentence in sentences if sentence.strip()]

        # Generate embeddings for all sentences in the document
        doc_embeddings = model.encode(sentences)

        # Store the list of embeddings for the document
        df.at[index, 'embeddings'] = [embedding.tolist() for embedding in doc_embeddings]

    return df


# Save transcript text to a dataframe, clean, and apply embedding function

In [7]:
# Let's first create a simple df with filenames and associated text

def collect_txt_files_data(directory_path):
    filepaths = []
    contents = []
    
    for root, dirs, files in os.walk(directory_path):
        for file in files:
            if file.endswith('.txt'):
                filepath = os.path.join(root, file)
                filepaths.append(filepath)
                
                with open(filepath, 'r', encoding='utf-8') as f:
                    contents.append(f.read())
    
    return filepaths, contents


def save_data_to_csv(filepaths, contents, output_path):
    df = pd.DataFrame({
        'filepath': filepaths,
        'transcript': contents
    })
    
    df.to_csv(output_path, index=False)


In [33]:
# Save all transcripts to one csv
directory_path = # ADD TRANSCRIPT INPUT
output_path_csv = # ADD DESIRED CSV OUTPUT

filepaths, contents = collect_txt_files_data(directory_path)
save_data_to_csv(filepaths, contents, output_path_csv)

data = pd.read_csv(output_path_csv)

labels = pd.read_csv(# ADD MANUAL SEARCH/ NO SEARCH LABELS HERE)

In [27]:
# Remove any timestamps from transcripts

def clean_transcript(transcript):
    """
    Removes timestamps and unnecessary new lines from the transcript.
    
    Parameters:
    transcript (str): The transcript text.
    
    Returns:
    str: Cleaned transcript without timestamps and unnecessary new lines.
    """
    # Check if the input is a string
    if not isinstance(transcript, str):
        return transcript
    
    # Regular expression to match timestamps in the format 'number - number'
    timestamp_pattern = r'\d+\s*-\s*\d+'

    # Remove timestamps
    cleaned_transcript = re.sub(timestamp_pattern, '', transcript)
    
    # Remove unnecessary new lines and extra spaces
    cleaned_transcript = re.sub(r'\s*\n\s*', ' ', cleaned_transcript).strip()
    
    return cleaned_transcript


# Clean the transcripts
data['transcript'] = data['transcript'].apply(clean_transcript)


In [None]:
# great! now lets encode our all sentences in our fancy new transcript dataframe
data_encoded_sbert= encode_sentences_sbert(data, 'transcript')

# Find closest cosine distance for each transcript given a target phrase

In [38]:
def find_closest_sentences_sbert(df, transcript_column, embeddings_column, target_sentence, model):
    # Format the target sentence to create a valid column name
    target_phrase_column = "_".join(target_sentence.split()) + "_cosine_distance"
    
    # Use the provided model to encode the target sentence
    target_embedding = encode_texts_sbert([target_sentence], model)[0]

    # Initialize columns for the cosine distance and the closest sentence text
    df[target_phrase_column] = np.nan
    text_column = target_phrase_column + '_text'
    df[text_column] = None

    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        sentence_embeddings = row[embeddings_column]
        sentences = re.split(r'[.?\n]+', row[transcript_column])
        sentences = [sentence.strip() for sentence in sentences if sentence.strip()]

        if not sentences:  # Skip if there are no sentences
            continue

        # Calculate cosine similarity between the target and all sentence embeddings
        similarities = [np.dot(target_embedding, embedding) / (np.linalg.norm(target_embedding) * np.linalg.norm(embedding)) for embedding in sentence_embeddings]
        
        # Find the index of the highest similarity score
        max_similarity_index = np.argmax(similarities)
        
        # Update the dataframe with the closest sentence and its similarity score
        df.at[index, target_phrase_column] = similarities[max_similarity_index]
        df.at[index, text_column] = sentences[max_similarity_index]

    return df

In [39]:
# Clean transcription and embedding columns
data_final = data_encoded_sbert
data_encoded_sbert['transcript'] = data_encoded_sbert['transcript'].apply(lambda x: str(x) if pd.notnull(x) else "")
data_encoded_sbert['embeddings'] = data_encoded_sbert['embeddings'].apply(lambda x: np.array(ast.literal_eval(x)) if isinstance(x, str) else x)

# Initialize model outside of any functions
model = SentenceTransformer('all-MiniLM-L6-v2')  

# Experiment with feature importance (linear model)

### Heres the general idea for the following feature importance process: We will make a very large list containing any possible sentences whose semantic meaning may increase probability of a search occuring (ie our candidate sentences). We will then calculate cosine similarity between each candidate sentence and the closest matching sentence in every transcript. 

### Finally, utilizing these highest cosine similarity values we will run 50,000 logistic regressions with L1 regularization and different train/test splits for each iteration. Through this process, we will keep track of how many times each target phrase's cosine similarity score (or other extracted features) are significantly predictive and then return featues that are significantly predictive for the most train/test splits.

### List all potential predictive sentences:

In [40]:
transcript_sentences = [
    "Can you pop the thing so I can look at that sticker just to make sure that matches the paperwork?",
    "So you understand that I'm asking to search the vehicle.",
    "Is it okay if I look?",
    "Can I look at those bags in the back?",
    "Can you open up the back for me?",
    "Go to search the vehicle.",
    "And he's got marijuana.",
    "Did you search him?",
    "Use your body cam to take pictures.",
    "So let me finish searching the front",
    "I just wanted to finish getting this search.",
    "Ten six searched.",
    "I just wanted to finish getting this search.",
    "One set of plates, three fixed blade knives, one pair of bolt cutters.",
    "Alright, well, I'm going to take them and return to DMV because I'm assuming they're not registered to you.",
    "So what's on the property receipt?",
    "So apparently the window doesn't roll up all the way?",
    "I took my gloves off, man.",
    "I'm not going digging back in your center console again.",
    "All right, anything sharp, any needles, anything like that?",
    "You widen your stance a little bit.",
    "Anything in your boots?",
    "Any needles?",
    "Nothing tucked in your belt?",
    "We found a bottle of pills in the glove box.",
    "Step out for me.",
    "Anything on you?",
    "I'm going to search you real quick, okay?",
    "Hands on the hood.",
    "No needles?",
    "You can smell it from here.",
    "They stopped a guy because this rig just reeked of marijuana.",
    "Nothing sharp or anything in your pockets?",
    "You mind if we go in there and get whatever you're talking about?",
    "When you get out, I'll pat you down",
    "Were you drinking already, or was that already in here?",
    "What's in the backseat of my car?",
    "So we're going to take you back to my car, make sure you don't have anything on you you're not supposed to have, okay?",
    "Is there anything in the car that we need to be worried about?",
    "Any marijuana or anything?",
    "Guns, drugs, bazookas, bombs?",
    "What's in this pocket?",
    "I'm just going to detain you real quick, alright?",
    "Anything illegal in the car?",
    "To search it.",
    "And do you got anything that's going to poke me, make me bleed?",
    "Did you just recently use some math or anything like that?",
    "Did you just recently use some meth or anything like that?",
    "I  might have a knife in my left pocket",
    "In your left pocket?",
    "So you got anything illegal on your person?",
    "Something in the backpack",
    "So where's the knife located?",
    "Search the meth pipe, lesion, straw.",
    "Can you open your driver door for me so I can take a look at the door tag?",
    "Any marijuana or anything in here?",
    "Get your fucking hands out the window.",
    "Keep your hands up.",
    "Code for one in custody.",
    "You got search instrument to arrest me.",
    "Go glove up.",
    "If you want to get the front, I'll get the back.",
    "Watch out for sharps.",
    "Was that a weed pipe or a different pipe there at the floorboard?",
    "This here just weed.",
    "Yeah, the screwdriver and one blade.",
    "Under the center console.",
    "If you want to hop out, we'll do a quick patch, make sure you don't have anything you're not supposed to have, and we'll go from there.",
    "If you want to hop out, we'll do a quick pat, make sure you don't have anything you're not supposed to have, and we'll go from there.",
    "Anything illegal?",
    "I got pot in the car, but that's it.",
    "Heroin, needles?",
    "Any guns in the car?",
    "Put your hands on the car there"
]

generated_sentences = [
    "Mind if I check the trunk for anything unusual?",
    "I need to verify the VIN number against your registration.",
    "Could you step to the side while we conduct the search?",
    "We're looking for any illegal substances or items.",
    "Please remain calm while we complete our inspection.",
    "Do you have any firearms in the vehicle?",
    "I noticed your taillight is out; I'll need to take a closer look.",
    "We received a report of suspicious activity in this area.",
    "I'm going to run your plates through the system.",
    "Have you been involved in any recent criminal activity?",
    "Please provide your driver's license and registration.",
    "We're conducting random security checks today.",
    "Do you consent to a search of your vehicle?",
    "I'm detecting the odor of illegal substances.",
    "We found a suspicious package under the seat.",
    "You're not carrying any stolen goods, are you?",
    "Have there been any alterations to your vehicle?",
    "We'll need to take a closer look at your documents.",
    "Is there a reason your vehicle smells like alcohol?",
    "You seem nervous; is there anything you'd like to tell me?",
    "Are these items yours or do they belong to someone else?",
    "We're going to need to detain this item for further investigation.",
    "Do you have anything in your pockets that I should know about?",
    "We're checking vehicles for safety compliance.",
    "Your vehicle matches the description of one reported stolen.",
    "I'm going to need backup to conduct a thorough search.",
    "Have you given anyone else permission to use your vehicle?",
    "There's been a report of illegal activity in this make and model.",
    "We need to verify the ownership of this vehicle.",
    "Your vehicle was seen leaving the scene of a crime.",
    "I'll need to document everything in your vehicle.",
    "Is there a legal reason you have this equipment?",
    "We're investigating a series of incidents in this neighborhood.",
    "Your cooperation is appreciated during this process.",
    "I'm going to check the vehicle's undercarriage.",
    "Are you aware it's illegal to transport these items?",
    "We'll need to test this substance for narcotics.",
    "Do you have any proof of purchase for these items?",
    "I'm going to need to see inside your glove compartment.",
    "You're required by law to comply with this search.",
    "Please explain why you have this amount of cash.",
    "We're conducting checks for national security reasons.",
    "Your vehicle has been identified in a recent investigation.",
    "I'll be recording this interaction for our records.",
    "Do you have any objection to me looking in the backseat?",
    "We need to ensure there are no contraband or weapons.",
    "Please step back while I inspect the exterior.",
    "Are these substances prescribed to you?",
    "I'll need to verify these serial numbers.",
    "Your license plate came back with several alerts.",
    "I'm required to inform you of your rights before the search.",
    "We've had reports of trafficking in this area.",
    "I'm checking for any modifications to your vehicle.",
    "This is a routine check for DUI enforcement.",
    "Your vehicle's description matches a recent alert.",
    "We're looking for a missing person; have you seen anyone suspicious?",
    "I'll need to take this for further examination.",
    "You have the right to refuse, but that may raise suspicion.",
    "I'll be checking for any hidden compartments.",
    "This area is known for drug smuggling.",
    "We need to clear your vehicle before you proceed.",
    "I'm going to run a check on these items.",
    "Please remain here while I call for a K-9 unit.",
    "Your cooperation can significantly speed up this process.",
    "We're conducting a safety inspection on all vehicles in this area."
]

generated_sentences += [
    "Please keep your hands where I can see them while I inspect the vehicle.",
    "I'm checking for any objects that might be considered a threat.",
    "This search is for our safety and yours.",
    "We're almost done here, just a few more areas to check.",
    "I appreciate your patience during this process.",
    "Everything seems in order, but I need to check one last thing.",
    "Your cooperation is making this much easier, thank you.",
    "I'm looking for anything that might be hidden out of plain sight.",
    "This is standard procedure, we do this for all traffic stops in this area.",
    "I'll need to look under the seats, can you please step out?",
    "Do you have a spare key? I need to open the trunk.",
    "It's policy to check all compartments for any illegal items.",
    "Have you had any issues with the vehicle's locking mechanisms?",
    "I'm going to use a flashlight to look into darker areas of the car.",
    "I noticed some irregularities with your vehicle's documentation.",
    "Your vehicle fits the description of one involved in recent incidents.",
    "We're almost finished; just need to verify a few more details.",
    "For documentation purposes, I need to take a few photos.",
    "Do you have any luggage or other large items in the vehicle?",
    "I need to confirm the serial numbers on some of your belongings.",
    "We're looking for specific items related to our current investigation.",
    "Your vehicle's registration number has been flagged for a routine check.",
    "I'm going to call in to confirm some details about your vehicle.",
    "Please provide the insurance information for your vehicle.",
    "Have you recently purchased anything of high value?",
    "I'm required to check the vehicle identification number directly.",
    "Your vehicle has been marked for a random safety inspection.",
    "I'll need to remove some of the items to get a better look.",
    "Can you explain why there's a discrepancy with your vehicle's records?",
    "We're conducting a thorough investigation, and your vehicle is part of it.",
    "There's been an alert for vehicles of this make and model.",
    "I'll need to cross-reference the VIN with the national database.",
    "Please detail the contents of any containers or packages in the car.",
    "We're ensuring that there are no hazards within the vehicle.",
    "I'll need to inspect any electronic devices found within the vehicle.",
    "Your vehicle's color and make match a description we've been given.",
    "For the next part of the inspection, I may need some additional tools.",
    "We're verifying the ownership of all high-value items in the vehicle.",
    "I'm checking for compliance with the latest safety regulations.",
    "Your patience is appreciated while we ensure everything is in order.",
    "The vehicle's condition suggests it might have been used for specific activities.",
    "We're collecting evidence as part of a larger investigation.",
    "I'll be looking for any modifications that might not comply with regulations.",
    "This inspection helps us ensure that all vehicles are safe and legal.",
    "Please confirm whether you've given anyone else access to your vehicle recently.",
    "We're nearly through; just a few more checks to complete.",
    "Your vehicle's type has been associated with certain risks, requiring a detailed check.",
    "I'll need to consult with my supervisor on a few details about this search.",
    "We're working to prevent and deter illegal activities in this area.",
    "This procedure is part of our commitment to public safety.",
    "I'm finalizing the report on this search; thank you for your cooperation.",
]

irrelevant_sentences = [
    'Fold in the cream cheese',
    "I'm hungry",
    "Back off! Im not joking around.",
    "Let's go buy a backpack"
]

# Combining original and additional sentences into one list
all_sentences = transcript_sentences + generated_sentences + irrelevant_sentences


### Feature Importance Process:

In [None]:
# Create embeddings for all candidate sentences
for sentence in all_sentences:
    target_sentence = sentence
    data_final = find_closest_sentences_sbert(data_encoded_sbert, 'transcript','embeddings', target_sentence, model)

In [42]:
# Create one dataframe with video info AND labels
def extract_filename(filepath):
    """
    Function to extract the string after the last '/' in a sequence.
    """
    return filepath.split('/')[-1]

# Apply the function to the 'filepath' column of data_final
data_final['filepath'] = data_final['filepath'].apply(extract_filename)

# Apply the function to the 'File' column of labels
labels['File'] = labels['File'].apply(extract_filename)

merged_df = data_final.merge(labels, left_on='filepath', right_on='File', how='inner')

In [43]:
# Clean merged_df
merged_df = merged_df.drop('Dylan Confirmation Needed', axis = 1)
merged_df = merged_df.loc[:, ~merged_df.columns.str.startswith('Unnamed')]
merged_df = merged_df.loc[:, ~merged_df.columns.str.endswith('_text')]

### Extract additional features

In [44]:
# create feature for number of question marks
def count_questions_and_sentences(row):
    # Counting the number of question marks
    num_questions = row['transcript'].count('?')
    
    # Counting the number of sentences. Assuming sentences end with '.', '!', or '?'
    num_sentences = sum(row['transcript'].count(marker) for marker in ['.', '!', '?'])
    
    return pd.Series([num_questions, num_sentences], index=['num_questions', 'num_sentences'])

# Apply the function along the rows (axis=1) and create new columns in the DataFrame
merged_df[['num_questions', 'num_sentences']] = merged_df.apply(count_questions_and_sentences, axis=1)


In [None]:
# Calculate aggregated embeddings

merged_df['mean_embedding'] = ''
merged_df['sum_embedding'] = ''
merged_df['mean_embeddings_final_5'] = ''
merged_df['mean_embeddings_first_5'] = ''
for i in range(len(merged_df)):
    merged_df['mean_embedding'][i] = np.mean(merged_df['embeddings'][i])
    merged_df['sum_embedding'][i] = np.sum(merged_df['embeddings'][i])
    merged_df['mean_embeddings_final_5'][i] = np.mean(merged_df['embeddings'][i][-5:])
    merged_df['mean_embeddings_first_5'][i] = np.mean(merged_df['embeddings'][i][:5])

In [46]:
# Add counts for keywords

def count_keywords_in_transcripts_case_insensitive(dataframe, keywords):
    # Adjusting the count_keywords function to correctly handle case-insensitive search
    def count_keywords_case_insensitive(text, keywords):
        text = text.lower()  # Convert text to lowercase
        return {keyword: text.count(keyword.lower()) for keyword in keywords}

    # Apply the counting function to the 'transcript' column and separate the counts into new columns
    for keyword in keywords:
        dataframe[keyword + '_count'] = dataframe['transcript'].apply(lambda x: count_keywords_case_insensitive(x, [keyword])[keyword])

    return dataframe

# Applying the corrected function
keywords = ['confiscated', 'confiscate', 'search', 'marijuana', 'consent', 'weed', 'look', 'open', 'trunk']
merged_df = count_keywords_in_transcripts_case_insensitive(merged_df, keywords)


In [47]:
data = merged_df.copy()

data.fillna(0, inplace=True)

X = data.drop(['Search?', 'transcript', 'embeddings', 'File'], axis = 1)

y = data['Search?']

X_train_files, X_test_files, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state =13)

X_train = X_train_files.drop('filepath', axis = 1)
X_test = X_test_files.drop('filepath', axis = 1)

In [48]:
# first let's rescale our data
scaler = StandardScaler()

# Fit the scaler to the data and transform it
x_train_sc_array = scaler.fit_transform(X_train)
x_test_sc_array = scaler.transform(X_test)

# Convert the scaled array back to a DataFrame
x_train_sc = pd.DataFrame(x_train_sc_array, columns=X_train.columns)
x_test_sc = pd.DataFrame(x_test_sc_array, columns=X_test.columns)

In [49]:
merged_df = merged_df.drop('transcript', axis = 1)

In [90]:
# Let's write a loop that selects only features that are most frequently relevant across different train/test splits
relevant_features = []
for _ in range(50000):
    # Train test split
    data = merged_df.copy()
    data.fillna(0, inplace=True)
    X = data.drop(['Search?', 'filepath', 'File', 'embeddings'], axis = 1)
    y = data['Search?']
    X_train_loop, X_test_loop, y_train_loop, y_test_loop = train_test_split(X, y, test_size=0.25) # don't add a random state here!

    # Scale
    scaler = StandardScaler()
    x_train_sc_array = scaler.fit_transform(X_train_loop)
    x_train_sc_loop = pd.DataFrame(x_train_sc_array, columns=X_train_loop.columns)

    # Fit the logistic regression model with L1 regularization
    model = LogisticRegression(penalty='l1', solver='liblinear')  # 'liblinear' solver supports L1 penalty
    model.fit(x_train_sc_loop, y_train_loop)

    # Get indices of non-zero coefficients
    non_zero_indices = [i for i, coef in enumerate(model.coef_.flatten()) if coef != 0]

    # Map indices to column names
    selected_feature_names = x_train_sc_loop.columns[non_zero_indices]

    temporary_list = selected_feature_names.tolist()

    relevant_features += temporary_list

In [91]:
x = pd.DataFrame(relevant_features).value_counts()[:10] # pick top ten phrases
x = x.reset_index()
significant_features = list(x[0])
significant_features

["You_mind_if_we_go_in_there_and_get_whatever_you're_talking_about?_cosine_distance",
 'This_is_standard_procedure,_we_do_this_for_all_traffic_stops_in_this_area._cosine_distance',
 'trunk_count',
 'Can_you_pop_the_thing_so_I_can_look_at_that_sticker_just_to_make_sure_that_matches_the_paperwork?_cosine_distance',
 'We_found_a_bottle_of_pills_in_the_glove_box._cosine_distance',
 "When_you_get_out,_I'll_pat_you_down_cosine_distance",
 'open_count',
 'Go_glove_up._cosine_distance',
 'Can_you_open_your_driver_door_for_me_so_I_can_take_a_look_at_the_door_tag?_cosine_distance',
 'Did_you_search_him?_cosine_distance']

In [92]:
# Filter the original X_train DataFrame to keep only significant features
X_train_filtered = X_train[significant_features]

# Filter the scaled x_train_sc DataFrame similarly
x_train_sc_filtered = x_train_sc[significant_features]

x_test_sc_filtered = x_test_sc[significant_features]

# After determining most predictive features, train a logistic regression classifier

In [93]:
def fit_logistic_and_find_best_score(X_train: pd.DataFrame, 
                                     y_train,
                                     alphas: np.array = [1e-3, 1e-3, 1e-2, 1e-1, 1, 5, 10],
                                     n_splits: int = 10,
                                     random_state: int = 13) -> dict:
    best_alpha = alphas[0]
    best_recall = 0  # Focus on recall for optimization
    best_accuracy = 0
    best_precision = 0
    best_coefficients = None

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    for alpha in alphas:
        fold_accuracies = []
        fold_precisions = []
        fold_recalls = []
        
        for train_index, val_index in kf.split(X_train):
            X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
            y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

            # Scale for each fold
            scaler = StandardScaler()
            X_train_fold_norm = scaler.fit_transform(X_train_fold)
            X_val_fold_norm = scaler.transform(X_val_fold)
            
            # Logistic Regression with L1 penalty
            # Define model (be sure to balance imbalanced data)
            model = LogisticRegression(penalty='l1', C=1/alpha, solver='liblinear', random_state=random_state, class_weight='balanced', max_iter=1000)
            model.fit(X_train_fold_norm, y_train_fold)
            predictions = model.predict(X_val_fold_norm)
            
            # Calculate and store each metric
            fold_accuracies.append(accuracy_score(y_val_fold, predictions))
            fold_precisions.append(precision_score(y_val_fold, predictions, zero_division=0))
            fold_recalls.append(recall_score(y_val_fold, predictions, zero_division=1))


        # Compute mean of each metric across folds
        mean_accuracy = np.mean(fold_accuracies)
        mean_precision = np.mean(fold_precisions)
        mean_recall = np.mean(fold_recalls)

        # Update best scores and alpha if current mean recall is higher
        if mean_recall > best_recall:
            best_recall = mean_recall
            best_accuracy = mean_accuracy
            best_precision = mean_precision
            best_alpha = alpha
            best_coefficients = model.coef_[0]

    # Combine feature names with coefficients
    feature_coeff_tuples = list(zip(X_train.columns, best_coefficients))

    # Return a dictionary of best scores, alpha, and coefficients
    return {
        'best_alpha': best_alpha,
        'best_accuracy': best_accuracy,
        'best_precision': best_precision,
        'best_recall': best_recall,
        'feature_coefficients': feature_coeff_tuples
    }


In [94]:
y_train.reset_index(drop = True, inplace = True)
best_model_info_L1 = fit_logistic_and_find_best_score(x_train_sc_filtered, y_train)
best_model_info_L1

{'best_alpha': 1,
 'best_accuracy': 0.9294117647058823,
 'best_precision': 0.55,
 'best_recall': 0.925,
 'feature_coefficients': [("You_mind_if_we_go_in_there_and_get_whatever_you're_talking_about?_cosine_distance",
   0.39437145893747116),
  ('This_is_standard_procedure,_we_do_this_for_all_traffic_stops_in_this_area._cosine_distance',
   -2.6541080301863835),
  ('trunk_count', 0.3633987877160385),
  ('Can_you_pop_the_thing_so_I_can_look_at_that_sticker_just_to_make_sure_that_matches_the_paperwork?_cosine_distance',
   1.1002492037698932),
  ('We_found_a_bottle_of_pills_in_the_glove_box._cosine_distance',
   1.1656974866001961),
  ("When_you_get_out,_I'll_pat_you_down_cosine_distance", 1.0476811063132376),
  ('open_count', -0.39963372864018526),
  ('Go_glove_up._cosine_distance', 1.6986242739894177),
  ('Can_you_open_your_driver_door_for_me_so_I_can_take_a_look_at_the_door_tag?_cosine_distance',
   0.9038448003798306),
  ('Did_you_search_him?_cosine_distance', 0.4483260856268138)]}

In [95]:
non_zero_coefficients = []
for element in best_model_info_L1['feature_coefficients']:
    if element[1] != 0:
        non_zero_coefficients.append(element[0])

In [96]:
non_zero_coefficients

["You_mind_if_we_go_in_there_and_get_whatever_you're_talking_about?_cosine_distance",
 'This_is_standard_procedure,_we_do_this_for_all_traffic_stops_in_this_area._cosine_distance',
 'trunk_count',
 'Can_you_pop_the_thing_so_I_can_look_at_that_sticker_just_to_make_sure_that_matches_the_paperwork?_cosine_distance',
 'We_found_a_bottle_of_pills_in_the_glove_box._cosine_distance',
 "When_you_get_out,_I'll_pat_you_down_cosine_distance",
 'open_count',
 'Go_glove_up._cosine_distance',
 'Can_you_open_your_driver_door_for_me_so_I_can_take_a_look_at_the_door_tag?_cosine_distance',
 'Did_you_search_him?_cosine_distance']

In [97]:
# Filter the original X_train DataFrame to keep only significant features
X_train_filtered = X_train[non_zero_coefficients]

# Filter the scaled x_train_sc DataFrame similarly
x_train_sc_filtered = x_train_sc[non_zero_coefficients]

x_test_sc_filtered = x_test_sc[non_zero_coefficients]

In [98]:
best_model_info_L1 = fit_logistic_and_find_best_score(x_train_sc_filtered, y_train)
best_model_info_L1

{'best_alpha': 1,
 'best_accuracy': 0.9294117647058823,
 'best_precision': 0.55,
 'best_recall': 0.925,
 'feature_coefficients': [("You_mind_if_we_go_in_there_and_get_whatever_you're_talking_about?_cosine_distance",
   0.39437145893747116),
  ('This_is_standard_procedure,_we_do_this_for_all_traffic_stops_in_this_area._cosine_distance',
   -2.6541080301863835),
  ('trunk_count', 0.3633987877160385),
  ('Can_you_pop_the_thing_so_I_can_look_at_that_sticker_just_to_make_sure_that_matches_the_paperwork?_cosine_distance',
   1.1002492037698932),
  ('We_found_a_bottle_of_pills_in_the_glove_box._cosine_distance',
   1.1656974866001961),
  ("When_you_get_out,_I'll_pat_you_down_cosine_distance", 1.0476811063132376),
  ('open_count', -0.39963372864018526),
  ('Go_glove_up._cosine_distance', 1.6986242739894177),
  ('Can_you_open_your_driver_door_for_me_so_I_can_take_a_look_at_the_door_tag?_cosine_distance',
   0.9038448003798306),
  ('Did_you_search_him?_cosine_distance', 0.4483260856268138)]}

In [101]:
best_alpha = best_model_info_L1['best_alpha']
model = LogisticRegression(penalty='l1', C=1/best_alpha, solver='liblinear', random_state=32, class_weight='balanced')
model.fit(x_train_sc_filtered, y_train)

# Get probability estimates for the test data
probabilities = model.predict_proba(x_train_sc_filtered)

# Apply the custom threshold of 0.2 to the positive class's probability estimates
custom_threshold = 0.3
predictions_custom_threshold = (probabilities[:, 1] > custom_threshold).astype(int)

# Now you can evaluate your model using these custom predictions
test_accuracy_custom = accuracy_score(y_train, predictions_custom_threshold)
test_precision_custom = precision_score(y_train, predictions_custom_threshold, zero_division=0)
test_recall_custom = recall_score(y_train, predictions_custom_threshold, zero_division=0)

# Print the evaluation metrics
print(f'Train Accuracy with custom threshold: {test_accuracy_custom}')
print(f'Train Precision with custom threshold: {test_precision_custom}')
print(f'Train Recall with custom threshold: {test_recall_custom}')

Train Accuracy with custom threshold: 0.9053254437869822
Train Precision with custom threshold: 0.5
Train Recall with custom threshold: 1.0


# Finally, run trained model on unseen test data

In [102]:
# Get probability estimates for the test data
probabilities = model.predict_proba(x_test_sc_filtered)

# Apply the custom threshold of 0.4 to the positive class's probability estimates
custom_threshold = 0.3
predictions_custom_threshold = (probabilities[:, 1] > custom_threshold).astype(int)

# Now you can evaluate your model using these custom predictions
test_accuracy_custom = accuracy_score(y_test, predictions_custom_threshold)
test_precision_custom = precision_score(y_test, predictions_custom_threshold, zero_division=0)
test_recall_custom = recall_score(y_test, predictions_custom_threshold, zero_division=0)

# Print the evaluation metrics
print(f'Test Accuracy with custom threshold: {test_accuracy_custom}')
print(f'Test Precision with custom threshold: {test_precision_custom}')
print(f'Test Recall with custom threshold: {test_recall_custom}')


Test Accuracy with custom threshold: 0.8771929824561403
Test Precision with custom threshold: 0.45454545454545453
Test Recall with custom threshold: 0.8333333333333334
