In [1]:
import pandas as pd
import polars as pl
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from collections import defaultdict
from tqdm import tqdm
import pickle
from ethnicseer import EthnicClassifier

tqdm.pandas(desc="Processing")

In [2]:
indivs20 = "./data/CampaignFin20/indivs20.txt"
indivs22 = "./data/CampaignFin22/indivs22.txt"
names = "./data/USIN.csv"
model_path = "./models/logit_classifier.pkl"

In [3]:
def calculate_stats(results_test, results_pred, name, display_labels):
    cm = confusion_matrix(results_test, results_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=display_labels)
    disp.plot(cmap=plt.cm.Blues)
    plt.title(f'Confusion Matrix ({name})')
    plt.show()

    print(f"\n{name}")
    if len(display_labels) > 2:
        accuracy = accuracy_score(results_test, results_pred)
        precision = precision_score(results_test, results_pred, average='weighted')
        recall = recall_score(results_test, results_pred, average='weighted')
        f1 = f1_score(results_test, results_pred, average='weighted')
    else:
        accuracy = accuracy_score(results_test, results_pred)
        precision = precision_score(results_test, results_pred, pos_label='ind')
        recall = recall_score(results_test, results_pred, pos_label='ind')
        f1 = f1_score(results_test, results_pred, pos_label='ind')
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1:", f1)


In [4]:
def visualize_embeddings(X, y, names=None, n_samples=1000, method='tsne'):
    
    if len(X) > n_samples:
        idx = np.random.choice(len(X), n_samples, replace=False)
        X_sample = X[idx]
        y_sample = y[idx]
        if names is not None:
            names_sample = [names[i] for i in idx]
        else:
            names_sample = None
    else:
        X_sample = X
        y_sample = y
    
    if method == 'tsne':
        # Use t-SNE for dimensionality reduction
        reducer = TSNE(n_components=3, random_state=42)
    else:
        # Use PCA as fallback
        reducer = PCA(n_components=3)
    
    embeddings = reducer.fit_transform(X_sample)
    
    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(111, projection='3d')
    
    unique_classes = np.unique(y_sample)
    colors = sns.color_palette("husl", len(unique_classes))
    
    for i, cls in enumerate(unique_classes):
        mask = (y_sample == cls)
        ax.scatter(embeddings[mask, 0], embeddings[mask, 1], embeddings[mask, 2],
                   color=colors[i], label=cls, alpha=0.6)
    
    ax.set_title('3D Visualization of Name Embeddings')
    ax.legend()
    plt.show()
    
    
    plt.figure(figsize=(10, 8))
    for i, cls in enumerate(unique_classes):
        mask = (y_sample == cls)
        plt.scatter(embeddings[mask, 0], embeddings[mask, 1],
                    color=colors[i], label=cls, alpha=0.6)
    
    plt.title('2D Visualization of Name Embeddings')
    plt.legend()
    plt.show()
    
    
    if names_sample is not None:
        plt.figure(figsize=(12, 10))
        ax = plt.gca()
        
        # First plot the points
        for i, cls in enumerate(unique_classes):
            mask = (y_sample == cls)
            ax.scatter(embeddings[mask, 0], embeddings[mask, 1],
                        color=colors[i], label=cls, alpha=0.6)
        
        # Then add text labels
        for i, (x, y) in enumerate(zip(embeddings[:, 0], embeddings[:, 1])):
            ax.text(x, y, names_sample[i], fontsize=8, alpha=0.7,
                    bbox=dict(facecolor='white', alpha=0.5, edgecolor='none', pad=0.5))
        
        plt.title('2D Visualization with Name Labels')
        plt.legend()
        plt.show()

In [5]:
def preprocess_name(name):
    parts = name.strip().split()
    first_name = parts[0].lower() if len(parts) > 0 else ""
    last_name = parts[-1].lower() if len(parts) > 1 else ""
    
    f4_first = first_name[:4] if len(first_name) >= 4 else first_name
    l4_first = first_name[-4:] if len(first_name) >= 4 else first_name
    f4_last = last_name[:4] if len(last_name) >= 4 else last_name
    l4_last = last_name[-4:] if len(last_name) >= 4 else last_name
    
    n_sub_names = len(parts)
    has_dash = any('-' in part for part in parts)
    
    return {
        'first_name': first_name,
        'last_name': last_name,
        'f4_first': f4_first,
        'l4_first': l4_first,
        'f4_last': f4_last,
        'l4_last': l4_last,
        'n_sub_names': min(n_sub_names, 4),
        'has_dash': int(has_dash)
    }


In [6]:
def build_name_stats(df, name_col='name', ethnicity_col='ethnic'):
    first_name_stats = defaultdict(lambda: defaultdict(int))
    last_name_stats = defaultdict(lambda: defaultdict(int))
    f4_first_stats = defaultdict(lambda: defaultdict(int))
    l4_first_stats = defaultdict(lambda: defaultdict(int))
    f4_last_stats = defaultdict(lambda: defaultdict(int))
    l4_last_stats = defaultdict(lambda: defaultdict(int))
    
    for _, row in df.iterrows():
        name_info = preprocess_name(row[name_col])
        ethnicity = row[ethnicity_col]
        
        first_name_stats[name_info['first_name']][ethnicity] += 1
        last_name_stats[name_info['last_name']][ethnicity] += 1
        f4_first_stats[name_info['f4_first']][ethnicity] += 1
        l4_first_stats[name_info['l4_first']][ethnicity] += 1
        f4_last_stats[name_info['f4_last']][ethnicity] += 1
        l4_last_stats[name_info['l4_last']][ethnicity] += 1
    
    return {
        'first_name_stats': first_name_stats,
        'last_name_stats': last_name_stats,
        'f4_first_stats': f4_first_stats,
        'l4_first_stats': l4_first_stats,
        'f4_last_stats': f4_last_stats,
        'l4_last_stats': l4_last_stats
    }


In [7]:
def create_features(name, stats, cats=['ind', 'not']):
    name_info = preprocess_name(name)
    features = {}
    
    for eth in cats:
        fn_counts = stats['first_name_stats'][name_info['first_name']]
        total_fn = sum(fn_counts.values())
        features[f'probability_{eth}_first_name'] = fn_counts.get(eth, 0) / (total_fn + 1)
        
        ln_counts = stats['last_name_stats'][name_info['last_name']]
        total_ln = sum(ln_counts.values())
        features[f'probability_{eth}_last_name'] = ln_counts.get(eth, 0) / (total_ln + 1)
        
        f4f_counts = stats['f4_first_stats'][name_info['f4_first']]
        total_f4f = sum(f4f_counts.values())
        features[f'probability_{eth}_first_name_f4'] = f4f_counts.get(eth, 0) / (total_f4f + 1)
        
        l4f_counts = stats['l4_first_stats'][name_info['l4_first']]
        total_l4f = sum(l4f_counts.values())
        features[f'probability_{eth}_first_name_l4'] = l4f_counts.get(eth, 0) / (total_l4f + 1)
        
        f4l_counts = stats['f4_last_stats'][name_info['f4_last']]
        total_f4l = sum(f4l_counts.values())
        features[f'probability_{eth}_last_name_f4'] = f4l_counts.get(eth, 0) / (total_f4l + 1)
        
        l4l_counts = stats['l4_last_stats'][name_info['l4_last']]
        total_l4l = sum(l4l_counts.values())
        features[f'probability_{eth}_last_name_l4'] = l4l_counts.get(eth, 0) / (total_l4l + 1)
        
        features[f'best_evidence_{eth}'] = max(
            features[f'probability_{eth}_first_name'],
            features[f'probability_{eth}_last_name']
        )
    
    features['dash_indicator'] = name_info['has_dash']
    features['n_sub_names'] = name_info['n_sub_names']
    
    return features


In [8]:
def is_indistinguishable(name, stats, threshold=0.15):
    
    name_info = preprocess_name(name)
    features = create_features(name, stats)
    
    cats = sorted(stats['first_name_stats'][name_info['first_name']].keys())
    psi = {cat: features[f'probability_{cat}_first_name'] for cat in cats}
    phi = {cat: features[f'probability_{cat}_last_name'] for cat in cats}
    
    indistinguishable_pairs = []
    
    for i, r1 in enumerate(cats):
        for r2 in cats[i+1:]:
            
            condition1 = (abs(psi[r1] - psi[r2]) <= threshold and 
                         abs(phi[r1] - phi[r2]) <= threshold)
            
            max_psi = max(psi.values())
            max_phi = max(phi.values())
            condition2 = (max_psi - min(psi[r1], psi[r2]) <= threshold and 
                         max_phi - min(phi[r1], phi[r2]) <= threshold)
            
            if condition1 and condition2:
                indistinguishable_pairs.append(f"{r1}-{r2}")
    
    return indistinguishable_pairs if indistinguishable_pairs else None


In [9]:
def handle_indistinguishables(df, stats, name_col='name'):
    
    df['indistinguishable'] = None
    
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        name = row[name_col]
        indistinguishable = is_indistinguishable(name, stats)
        
        if indistinguishable:
            df.at[idx, 'indistinguishable'] = ','.join(indistinguishable)
    
    return df


In [10]:
def process_data(df, name_col='name', ethnicity_col='ethnic', train_size=100000):
    
    stats = build_name_stats(df, name_col, ethnicity_col)
    
    class_counts = df[ethnicity_col].value_counts()
    class_weights = {class_counts.index[i]: sum(class_counts)/count for i, count in enumerate(class_counts)}
    
    X, y = [], []
    for _, row in tqdm(df.iterrows()):
        features = create_features(row[name_col], stats)
        X.append(list(features.values()))
        y.append(row[ethnicity_col])
    
    X, y = np.array(X), np.array(y)
    np.random.seed(42)
    indices = np.random.permutation(len(X))
    
    train_indices = indices[:train_size]
    test_indices = indices[train_size:]
    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]
    
    visualize_embeddings(X, y, list(df[name_col]))
    
    return X_train, X_test, y_train, y_test, class_weights
    

In [11]:
def prepare_train_test_data(df_names, df, name_col = "name", ethnicity_col = "ethnic", train_size = 0,
):
    
    class_counts = df_names[ethnicity_col].value_counts()
    class_weights = {class_counts.index[i]: sum(class_counts)/count for i, count in enumerate(class_counts)}
    
    stats = build_name_stats(df_names, name_col, ethnicity_col)
    
    X = []
    for row in tqdm(df.iter_rows(named=True), total=len(df)):
        features = create_features(row["name_new"], stats)
        X.append(list(features.values()))
    
    X = np.array(X)
    np.random.seed(42)
    indices = np.random.permutation(len(X))
    
    split_idx = int(train_size * len(X))
    train_indices = indices[:split_idx]
    test_indices = indices[split_idx:]
    
    X_train, X_test = X[train_indices], X[test_indices]
    
    return X_train, X_test, class_weights

In [12]:
def name_classifier(df, X_train, X_test, y_train, y_test, class_weights, name_col='name', ethnicity_col='ethnic'):
    
    model = LogisticRegression(
        multi_class='multinomial',
        solver='lbfgs',
        max_iter=1000,
        class_weight=class_weights
    )
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    cats = sorted(df[ethnicity_col].unique())
    
    calculate_stats(y_test, y_pred, "Name Classifier", cats)
    
    return model


In [13]:
with open('./models/logit_classifier.pkl', 'rb') as file:
    model = pickle.load(file)

In [14]:
df = pd.read_csv(names)
df["ethnic"].value_counts()

ethnic
not    6994941
ind    3005059
Name: count, dtype: int64

In [15]:
lf20 = (
        pl.scan_csv(
            indivs20,
            separator=',', 
            quote_char='|', 
            encoding='utf8-lossy', 
            has_header=False,
            new_columns=['dummy1', 'dummy2', 'contrib_id', 'name', 'recip_id', 
                        'orgname', 'ultorg', 'realcode', 'dummy3', 'amount', 
                        'street', 'city', 'state', 'zip', 'recipcode', 'type', 'dummy4', 'dummy5', 'gender', 'dummy6', 'occupation', 'employer', 'dummy7'],
            schema_overrides={'amount': pl.Float64, 'name': pl.Utf8, 'state': pl.Utf8, 'city': pl.Utf8},
            ignore_errors=True
        )
        .select(['contrib_id', 'name', 'recip_id', 'orgname', 'ultorg', 'realcode', 
                 'amount', 'street', 'city', 'state', 'zip', 'recipcode', 'type', 
                 'gender', 'occupation', 'employer'])
        .filter(~pl.col('amount').is_null())
    )
df20 = lf20.collect()
print(df20.head(10))

shape: (10, 16)
┌─────────────┬─────────────┬───────────┬────────────┬───┬──────┬────────┬────────────┬────────────┐
│ contrib_id  ┆ name        ┆ recip_id  ┆ orgname    ┆ … ┆ type ┆ gender ┆ occupation ┆ employer   │
│ ---         ┆ ---         ┆ ---       ┆ ---        ┆   ┆ ---  ┆ ---    ┆ ---        ┆ ---        │
│ str         ┆ str         ┆ str       ┆ str        ┆   ┆ str  ┆ str    ┆ str        ┆ str        │
╞═════════════╪═════════════╪═══════════╪════════════╪═══╪══════╪════════╪════════════╪════════════╡
│ p0004869853 ┆ LONNBERG,   ┆ C00721712 ┆ [24T Contr ┆ … ┆ 24T  ┆ M      ┆ PARTNER    ┆ BOSTON     │
│             ┆ CARL        ┆           ┆ ibution]   ┆   ┆      ┆        ┆            ┆ CONSULTING │
│             ┆             ┆           ┆            ┆   ┆      ┆        ┆            ┆ GROUP      │
│ k0001516259 ┆ LOVO, MARIO ┆ N00044240 ┆ [24T Contr ┆ … ┆ 24T  ┆ M      ┆ LAWYER     ┆ SELF       │
│             ┆             ┆           ┆ ibution]   ┆   ┆      ┆        ┆ 

In [None]:
lf22 = (
        pl.scan_csv(
            indivs22,
            separator=',', 
            quote_char='|', 
            encoding='utf8-lossy', 
            has_header=False,
            new_columns=['dummy1', 'dummy2', 'contrib_id', 'name', 'recip_id', 
                        'orgname', 'ultorg', 'realcode', 'dummy3', 'amount', 
                        'street', 'city', 'state', 'zip', 'recipcode', 'type', 'dummy4', 'dummy5', 'gender', 'dummy6', 'occupation', 'employer', 'dummy7'],
            schema_overrides={'amount': pl.Float64, 'name': pl.Utf8, 'state': pl.Utf8, 'city': pl.Utf8},
            ignore_errors=True
        )
        .select(['contrib_id', 'name', 'recip_id', 'orgname', 'ultorg', 'realcode', 
                 'amount', 'street', 'city', 'state', 'zip', 'recipcode', 'type', 
                 'gender', 'occupation', 'employer'])
        .filter(~pl.col('amount').is_null())
    )
df22 = lf22.collect()
print(df22.head(10))

shape: (10, 16)
┌─────────────┬─────────────┬───────────┬────────────┬───┬──────┬────────┬────────────┬────────────┐
│ contrib_id  ┆ name        ┆ recip_id  ┆ orgname    ┆ … ┆ type ┆ gender ┆ occupation ┆ employer   │
│ ---         ┆ ---         ┆ ---       ┆ ---        ┆   ┆ ---  ┆ ---    ┆ ---        ┆ ---        │
│ str         ┆ str         ┆ str       ┆ str        ┆   ┆ str  ┆ str    ┆ str        ┆ str        │
╞═════════════╪═════════════╪═══════════╪════════════╪═══╪══════╪════════╪════════════╪════════════╡
│ r0014256510 ┆ DILLARD,    ┆ C00000935 ┆ [24T Contr ┆ … ┆ 24T  ┆ M      ┆ NOT        ┆ NOT        │
│             ┆ DANIEL      ┆           ┆ ibution]   ┆   ┆      ┆        ┆ EMPLOYED   ┆ EMPLOYED   │
│ r0015503614 ┆ WHITE,      ┆ C00633404 ┆ [24T Contr ┆ … ┆ 24T  ┆ M      ┆ NOT        ┆ NOT        │
│             ┆ SCOTTO      ┆           ┆ ibution]   ┆   ┆      ┆        ┆ EMPLOYED   ┆ EMPLOYED   │
│ p0003861308 ┆ DOMINGUEZ,  ┆ C00632398 ┆ [24T Contr ┆ … ┆ 24T  ┆ F      ┆ 

In [16]:
df20 = df20.with_columns([
    pl.col("name").str.split(",").list.get(-1).str.to_lowercase().str.strip_chars().alias("firstname"),
    pl.col("name").str.split(",").list.first().str.to_lowercase().str.strip_chars().alias("lastname"),
])

df20 = df20.with_columns([
    (
        pl.col("firstname").str.to_lowercase().str.strip_chars() + " " + 
        pl.col("lastname").str.to_lowercase().str.strip_chars()
    ).alias("name_new")
])

print(df20.head(10))

shape: (10, 19)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬──────────┬───────────┐
│ contrib_i ┆ name      ┆ recip_id  ┆ orgname   ┆ … ┆ employer  ┆ firstname ┆ lastname ┆ name_new  │
│ d         ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---      ┆ ---       │
│ ---       ┆ str       ┆ str       ┆ str       ┆   ┆ str       ┆ str       ┆ str      ┆ str       │
│ str       ┆           ┆           ┆           ┆   ┆           ┆           ┆          ┆           │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪══════════╪═══════════╡
│ p00048698 ┆ LONNBERG, ┆ C00721712 ┆ [24T Cont ┆ … ┆ BOSTON    ┆ carl      ┆ lonnberg ┆ carl      │
│ 53        ┆ CARL      ┆           ┆ ribution] ┆   ┆ CONSULTIN ┆           ┆          ┆ lonnberg  │
│           ┆           ┆           ┆           ┆   ┆ G GROUP   ┆           ┆          ┆           │
│ k00015162 ┆ LOVO,     ┆ N00044240 ┆ [24T Cont ┆ … ┆ SELF      ┆ mario    

In [None]:
df22 = df22.with_columns([
    pl.col("name").str.split(",").list.get(-1).str.to_lowercase().str.strip_chars().alias("firstname"),
    pl.col("name").str.split(",").list.first().str.to_lowercase().str.strip_chars().alias("lastname"),
])

df22 = df22.with_columns([
    (
        pl.col("firstname").str.to_lowercase().str.strip_chars() + " " + 
        pl.col("lastname").str.to_lowercase().str.strip_chars()
    ).alias("name_new")
])

print(df22.head(10))

shape: (10, 19)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ contrib_i ┆ name      ┆ recip_id  ┆ orgname   ┆ … ┆ employer  ┆ firstname ┆ lastname  ┆ name_new │
│ d         ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---      │
│ ---       ┆ str       ┆ str       ┆ str       ┆   ┆ str       ┆ str       ┆ str       ┆ str      │
│ str       ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ r00142565 ┆ DILLARD,  ┆ C00000935 ┆ [24T Cont ┆ … ┆ NOT       ┆ daniel    ┆ dillard   ┆ daniel   │
│ 10        ┆ DANIEL    ┆           ┆ ribution] ┆   ┆ EMPLOYED  ┆           ┆           ┆ dillard  │
│ r00155036 ┆ WHITE,    ┆ C00633404 ┆ [24T Cont ┆ … ┆ NOT       ┆ scotto    ┆ white     ┆ scotto   │
│ 14        ┆ SCOTTO    ┆           ┆ ribution] ┆   ┆ EMPLOYED  ┆          

In [17]:
donors20 = (
    df20.group_by("contrib_id")
    .agg(
        pl.col("name").first().alias("name"),
        pl.col("name_new").first().alias("name_new"),
        pl.col("lastname").first().alias("lastname"),
        pl.col("amount").sum().alias("total_donated"),
        pl.col("amount").count().alias("donation_count"),
        pl.col("amount").mean().alias("avg_donation")
    )
    .sort("total_donated", descending=True)
)
print(donors20.head(10))

shape: (10, 7)
┌──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐
│ contrib_id   ┆ name        ┆ name_new    ┆ lastname    ┆ total_donat ┆ donation_co ┆ avg_donatio │
│ ---          ┆ ---         ┆ ---         ┆ ---         ┆ ed          ┆ unt         ┆ n           │
│ str          ┆ str         ┆ str         ┆ str         ┆ ---         ┆ ---         ┆ ---         │
│              ┆             ┆             ┆             ┆ f64         ┆ u32         ┆ f64         │
╞══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡
│              ┆ FOR         ┆ amy for     ┆ for america ┆ 1.2556e9    ┆ 27780       ┆ 45197.81353 │
│              ┆ AMERICA,    ┆ america     ┆             ┆             ┆             ┆ 5           │
│              ┆ AMY         ┆             ┆             ┆             ┆             ┆             │
│ U00000037041 ┆ BLOOMBERG,  ┆ michael     ┆ bloomberg   ┆ 1.1277e9    ┆ 960

In [18]:
X_train, X_test, weights = prepare_train_test_data(df, donors20[["name_new"]])

100%|██████████| 3847755/3847755 [01:09<00:00, 55187.21it/s]


In [19]:
y_probs = model.predict_proba(X_test)[:, 1]
y_probs.mean(), y_probs.std()

(0.957804796987511, 0.13321386679631453)

In [22]:
y_pred_adjusted = (y_probs <= 0.80).astype(int)
y_pred_adjusted

array([0, 0, 0, ..., 0, 0, 0])

In [23]:
donors20 = donors20.with_columns(pl.Series(name="ethnic", values=y_pred_adjusted))
donors20 = donors20.with_columns(pl.Series(name="probs", values=y_probs))
donors20.write_csv("./data/donors20_with_pred80.csv")
donors20["ethnic"].value_counts()

ethnic,count
i32,u32
1,327213
0,3520542


In [43]:
print(donors20.filter(pl.col("lastname") == "khosla").head(10))

shape: (10, 9)
┌────────────┬────────────┬────────────┬──────────┬───┬────────────┬───────────┬────────┬──────────┐
│ contrib_id ┆ name       ┆ name_new   ┆ lastname ┆ … ┆ donation_c ┆ avg_donat ┆ ethnic ┆ probs    │
│ ---        ┆ ---        ┆ ---        ┆ ---      ┆   ┆ ount       ┆ ion       ┆ ---    ┆ ---      │
│ str        ┆ str        ┆ str        ┆ str      ┆   ┆ ---        ┆ ---       ┆ i32    ┆ f64      │
│            ┆            ┆            ┆          ┆   ┆ u32        ┆ f64       ┆        ┆          │
╞════════════╪════════════╪════════════╪══════════╪═══╪════════════╪═══════════╪════════╪══════════╡
│ U000000330 ┆ KHOSLA,    ┆ vinod      ┆ khosla   ┆ … ┆ 56         ┆ 43049.910 ┆ 0      ┆ 0.996533 │
│ 11         ┆ VINOD      ┆ khosla     ┆          ┆   ┆            ┆ 714       ┆        ┆          │
│ k000155328 ┆ KHOSLA,    ┆ gail       ┆ khosla   ┆ … ┆ 4          ┆ 26400.0   ┆ 0      ┆ 0.99645  │
│ 9@         ┆ GAIL       ┆ khosla     ┆          ┆   ┆            ┆        