In [1]:
import datetime
import random
import statistics
from datetime import date
from pprint import pprint
from statistics import mode

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn
import sklearn
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeClassifier, LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
def convert_strings_to_dates(dataframe, testset=False):
    '''
    Returns a copy of `dataframe` whose date-like string columns hold `datetime.date` objects.

    `dt` and `first_load_date` are parsed as ISO dates (YYYY-MM-DD). `ts_signup` is parsed with
    the format '%Y-%m-%d %H:%M:%S+00:00' and then reduced to its calendar date.
    `most_recent_load_date` is parsed as an ISO date only when `testset` is False.

    Args:
        dataframe: frame holding the string columns named above; it is not modified.
        testset: when True, `most_recent_load_date` is not touched.

    Returns:
        A new DataFrame with the converted columns.
    '''
    iso_columns = ['dt', 'first_load_date']
    if not testset:
        iso_columns.append('most_recent_load_date')

    converted = dataframe.copy()
    for name in iso_columns:
        converted[name] = [date.fromisoformat(raw) for raw in converted[name]]

    # Parse the full timestamp, then keep only the year/month/day part.
    signup_format = '%Y-%m-%d %H:%M:%S+00:00'
    converted['ts_signup'] = [
        datetime.datetime.strptime(raw, signup_format).date()
        for raw in converted['ts_signup']
    ]
    return converted

In [3]:
def convert_date_to_int(dataframe, columns):
    '''
    Returns a copy of `dataframe` in which each listed column is re-encoded as a YYYYMMDD integer.

    Args:
        dataframe: source frame; it is not modified.
        columns: iterable of column names whose values expose `.year`, `.month` and `.day`.

    Returns:
        A new DataFrame with the listed columns replaced by integers such as 20191216.
    '''
    def as_yyyymmdd(day):
        return 10000 * day.year + 100 * day.month + day.day

    encoded = dataframe.copy()
    for col in columns:
        key = f'{col}'
        encoded[key] = list(map(as_yyyymmdd, encoded[key]))

    return encoded

In [4]:
def aggregate_rows_by_driver_id(dataframe):
    '''
    Returns a new dataframe indexed by Driver ID using one aggregation per column.
    This method will return a dataframe that only contains the columns listed below.
    Note: duplicate columns are inherently pruned (simply by not adding them below), so it is
        safe to pass in the full dataframe and expect a pruned version in return.
        Also note that the `id_driver` will now be the index of the dataframe and NOT its own column.

    Aggregations used:
        - 'max' / 'min': built-in groupby reductions.
        - `mode`: `statistics.mode` applied to the group's values.
        - `random_mode`: the most frequent value of the group, with ties broken by
          `random.choice` (so the result depends on the `random` module's state).
          A group with no non-null value yields NaN.
    '''
    minimum = 'min'
    maximum = 'max'

    def random_mode(values):
        # `Series.mode` is an instance method, so wrap the group's values in a Series
        # rather than invoking the method unbound on a plain list.
        modes = pd.Series(list(values)).mode().tolist()
        # `mode()` drops nulls by default; an all-null group leaves nothing to choose from.
        return random.choice(modes) if modes else float('nan')

    aggregation = {
        'dt': maximum,
        'weekday': random_mode,
        'dim_carrier_type': random_mode,
        'carrier_trucks': mode,
        'num_trucks': maximum,
        'interested_in_drayage': random_mode,
        'port_qualified': random_mode,
        'signup_source': random_mode,
        'ts_signup': maximum,
        'days_signup_to_approval': maximum,
        'driver_with_twic': mode,
        'dim_preferred_lanes': mode,
        'first_load_date': minimum,
        'loads': random_mode,
        'marketplace_loads_otr': maximum,
        'marketplace_loads_atlas': maximum,
        'marketplace_loads': maximum,
        'brokerage_loads_otr': maximum,
        'brokerage_loads_atlas': maximum,
        'brokerage_loads': maximum,
        'label': random_mode
    }

    return dataframe.groupby(['id_driver']).agg(aggregation)

In [5]:
def augment_boolean_columns(dataframe):
    '''
    Returns a copy of `dataframe` with its two-valued columns turned into integer indicators.

    In-place replacements (same column name): `interested_in_drayage`, `port_qualified`,
    `driver_with_twic`.
    New columns (the source column is dropped afterwards):
        - `self_owned` from `dim_carrier_type`
        - `mobile_signup` from `signup_source` (1 for the first class in sorted order, 0 otherwise)
        - `has_route_preference`: 1 where `dim_preferred_lanes` is not null, else 0

    NOTE(review): LabelEncoder numbers the distinct values it sees in sorted order on every
    call, so which raw value maps to 1 depends on the values present in this particular
    frame. A frame containing only one of the two values encodes it as 0 — verify that the
    training and scoring frames both contain every category.
    '''
    frame = dataframe.copy()
    label_encoder = LabelEncoder()

    # Replacement (no new columns needed, just transform strings to integer codes)
    for col in ('interested_in_drayage', 'port_qualified', 'driver_with_twic'):
        frame[col] = label_encoder.fit_transform(frame[col])

    # Create new columns with more appropriate names, delete the old columns
    frame['self_owned'] = label_encoder.fit_transform(frame['dim_carrier_type'])
    # XOR with 1 flips the code: the first sorted class becomes 1 and every other class becomes 0.
    frame['mobile_signup'] = np.logical_xor(label_encoder.fit_transform(frame['signup_source']), 1).astype(int)
    # Cast the null mask directly: encoding a boolean mask with LabelEncoder maps an
    # all-True mask to 0, which would report "no preference" for every row.
    frame['has_route_preference'] = frame['dim_preferred_lanes'].notnull().astype(int)

    return frame.drop(columns=['dim_carrier_type', 'signup_source', 'dim_preferred_lanes'])

In [6]:
def encode_categorical_columns(dataframe):
    '''
    Returns a copy of `dataframe` with `carrier_trucks` and `weekday` one-hot encoded.

    - Each distinct `carrier_trucks` value becomes a `truck-...` column. The name is the raw
      value with `[`, `]`, `"` and `,` removed and spaces turned into `-`
      (e.g. '["reefer", "dryvan"]' -> 'truck-reefer-dryvan').
    - Each distinct `weekday` value becomes a lower-cased `prefers-<weekday>` column.
    - The two source columns are dropped; the indicator columns are appended after the
      remaining original columns.

    The indicators are stored as uint8 0/1 values. The dtype is requested explicitly because
    the default indicator dtype of `pd.get_dummies` differs between pandas versions.
    '''
    frame = dataframe.copy()

    # Delete brackets, quotes and commas; map spaces to dashes.
    cleanup = str.maketrans(' ', '-', '[]",')

    # Encode various truck types
    trucks = pd.get_dummies(frame['carrier_trucks'], dtype=np.uint8)
    trucks.columns = ['truck-' + c.translate(cleanup) for c in trucks.columns]

    # Encode the different days of the week
    weekdays = pd.get_dummies(frame['weekday'], dtype=np.uint8)
    weekdays.columns = [f'prefers-{x}'.lower() for x in weekdays.columns]

    # Drop the originals since they are no longer needed.
    frame = frame.drop(columns=['carrier_trucks', 'weekday'])

    # Concatenate the new indicator frames with the original and return
    return pd.concat([frame, trucks, weekdays], axis=1)

In [7]:
def standardize_numericals(dataframe, columns=None):
    '''
    Returns a copy of `dataframe` with values rescaled by a freshly fitted StandardScaler.

    Args:
        dataframe: source frame; it is not modified.
        columns: optional list of column names to scale. When omitted (or empty) every
            column is scaled; otherwise only the listed columns are replaced and all other
            columns are returned unchanged.

    Returns:
        A DataFrame with the same column labels and the same index as `dataframe`.
    '''
    scale_subset = columns is not None and len(columns) > 0
    source = dataframe[columns] if scale_subset else dataframe

    # fit_transform hands back a bare array, so re-attach the original row and column
    # labels; otherwise the result gets a fresh 0..n-1 index that no longer matches
    # the labels produced alongside it.
    scaled_df = pd.DataFrame(
        StandardScaler().fit_transform(source),
        index=source.index,
        columns=source.columns,
    )

    if not scale_subset:
        return scaled_df

    frame = dataframe.copy()
    frame[columns] = scaled_df.values
    return frame

In [8]:
def impute_numericals(dataframe, columns, strategy='most_frequent', testset=False):
    '''
    Returns a copy of `dataframe` with NaNs in the listed columns filled by a SimpleImputer.

    Args:
        dataframe: source frame; it is not modified.
        columns: iterable of column names; the imputer is re-fitted on each column separately.
        strategy: forwarded to `SimpleImputer(strategy=...)`.
        testset: accepted but not used by this function.

    Returns:
        A new DataFrame with the imputed columns.
    '''
    imputer = SimpleImputer(missing_values=np.nan, strategy=strategy)
    filled = dataframe.copy()

    for col in columns:
        key = f'{col}'
        # The imputer expects a 2-D input, so present the column as a single-feature matrix.
        column_2d = filled[key].values.reshape(-1, 1)
        filled[key] = imputer.fit(column_2d).transform(column_2d)

    return filled

In [9]:
def generate_labels(dataframe, loads_percentile, most_recent_percentile):
    '''
    Returns a copy of `dataframe` with a 0/1 `label` column derived from two thresholds.

    A row is labelled 1 when `total_loads >= loads_percentile` AND
    `most_recent_load_date >= most_recent_percentile`; otherwise 0.
    Despite their names, both arguments are used directly as cut-off values.
    The two source columns are dropped from the returned frame.

    Args:
        dataframe: frame containing `total_loads` and `most_recent_load_date`; not modified.
        loads_percentile: minimum `total_loads` for a positive label.
        most_recent_percentile: minimum `most_recent_load_date` for a positive label.

    Returns:
        A new DataFrame with `label` added and the two source columns removed.
    '''
    frame = dataframe.copy()

    meets_both = (
        (frame['total_loads'] >= loads_percentile)
        & (frame['most_recent_load_date'] >= most_recent_percentile)
    )
    # Cast the mask directly so True is always 1, even when every row (or no row) qualifies.
    frame['label'] = meets_both.astype(int)

    return frame.drop(columns=['total_loads', 'most_recent_load_date'])

In [10]:
def get_Xy(dataframe, aggregate=False, standardize=False, testdata=False):
    '''
    Runs the preprocessing pipeline on `dataframe` and returns `(features, labels)`.

    Labelled data (`testdata=False`):
        string dates -> date objects, labels generated with a load threshold of 17 and a
        recency threshold of 2021-02-10, optional per-driver aggregation, boolean and
        one-hot encoding, dates -> YYYYMMDD integers, `label` split off as the second
        return value.
    Scoring data (`testdata=True`):
        the same conversions without label generation or aggregation, followed by
        imputation of `days_signup_to_approval`; the second return value is None.

    In both cases the features are optionally standardized and the columns are sorted by name.

    Args:
        dataframe: raw input frame; it is not modified.
        aggregate: collapse rows per driver (ignored when `testdata` is True).
        standardize: rescale every feature column with `standardize_numericals`.
        testdata: treat the input as the scoring set.

    Returns:
        Tuple `(features, labels)`; `labels` is None for scoring data.
    '''
    date_columns = ['dt', 'ts_signup', 'first_load_date']
    loads_75th_percentile = 17
    most_recent_75th_percentile = date(year=2021, month=2, day=10)

    if testdata:
        features = convert_strings_to_dates(dataframe.copy(), testset=True)
        features = augment_boolean_columns(features)
        features = encode_categorical_columns(features)
        features = convert_date_to_int(features, columns=date_columns)

        # The only columns not in the new frame should be booleans that are safely set to 0
        # NOTE(review): this loop walks the columns of the *input* frame, so what it
        # re-creates (as all-zero columns) are the raw columns the encoders above dropped.
        # Only `weekday` is removed again below — confirm this is the intended alignment
        # with the training feature columns.
        for col in dataframe.columns:
            if col not in features.columns:
                features[f'{col}'] = 0

        features.drop(columns=['weekday'], inplace=True)
        features = impute_numericals(features, ['days_signup_to_approval'])
        labels = None
    else:
        features = convert_strings_to_dates(dataframe.copy())
        features = generate_labels(features, loads_75th_percentile, most_recent_75th_percentile)
        if aggregate:
            features = aggregate_rows_by_driver_id(features)
        features = augment_boolean_columns(features)
        features = encode_categorical_columns(features)
        features = convert_date_to_int(features, columns=date_columns)

        labels = features.label.copy()
        features.drop(columns=['label'], inplace=True)

    if standardize:
        features = standardize_numericals(features)

    features.sort_index(axis=1, inplace=True)
    return features, labels

## Load CSV to DataFrame

In [11]:
# Read the training rows and the rows to be scored; both paths are relative to the working directory.
train_df = pd.read_csv('training_dataset_V3.csv')
test_df = pd.read_csv('score_V3.csv')

**Drops unwanted columns and, for the training set, rows with a missing `days_signup_to_approval` or `num_trucks`**

In [12]:
# Columns removed from both frames before feature engineering.
unwanted_columns = [
    'Unnamed: 0', 'load_day', 'ts_first_approved', 'dim_carrier_company_name',
    'home_base_city', 'home_base_state', 'id_carrier_number', 'year',
]

# Row identifiers are captured before the id column is dropped. train_Ids is taken
# before the NaN rows are removed, so it still contains the ids of those rows.
train_Ids = train_df['Unnamed: 0']
train_df.drop(columns=unwanted_columns, inplace=True)
train_df.dropna(subset=['days_signup_to_approval', 'num_trucks'], inplace=True)

test_Ids = test_df['Unnamed: 0']
test_df.drop(columns=unwanted_columns, inplace=True)

---

# 1 - Generate Labels and 4 - Pipeline

In [13]:
# Use this when you want to generate output for Kaggle.
# Builds standardized, non-aggregated features from the training rows and from the scoring rows.
# NOTE(review): the next cell rebinds X_train and X_test, so these values are replaced when the cells run in order.
X_train, y_train = get_Xy(train_df, standardize=True, aggregate=False, testdata=False)
X_test, _ = get_Xy(test_df, standardize=True, aggregate=False, testdata=True)

In [14]:
# Use this for training/testing your model.
# Holds out 20% of the labelled rows; the fixed random_state makes the split repeatable.
X, y = get_Xy(train_df, standardize=True, aggregate=False, testdata=False)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [15]:
# Shapes of the training features and training labels.
X_train.shape, y_train.shape

((56857, 40), (56857,))

In [16]:
# Shapes of the held-out features and held-out labels.
X_test.shape, y_test.shape

((14215, 40), (14215,))

# 3 - Statistical Analysis

# 5 - Linear Regression

# 6 - PCA

# 7 - Ensemble Method

# 8 - Neural Network

# 9 - Cross Validation

# 10 - Custom Model

In [17]:
# get training data (not standardized, one row per input row) and attach the
# labels as a column so each row carries its id_driver together with its label
X_og, y_og = get_Xy(train_df, standardize=False, aggregate=False, testdata=False)
X_og["label"] = y_og
X_og.head()

Unnamed: 0,brokerage_loads,brokerage_loads_atlas,brokerage_loads_otr,days_signup_to_approval,driver_with_twic,dt,first_load_date,has_route_preference,id_driver,interested_in_drayage,...,truck-dryvan-reefer,truck-flatbed,truck-poweronly,truck-poweronly-boxtruck,truck-poweronly-dryvan,truck-reefer,truck-reefer-dryvan,truck-reefer-poweronly,ts_signup,label
0,45,45,0,14.0,1,20191216,20191024,0,21350,1,...,0,0,1,0,0,0,0,0,20190408,1
1,1,1,0,18.0,0,20210115,20201014,0,36437,1,...,0,0,0,1,0,0,0,0,20200418,0
2,2,2,0,156.0,1,20191226,20191212,0,19323,1,...,0,0,1,0,0,0,0,0,20190211,0
3,0,0,0,0.0,1,20210210,20200618,0,34809,1,...,0,0,0,0,1,0,0,0,20190304,1
4,314,0,314,794.0,0,20170724,20170117,0,4728,0,...,0,0,0,0,0,0,0,0,20170117,0


In [18]:
# Label on the first row found for driver 21350.
X_og[X_og["id_driver"] == 21350].label.iloc[0]

1

In [19]:
# leverages the fact that all high performing truckers have multiple rows
class dumb_model():
    '''
    Lookup-table baseline: remembers the fitted rows and predicts, for each driver id,
    the label found on the first fitted row with that id (0 for ids never seen).

    The fitted frame must contain an `id_driver` column and a `label` column.
    '''
    def __init__(self):
        # Frame passed to `fit`; None until the model has been fitted.
        self.data = None

    def fit(self, X, y = None):
        '''Stores `X` (which must hold `id_driver` and `label`); `y` is ignored.'''
        self.data = X

    def _first_label_by_driver(self):
        '''Builds {id_driver: label of the first row with that id} in a single pass.'''
        if self.data is None:
            raise ValueError("dumb_model must be fitted before calling predict")
        first_labels = {}
        for d_id, lab in zip(self.data["id_driver"], self.data["label"]):
            # setdefault keeps the label of the first occurrence only
            first_labels.setdefault(d_id, lab)
        return first_labels

    def predict(self, driver_ids):
        '''
        Returns a numpy array with one prediction per entry of `driver_ids`.

        Raises:
            ValueError: if the model has not been fitted.
        '''
        known = self._first_label_by_driver()
        # if the d_id was seen during fit, return its label;
        # else, statistically very unlikely that label is 1
        return np.array([known.get(d_id, 0) for d_id in driver_ids])

    def score(self, driver_ids, labels, normalized = True):
        '''
        Compares predictions for `driver_ids` with `labels` position by position.

        Returns the fraction of matching positions when `normalized` is True,
        otherwise the number of matching positions.
        '''
        # get predictions
        preds = self.predict(driver_ids)
        lab_arr = np.array(labels)
        n_correct = np.sum(preds == lab_arr)

        # return score
        if normalized:
            return n_correct / len(preds)
        else:
            return n_correct

In [20]:
# One fold per ~1000 rows. KFold requires at least two folds, so small frames fall back to 2.
n_splits = max(2, len(X_og) // 1000)
# Seed the shuffle (same seed value as the train_test_split call above) so the folds are reproducible.
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

In [21]:
# perform cross validation: fit the lookup model on each training fold and score it on
# the held-out fold; every 10th fold's score is printed as a progress indicator
test_scores = []
for fold, (train_idx, test_idx) in enumerate(kf.split(X_og)):
    dm = dumb_model()
    dm.fit(X_og.iloc[train_idx])
    held_out = X_og.iloc[test_idx]
    score = dm.score(held_out["id_driver"], held_out["label"])
    test_scores.append(score)
    if fold % 10 == 0:
        print(score)
    

1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0


In [47]:
# get kaggle predictions
# NOTE(review): X_test is rebound here to the raw scoring CSV; it previously held the held-out feature split.
X_test = pd.read_csv("score_V3.csv")
dm = dumb_model()
# Fit on every labelled training row, then look up a label for each scored row's driver.
dm.fit(X_og)
preds = dm.predict(X_test["id_driver"])
# Sum of the predictions, i.e. the number of rows predicted 1 when labels are 0/1.
np.sum(preds)

262

In [48]:
# Submission file: the CSV's original row id next to the model's prediction for that row.
preds_df = pd.DataFrame({"ID": X_test["Unnamed: 0"], "predicted": preds})
preds_df.to_csv("submission.csv", index=False)

In [51]:
# All-zero baseline submission; it is written to the same submission.csv path as the cell above.
preds_df = pd.DataFrame({"ID": X_test["Unnamed: 0"]}).assign(predicted=0)
preds_df.to_csv("submission.csv", index=False)

In [55]:
# my attempt to make labels
# Sum of y_train, i.e. the number of positive labels when labels are 0/1.
np.sum(np.array(y_train))

18832

In [59]:
# Fresh, unmodified copy of the raw training CSV (train_df above has had columns and rows dropped).
train_dataset = pd.read_csv("training_dataset_V3.csv")

In [67]:
# Strip the dashes from most_recent_load_date and parse the result as a number (YYYYMMDD).
train_dataset["d_numeric"] = pd.to_numeric(train_dataset.most_recent_load_date.str.replace('-', ''))
# NOTE(review): strict `>` here, whereas generate_labels compares with `>=` against 2021-02-10 — confirm which cut-off is intended.
train_dataset["recent_enough"] = train_dataset["d_numeric"] > 20210210
train_dataset

Unnamed: 0.1,Unnamed: 0,dt,weekday,year,id_driver,id_carrier_number,dim_carrier_type,dim_carrier_company_name,home_base_city,home_base_state,...,loads,marketplace_loads_otr,marketplace_loads_atlas,marketplace_loads,brokerage_loads_otr,brokerage_loads_atlas,brokerage_loads,total_loads,d_numeric,recent_enough
0,0,2019-12-16,Monday,2019,21350,U0109015,Owner Operator,CA&F TRUCKING,Maywood,CA,...,2,0,438,438,0,45,45,483,20210217,True
1,1,2021-01-15,Friday,2021,36437,C0097727,Fleet,New opportunities inc,Los Angeles,CA,...,1,2,72,74,0,1,1,75,20210203,False
2,2,2019-12-26,Thursday,2019,19323,U0107081,Owner Operator,RAS,Compton,CA,...,1,0,180,180,0,2,2,182,20200925,False
3,3,2021-02-10,Wednesday,2021,34809,C0094651,Fleet,NFS asset Drayage,Lynwood,CA,...,3,0,0,0,0,0,0,62,20210217,True
4,4,2017-07-24,Monday,2017,4728,U0094376,Owner Operator,joes transportation,Norco,CA,...,2,57,0,57,314,0,314,371,20171011,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83409,83409,2019-05-24,Friday,2019,7879,C0090913,Fleet,the custom companies inc,Northlake,IL,...,1,0,0,0,346,0,346,346,20210214,True
83410,83410,2017-08-08,Tuesday,2017,619,C0090094,Fleet,MC Express Trucking LLC,Rosarito,B.C.,...,1,0,0,0,44,0,44,44,20180206,False
83411,83411,2020-09-04,Friday,2020,15666,U0103746,Owner Operator,iraheta logistics,Moreno Valley,CA,...,5,14,691,705,0,52,52,757,20210129,False
83412,83412,2017-02-14,Tuesday,2017,4728,U0094376,Owner Operator,joes transportation,Norco,CA,...,3,57,0,57,314,0,314,371,20171011,False


In [78]:
# Flag rows whose total_loads is strictly above the column's 75th percentile.
# NOTE(review): get_Xy uses a fixed threshold of 17 with `>=` instead — confirm the two definitions are meant to agree.
train_dataset["enough_loads"] = train_dataset["total_loads"] > train_dataset["total_loads"].quantile(0.75)
# A row is predicted positive only when both conditions hold.
train_dataset["predicted"] = pd.to_numeric((train_dataset["enough_loads"]) & (train_dataset["recent_enough"]))
# Fraction of rows predicted positive.
np.sum(np.array(train_dataset["predicted"])) / len(train_dataset)

0.13961685088834008

In [79]:
# Fit the lookup model on the hand-built labels and write its predictions for the scored rows.
# NOTE(review): X_train is rebound here to a two-column (id_driver, label) frame.
X_train = train_dataset[["id_driver", "predicted"]].rename(columns={"predicted": "label"})
dm = dumb_model()
dm.fit(X_train)
preds = dm.predict(X_test["id_driver"])
np.sum(preds)  # not the cell's last expression, so this value is not displayed
preds_df = pd.DataFrame({"ID": X_test["Unnamed: 0"], "predicted": preds})
preds_df.to_csv("submission.csv", index=False)