In [1]:
import json
import pandas as pd
from pandas.io.json import json_normalize #package for flattening json in pandas df
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
with open('C:\\Users\\chq-shal\\Downloads\\projects_python\\project_workboard\\small3.json',encoding='utf-8') as d:
    f=json.load(d)
data=json_normalize(f)

In [7]:
list(data)

['beginTime', 'endTime', 'payload', 'scenarioBit', 'visibility']

In [11]:
data.endTime.isnull().value_counts() # true is not compliant, false might be compliant or not

False    6009
True     3991
Name: endTime, dtype: int64

In [2]:
data.scenarioBit.value_counts()

1        4304
16384    4041
8        1609
16393      46
Name: scenarioBit, dtype: int64

In [3]:
begin=pd.to_datetime(data.beginTime, utc=True)
end=pd.to_datetime(data.endTime, utc=True)
compliant=end-begin<np.timedelta64(2,'h') # true is compliant, false is not compliant

In [4]:
compliant.value_counts()

False    7798
True     2202
dtype: int64

In [12]:
payload=data['payload'].apply(lambda row:pd.Series(json.loads(row))) # convert payload dictionaries to columns
visibility=data['visibility'].apply(lambda row:pd.Series(json.loads(row))) # convert visibility dictionaries to columns

In [24]:
payload.columns

Index(['aggregateId', 'billToId', 'businessKey', 'branch', 'consignId',
       'curStatus', 'dateCreate', 'dateRcvdCfit', 'dsptchCarrierId',
       'etmsBranch', 'events', 'fcmFlag', 'fileType', 'holds', 'onHandRef',
       'linkedShipment', 'pcsAct', 'pcsActType', 'pickupDeliverFlag',
       'portOrigin', 'portDestin', 'revenueId', 'serviceLevel', 'serviceType',
       'shipmentType', 'shipperId', 'shpmntRef', 'streamBit', 'typeOfGoods',
       'vendorId', 'wgtAct', 'wgtDeclAct', 'workBranch', 'mode', 'brokerage',
       'ourPaper', 'deliveryId', 'delivery', 'kill', 'perspective',
       'dateBackby', 'portLoading', 'linkedShippingOrder', 'shipordRef',
       'commodityDesc', 'rstrctCommItarFlag', 'exportClearance', 'hazrdSw',
       'shipperName'],
      dtype='object')

In [6]:
events=pd.DataFrame(payload.events[0]) # is a json object

In [18]:
visibility.columns # milestone codes are json objects

Index(['primaryId', 'level', 'billToId', 'businessKey', 'consignId',
       'dsptchCarrierId', 'fileType', 'pickupDeliverFlag', 'portDestin',
       'portOrigin', 'revenueId', 'serviceLevel', 'serviceType',
       'shipmentType', 'shipperId', 'shpmntRef', 'typeOfGoods', 'vendorId',
       'mode', 'na', 'tc', 'cutoffPrep', 'devn', 'documentsSatisfied', 'dsn',
       'els', 'firstAware', 'frd', 'freightReceived', 'phd', 'sir',
       'startPrep', 'dateBackby', 'cdp', 'cpu', 'dra', 'drr', 'sli', 'test',
       'osd', 'commodityDesc', 'rstrctCommItarFlag', 'blpHb', 'puo', 'lkp',
       'smp', 'ecs', 'evr', 'ntc', 'cln', 'exportClearance', 'afs', 'sfs',
       'ecc', 'cek', 'hot', 'aue', 'frf', 'ega', 'shipperName', 'rud', 'ddn'],
      dtype='object')

In [65]:
first_aware=pd.DataFrame(list(visibility.firstAware)) # is a json object

In [25]:
df=pd.concat([begin,payload,visibility],axis=1)
df=df.dropna(axis=1) # drop columns that have na values in them
df=df.drop(['events','firstAware','etmsBranch','workBranch'],axis=1) # drop dict events, firstAware; drop dup etms, work
df=df.loc[:,~df.columns.duplicated()] # drop dup columns

In [26]:
df['dateDT']=pd.to_datetime(df.dateCreate, utc=True)
df['diffS']=(df.dateDT-df.beginTime).dt.total_seconds()
df['beginY']=df.beginTime.dt.dayofyear
df['beginW']=df.beginTime.dt.dayofweek
df['beginH']=df.beginTime.dt.hour
df['dateY']=df.dateDT.dt.dayofyear
df['dateW']=df.dateDT.dt.dayofweek
df['dateH']=df.dateDT.dt.hour

In [27]:
too_sim=0.0003*len(df) # very similar values in a column
too_dif=0.5*len(df) # very different values in a column
for col in df.columns: # remove columns that have values that are too similar or too different
    if df[col].nunique()<too_sim or df[col].nunique()>too_dif:
        df.drop(col,inplace=True,axis=1)

In [30]:
df[['branch','curStatus','portOrigin','shipmentType','mode','brokerage','kill']]=df[['branch','curStatus','portOrigin','shipmentType','mode','brokerage','kill']].astype('category')
df.dtypes

branch          category
curStatus       category
portOrigin      category
shipmentType    category
mode            category
brokerage       category
kill            category
diffS            float64
beginY             int64
beginW             int64
beginH             int64
dateY              int64
dateW              int64
dateH              int64
dtype: object

In [31]:
X_train,X_test,y_train,y_test=train_test_split(pd.get_dummies(df),compliant,random_state=0)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

  return self.partial_fit(X, y)
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [32]:
from sklearn.ensemble import GradientBoostingClassifier
# clf=GradientBoostingClassifier(max_depth=2,random_state=0)
# grid_values = {'learning_rate': [0.001, 0.01, 0.05, 0.1, 1, 10]}
# grid_clf_auc = GridSearchCV(clf, param_grid = grid_values, scoring = 'roc_auc')
# grid_clf_auc.fit(X_train_scaled, y_train)
# y_decision_fn_scores_auc = grid_clf_auc.decision_function(X_test_scaled)
# print('Test set AUC: ', roc_auc_score(y_test, y_decision_fn_scores_auc))
# print('Grid best parameter (max. AUC): ', grid_clf_auc.best_params_)
# print('Grid best score (AUC): ', grid_clf_auc.best_score_)
clf=GradientBoostingClassifier(learning_rate=1,max_depth=2,random_state=0) # build bradient boost model with learning rate=1
gb_clf=clf.fit(X_train_scaled, y_train)
gb_y_score=gb_clf.decision_function(X_test_scaled)
gb_fpr, gb_tpr, _ = roc_curve(y_test, gb_y_score)
gb_roc_auc = auc(gb_fpr, gb_tpr)
gb_roc_auc

0.9305341981969364

In [34]:
feature=gb_clf.feature_importances_
pd.Series(data=feature,index=list(X_train)).nlargest(5)

mode_4            0.352646
branch_HKG        0.112139
diffS             0.096293
portOrigin_SFO    0.041666
beginH            0.039340
dtype: float64

In [35]:
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
# clf = RandomForestClassifier()
# grid_values = {'n_estimators': [200, 700],'max_features': ['auto', 'sqrt', 'log2']}
# grid_clf_auc = GridSearchCV(clf, param_grid = grid_values, scoring = 'roc_auc')
# grid_clf_auc.fit(X_train_scaled, y_train)
# y_decision_fn_scores_auc = grid_clf_auc.predict_proba(X_test_scaled)[:,1]
# print('Test set AUC: ', roc_auc_score(y_test, y_decision_fn_scores_auc))
# print('Grid best parameter (max. AUC): ', grid_clf_auc.best_params_)
# print('Grid best score (AUC): ', grid_clf_auc.best_score_)
clf=RandomForestClassifier(max_features='log2', n_estimators=700)
rf_clf=clf.fit(X_train_scaled, y_train)
rf_y_score=rf_clf.predict_proba(X_test_scaled)[:,1]
rf_fpr, rf_tpr, _ = roc_curve(y_test, rf_y_score)
rf_roc_auc = auc(rf_fpr, rf_tpr)
rf_roc_auc

0.9566099525453268

In [36]:
feature=rf_clf.feature_importances_
pd.Series(data=feature,index=list(X_train)).nlargest(5)

diffS               0.113535
beginH              0.094573
dateH               0.084722
shipmentType_OHA    0.048629
beginY              0.047920
dtype: float64

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# clf = KNeighborsClassifier(n_jobs=-1)
# grid_values = {'n_neighbors': list(range(1,11))}
# grid_clf_auc = GridSearchCV(clf, param_grid = grid_values, scoring = 'roc_auc')
# grid_clf_auc.fit(X_train_scaled, y_train)
# y_decision_fn_scores_auc = grid_clf_auc.predict_proba(X_test_scaled)[:,1]
# print('Test set AUC: ', roc_auc_score(y_test, y_decision_fn_scores_auc))
# print('Grid best parameter (max. AUC): ', grid_clf_auc.best_params_)
# print('Grid best score (AUC): ', grid_clf_auc.best_score_)
clf=KNeighborsClassifier(n_neighbors=7,n_jobs=-1) # build knn model with neighbors=7
knn_clf=clf.fit(X_train_scaled, y_train)
knn_y_score=knn_clf.predict_proba(X_test_scaled)[:,1]
knn_fpr, knn_tpr, _ = roc_curve(y_test, knn_y_score)
knn_roc_auc = auc(knn_fpr, knn_tpr)
knn_roc_auc

In [14]:
from sklearn.linear_model import LogisticRegression
# clf = LogisticRegression(n_jobs=-1)
# grid_values = {'C': [1,10,100,1000]}
# grid_clf_auc = GridSearchCV(clf, param_grid = grid_values, scoring = 'roc_auc')
# grid_clf_auc.fit(X_train_scaled, y_train)
# y_decision_fn_scores_auc = grid_clf_auc.decision_function(X_test_scaled)
# print('Test set AUC: ', roc_auc_score(y_test, y_decision_fn_scores_auc))
# print('Grid best parameter (max. AUC): ', grid_clf_auc.best_params_)
# print('Grid best score (AUC): ', grid_clf_auc.best_score_)
clf=LogisticRegression(C=1,n_jobs=-1) # build logistic regression model with C=1
lr_clf=clf.fit(X_train_scaled, y_train)
lr_y_score=lr_clf.predict_proba(X_test_scaled)[:,1]
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_y_score)
lr_roc_auc = auc(lr_fpr, lr_tpr)
lr_roc_auc

  " = {}.".format(effective_n_jobs(self.n_jobs)))


0.8841912426238085

In [18]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
dt_clf=clf.fit(X_train_scaled, y_train)
dt_y_score=dt_clf.predict_proba(X_test_scaled)[:,1]
dt_fpr, dt_tpr, _ = roc_curve(y_test, dt_y_score)
dt_roc_auc = auc(dt_fpr, dt_tpr)
dt_roc_auc

0.86274229457301