In [None]:
### Imports ###
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import json
import gzip
import sys
sys.path.append("lib")
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from keras.models import model_from_json
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.constraints import maxnorm
from keras.optimizers import SGD
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import LeakyReLU, PReLU
from sklearn.decomposition import PCA
from scipy import cluster
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import pdist
from scipy.stats import chi2_contingency, pearsonr
from itertools import combinations
from sklearn import decomposition
from sklearn.manifold import TSNE
from ggplot import *
from sklearn.manifold import TSNE

In [None]:
# Read the data: the CSV uses ';' as field separator.
df_train = pd.read_csv('../input/alldata/DATA.csv', sep=';')
# Show (rows, columns) of the raw frame.
df_train.shape

In [None]:
# Preview the first 15 raw rows.
df_train.head(15)

In [None]:
# Count missing values per column before any cleaning.
df_train.isnull().sum()

In [None]:
# Rows without a target value ("Time in/out (hours)") cannot be used: drop them.
df_train.dropna(subset=["Time in/out (hours)"], inplace=True)

# A missing event date falls back to the corresponding reference date column.
for event_col, fallback_col in (('Discharged', 'Date of Arrival'),
                                ('Loaded', 'Date of Departure (Scheduled)')):
    df_train[event_col] = df_train[event_col].fillna(df_train[fallback_col])

# Missing categorical codes get an explicit "NONE" category.
for code_col in ("CommunityCode", "Load Location"):
    df_train[code_col] = df_train[code_col].replace(np.nan, "NONE")

In [None]:
# Re-count missing values per column after the clean-up above.
df_train.isnull().sum()

In [None]:
# Drop columns that are not used as features from here on.
df_train = df_train.drop(columns = ['Date of Departure (Scheduled)','Date of Arrival','Voyage Number'])

In [None]:
# Convert the date columns to datetime values.
# NOTE(review): pd.to_datetime is called with default parsing options; confirm
# the day/month order of the raw strings matches what it infers.
for date_col in ('Discharged', 'Loaded'):
    df_train[date_col] = pd.to_datetime(df_train[date_col])

In [None]:
# Preview the frame after the date conversion.
df_train.head()

In [None]:
# Separate each datetime column into year / month / day / weekday-name columns,
# inserted at the position of the source column (which is then dropped).
for date_col in ('Discharged', 'Loaded'):
    index_no = df_train.columns.get_loc(date_col)
    stamps = pd.DatetimeIndex(df_train[date_col])
    date_parts = (
        ('Year', stamps.year),
        ('Month', stamps.month),
        ('Day', stamps.day),
        ('Weekday', stamps.day_name()),
    )
    for offset, (suffix, values) in enumerate(date_parts):
        # Last positional argument is allow_duplicates.
        df_train.insert(index_no + offset, date_col + ': ' + suffix, values, True)

# Drop the original datetime columns now that their parts are stored.
df_train = df_train.drop(columns=['Discharged', 'Loaded'])

df_train.head()

In [None]:
# Scale the weight by 1000.
# NOTE: this cell is not idempotent; re-running it scales the weight again.
df_train['Total Weight'] = df_train['Total Weight'].mul(1000)

# Store time and weight as integers.
for numeric_col in ('Time in/out (hours)', 'Total Weight'):
    df_train[numeric_col] = df_train[numeric_col].astype(int)

df_train.head()

In [None]:
# Occurrence count of every delivery location, as a one-column frame indexed by location.
isRare = pd.DataFrame(df_train['Delivery Location'].value_counts())
isRare.head()

In [None]:
# Collect the delivery locations that occur fewer than 3 times.
# The counts are read by position (first column of `isRare`) instead of by
# label: the label of that column depends on the pandas version
# ('Delivery Location' before 2.0, 'count' from 2.0 on).
rare_mask = isRare.iloc[:, 0] < 3
list_rare = isRare[rare_mask].index.unique().tolist()
len(list_rare)

In [None]:
# Replace every rare delivery location with the single category "isRare".
# One vectorised assignment instead of a Python loop over all rows.
df_train.loc[df_train['Delivery Location'].isin(list_rare), 'Delivery Location'] = "isRare"

In [None]:
# Occurrence count of every load location, as a one-column frame indexed by location.
isRare = pd.DataFrame(df_train['Load Location'].value_counts())
isRare.head()

In [None]:
# Collect the load locations that occur fewer than 3 times.
# The counts are read by position (first column of `isRare`) instead of by
# label: the label of that column depends on the pandas version
# ('Load Location' before 2.0, 'count' from 2.0 on).
rare_mask = isRare.iloc[:, 0] < 3
list_rare = isRare[rare_mask].index.unique().tolist()
len(list_rare)

In [None]:
# Replace every rare load location with the single category "isRare".
# One vectorised assignment instead of a Python loop over all rows.
df_train.loc[df_train['Load Location'].isin(list_rare), 'Load Location'] = "isRare"

In [None]:
def convertWeight(y):
    """Return the label of the weight band that ``y`` falls into.

    Bands are 5000 wide with inclusive upper bounds: values up to 5000 map to
    band A, up to 10000 to B, and so on; anything that matches no bound
    (values above 25000, or NaN) maps to band F.
    """
    bands = (
        (5000, 'A(weight<5000)'),
        (10000, 'B(5000<weight<10000)'),
        (15000, 'C(10000<weight<15000)'),
        (20000, 'D(15000<weight<20000)'),
        (25000, 'E(20000<weight<25000)'),
    )
    # Bounds are checked in ascending order, so the first match is the band.
    for upper_bound, label in bands:
        if y <= upper_bound:
            return label
    return 'F(weight>25000)'

# Replace the numeric weight with its band label (see convertWeight).
df_train['Total Weight'] = df_train['Total Weight'].apply(convertWeight)
df_train.head()

In [None]:
# Check if the Unit Number contains non-unique entries:
# compare this count of distinct values with the number of rows.
df_train['Unit Number'].nunique()

In [None]:
# Flag units that appear more than once in the data, i.e. units that passed
# through Trieste on more than one occasion.
# NOTE(review): every occurrence of such a unit is flagged, including its
# first one -- confirm this is the intended meaning of "already passed".
unit_counts = df_train['Unit Number'].value_counts()
# Work on the counts Series directly so the code does not depend on the
# version-specific column name a DataFrame wrapper would get.
non_unique = unit_counts[unit_counts > 1].index.tolist()

# Vectorised membership test; float dtype matches what the previous
# row-by-row assignment produced (1.0 / 0.0).
df_train['AlreadyPassed'] = df_train['Unit Number'].isin(non_unique).astype(float)

# The raw identifier is not used as a feature.
df_train = df_train.drop(['Unit Number'], axis=1)

df_train.head()

In [None]:
# Set the target column aside so it is not label-encoded with the features.
df = df_train['Time in/out (hours)']
df_train = df_train.drop(columns = ['Time in/out (hours)'])

from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
# Encode the data: every remaining column gets its own LabelEncoder, created
# on first access and kept in encoder_dict under the column name.
encoder_dict = defaultdict(LabelEncoder)
df_train = df_train.apply(lambda x: encoder_dict[x.name].fit_transform(x))
df_train.head()

In [None]:
# Re-attach the target as the last column under the name 'Dwell Time'.
# The position is taken from the frame itself: a hard-coded index raises as
# soon as the number of feature columns drops below it.
df_train.insert(len(df_train.columns), 'Dwell Time', df)
df_train.head()

In [None]:
# Pairwise correlation between all (label-encoded) columns and the target.
df_train.corr()

In [None]:
# Heatmap of the correlation matrix with every row/column label shown.
fig, ax = plt.subplots(figsize=(12, 8))
correlations = df_train.corr()
ax = sns.heatmap(correlations, xticklabels=True, yticklabels=True, ax=ax)

In [None]:
# Rank the features by the importance a random forest assigns to them and
# show the top 40 as a horizontal bar chart.
plt.figure(figsize=(10,10))

name = "Random Forest"

features = df_train.drop(['Dwell Time'], axis=1)

# Seeded (same seed as the other models in this notebook) so the ranking is
# reproducible from run to run.
# NOTE(review): newer scikit-learn releases spell this criterion
# 'absolute_error' -- confirm against the installed version.
rf = RandomForestRegressor(criterion='mae', random_state=42)
rf.fit(features, df_train['Dwell Time'])

# Positions of the (at most) 40 most important features, most important first.
indices = np.argsort(rf.feature_importances_)[::-1][:40]
g = sns.barplot(y=features.columns[indices], x=rf.feature_importances_[indices], orient='h')
g.set_xlabel("Relative importance",fontsize=12)
g.set_ylabel("Features",fontsize=12)

g.tick_params(labelsize=9)
g.set_title("Feature importance")

In [None]:
def myPCA(df, clusters=None):
    """Run a PCA on the standardised columns of ``df`` and plot the result.

    Two figures are shown: a scree plot of the explained-variance ratios and
    the circle of correlations for the first two components.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to decompose. Every column is centred on its mean and divided by
        its standard deviation, so all columns must be numeric.
    clusters : optional
        Not used; kept so existing calls stay valid.

    Returns
    -------
    pc_infos : pandas.DataFrame
        Transposed ``pca.components_``: one row per column of ``df`` and one
        column per component, named ``PC-0``, ``PC-1``, ...
    ebouli : pandas.Series
        Explained-variance ratio of each component.

    Side effects
    ------------
    Stores the projected data in the module-level variable ``pca_res``.
    """
    global pca_res

    # Normalize data. A constant column has a standard deviation of 0; dividing
    # by it would fill the column with NaN and make the PCA fail, so such
    # columns are divided by 1 instead (they end up as all zeros).
    scale = df.std().replace(0, 1)
    df_norm = (df - df.mean()) / scale

    # PCA with as many components as there are columns.
    pca = PCA(n_components=len(df.columns))
    pca_res = pca.fit_transform(df_norm.values)
    print("Variance explained : " + str(pca.explained_variance_ratio_.sum()))

    # Scree plot of the explained-variance ratios.
    plt.figure(figsize=(12,6))
    ebouli = pd.Series(pca.explained_variance_ratio_)
    ebouli.plot(kind='line', title="Scree Plot of the Eigen Values")
    plt.show()

    # Circle of correlations
    coef = np.transpose(pca.components_)
    cols = ['PC-'+str(x) for x in range(len(ebouli))]
    pc_infos = pd.DataFrame(coef, columns=cols, index=df_norm.columns)
    plt.figure(figsize=(12,12))
    circleOfCorrelations(pc_infos, ebouli)
    plt.show()

    return pc_infos, ebouli

def circleOfCorrelations(pc_infos, ebouli):
    """Draw the circle of correlations for the first two components.

    Each row of ``pc_infos`` is drawn on the current matplotlib figure as a
    segment from the origin to its (``PC-0``, ``PC-1``) coordinates, marked
    with a red cross and annotated with the row label. Two reference circles
    of radius 1 and 0.5 are added and both axes are limited to [-1, 1].

    Parameters
    ----------
    pc_infos : pandas.DataFrame
        Must contain the columns ``PC-0`` and ``PC-1``; the index supplies the
        annotation text.
    ebouli : sequence of float
        ``ebouli[0]`` and ``ebouli[1]`` are the explained-variance ratios shown
        (as percentages) in the axis labels.
    """
    axes = plt.gcf().gca()
    for radius in (1, 0.5):
        axes.add_artist(plt.Circle((0, 0), radius=radius, color='b', fill=False))

    # Iterate over the raw values: the frame is indexed by feature name, so
    # integer lookups with [] would rely on deprecated positional fallback.
    coordinates = zip(pc_infos.index, pc_infos["PC-0"].values, pc_infos["PC-1"].values)
    for label, x, y in coordinates:
        plt.plot([0.0, x], [0.0, y], 'k-')
        plt.plot(x, y, 'rx')
        plt.annotate(label, xy=(x - 0.05, y - 0.05))

    # Format the ratios numerically; slicing and stripping their string form
    # mislabels values such as 0.5 ("5%") or 1.0.
    plt.xlabel("PC-0 (%.1f%%)" % (100 * ebouli[0]))
    plt.ylabel("PC-1 (%.1f%%)" % (100 * ebouli[1]))
    plt.xlim((-1,1))
    plt.ylim((-1,1))
    plt.title("Circle of Correlations")

In [None]:
# PCA over the whole encoded frame; note that df_train still contains the
# 'Dwell Time' target column at this point.
pc_infos, ebouli = myPCA(df_train)

In [None]:
# Split the data into the feature matrix X and the target y ('Dwell Time').
X = df_train.drop(['Dwell Time'], axis=1)
y = df_train['Dwell Time']

In [None]:
# Label Encoded: 90/10 train/test split of the label-encoded features.
X_train_label, X_test_label, y_train_label, y_test_label = train_test_split(X, y, test_size=0.10, random_state=42)

# OneHotEncoded: the encoder is fitted on all rows of X before splitting.
enc = OneHotEncoder(handle_unknown='ignore')
X_oh = enc.fit_transform(X)

# Same test_size and random_state as the label-encoded split above.
X_train_oh, X_test_oh, y_train_oh, y_test_oh = train_test_split(X_oh, y, test_size=0.10, random_state=42)

In [None]:
from sklearn.metrics import mean_absolute_error

In [None]:
# k-nearest-neighbours regressor (k=3) on the one-hot features; the cell
# displays the mean absolute error on the test split.
neigh = KNeighborsRegressor(n_neighbors=3)
neigh.fit(X_train_oh, y_train_oh) 
y_pred_oh = neigh.predict(X_test_oh)
mean_absolute_error(y_test_oh, y_pred_oh)

In [None]:
# Label Encoded Random Forest: fit on the label-encoded training split and
# display the mean absolute error on the matching test split.
rf = RandomForestRegressor(n_estimators=300, max_depth = 20, random_state = 42, n_jobs=-1)
rf.fit(X_train_label, y_train_label)

y_pred_label = rf.predict(X_test_label)

mean_absolute_error(y_test_label, y_pred_label)

In [None]:
# One Hot Encoded Random Forest: same hyper-parameters as the label-encoded
# forest, trained on the one-hot features. Overwrites `rf` and `y_pred_oh`.
rf = RandomForestRegressor(n_estimators=300, max_depth = 20, random_state = 42, n_jobs=-1)
rf.fit(X_train_oh, y_train_oh)
y_pred_oh = rf.predict(X_test_oh)
mean_absolute_error(y_test_oh, y_pred_oh)

In [None]:
# Gradient Boosting (XGBoost) on the one-hot features.
# y_pred_oh now holds the XGBoost predictions; the "XGBoost Prediction" cell
# further down reads this variable, so it must not be overwritten in between.
reg_gbm = xgb.XGBRegressor(learning_rate = 0.05, n_estimators=500, max_depth=20)
reg_gbm.fit(X_train_oh, y_train_oh)
y_pred_oh = reg_gbm.predict(X_test_oh)
mean_absolute_error(y_test_oh, y_pred_oh)

In [None]:
# Dimension Reduction: project the densified one-hot matrix onto 100 principal
# components and display the total share of variance they explain.
pca = PCA(n_components=100)
pca_res = pca.fit_transform(X_oh.toarray())
pca.explained_variance_ratio_.sum()

In [None]:
# Running total of the explained-variance ratios, one entry per component.
pca_cum = np.cumsum(pca.explained_variance_ratio_).tolist()

plt.figure(figsize=(12,6))
plt.plot(pca_cum)
plt.title("Cumulative variance explained by the decomposition into principal components")
plt.show()

In [None]:
# Reduce the densified one-hot matrix to 60 principal components.
pca = PCA(n_components=60)
pca_res = pca.fit_transform(X_oh.toarray())

# Use the PCA projection as the feature matrix. The raw one-hot array was
# assigned here before, which left the dimension reduction unused.
X = pca_res
y = df_train['Dwell Time']

In [None]:
# 90/10 train/test split of the current X and y, with the same seed as the earlier splits.
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X, y, test_size=0.10, random_state=42)

In [None]:
# Random forest with the same hyper-parameters as before, fitted on the *_pca training split.
rf = RandomForestRegressor(n_estimators=300, max_depth = 20, random_state = 42, n_jobs=-1)
rf.fit(X_train_pca, y_train_pca)

In [None]:
# Mean absolute error of that forest on the *_pca test split.
y_pred_pca = rf.predict(X_test_pca)
mean_absolute_error(y_test_pca, y_pred_pca)

In [None]:
# (rows, one-hot features) of the training split; the second value is the
# input width the neural network below has to accept.
X_train_oh.shape

In [None]:
# Feed-forward regression network: four identical hidden groups
# (Dense -> Dropout -> BatchNormalization) followed by a single output unit.
HIDDEN_UNITS = 100
HIDDEN_GROUPS = 4

model = Sequential()

# Width of the first layer's input is read from the data instead of being
# hard-coded, so the model still builds when the number of one-hot columns
# changes.
layer_input_dim = X_train_oh.shape[1]
for _ in range(HIDDEN_GROUPS):
    model.add(Dense(HIDDEN_UNITS, input_dim=layer_input_dim, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.20))
    model.add(BatchNormalization())
    layer_input_dim = HIDDEN_UNITS

# NOTE(review): the output unit uses a ReLU activation, so predictions can
# never be negative -- confirm this is intended for the dwell-time target.
model.add(Dense(1, input_dim=layer_input_dim, kernel_initializer='normal', activation='relu'))

# NOTE(review): training minimises the mean absolute *percentage* error while
# the evaluation cells report the mean absolute error -- confirm the mismatch
# is deliberate.
model.compile(loss='mean_absolute_percentage_error', optimizer='adam')

In [None]:
# Train for 50 epochs on the densified one-hot features. The test split is
# passed as validation data, so the reported val_loss is computed on it.
model.fit(X_train_oh.toarray(), y_train_oh, epochs=50, batch_size=32, validation_data = (X_test_oh.toarray(),y_test_oh),verbose=1)

In [None]:
# serialize model to JSON (architecture only) in the working directory
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5 (stored separately from the architecture)
model.save_weights("model.h5")
print("Saved model to disk")

In [None]:
# Load the architecture from JSON and create the model. The `with` block
# closes the file even if reading fails.
with open('model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)

# Load the trained weights into the new model.
loaded_model.load_weights("model.h5")
print("Loaded model from disk")

In [None]:
# Predict with the reloaded network. The sparse test matrix is densified
# first, matching the input format the model was trained and validated on.
prediction = loaded_model.predict(X_test_oh.toarray())
# prediction has one column per output unit; flatten it to compare with y.
mean_absolute_error(y_test_oh, prediction.flatten())

In [None]:
# ANN Prediction: actual dwell time next to the rounded network prediction.
df_new = y_test_oh.to_frame(name='Dwell Time').reset_index(drop=True)
data = pd.DataFrame(np.round(prediction))
# Rounded predictions are stored as integers in a second column.
df_new.insert(1, 'Predicted', data.iloc[:, 0].astype(int))
df_new

In [None]:
# XGBoost Prediction: actual dwell time next to the rounded prediction held in
# y_pred_oh.
df_xg = y_test_oh.to_frame(name='Dwell Time').reset_index(drop=True)
data = pd.DataFrame(np.round(y_pred_oh))
# Rounded predictions are stored as integers in a second column.
df_xg.insert(1, 'Predicted', data.iloc[:, 0].astype(int))
df_xg

In [None]:
def convert(x):
    """Return the label of the 24-hour bucket that ``x`` hours falls into.

    Upper bounds are inclusive: values up to 24 are 'within 24 Hours', up to
    48 the next bucket, and so on; anything that matches no bound (values
    above 96, or NaN) is 'More than 96 Hours'.
    """
    buckets = (
        (24, 'within 24 Hours'),
        (48, 'Between 24 and 48 Hours'),
        (72, 'Between 48 and 72 Hours'),
        (96, 'Between 72 and 96 Hours'),
    )
    # Bounds are checked in ascending order, so the first match is the bucket.
    for upper_bound, label in buckets:
        if x <= upper_bound:
            return label
    return 'More than 96 Hours'
    
# Replace actual and predicted hours with their bucket labels in both result tables.
for results in (df_new, df_xg):
    for hours_col in ('Dwell Time', 'Predicted'):
        results[hours_col] = results[hours_col].apply(convert)

In [None]:
# Write the ANN result table to an Excel file, without the row index.
df_new.to_excel('Ann Prediction.xlsx',index = False)

In [None]:
# Write the XGBoost result table to an Excel file, without the row index.
df_xg.to_excel('XGBoost Prediction.xlsx',index = False)