In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import ADASYN
from sklearn.svm import SVC
from sklearn import preprocessing
import copy 
from imblearn.over_sampling import SMOTE 
from datetime import datetime
import datetime
from sklearn import metrics
from pathlib import Path
import seaborn as sns

In [None]:
# Folder that holds the competition CSV files.
path = Path('DataSet')

# Read both splits; low_memory=False makes pandas read each file in one pass.
df_train = pd.read_csv(path.joinpath('Train.csv'), low_memory=False)
df_test = pd.read_csv(path.joinpath('Test.csv'), low_memory=False)

# Keep the test ids aside: the column is dropped from the feature frame later on.
test_id_code = df_test['id_code']

print(df_train.shape)
print(df_test.shape)
df_train.head()

In [None]:
# Ordinal encoding of the target label.
Target_Volume = {"high": 2, "medium": 1, "low": 0}
# Reverse lookup (code -> label), used to decode predictions.
Predict_Volume = {code: label for label, code in Target_Volume.items()}
# Maps the stringified weekend flag to 0/1.
is_weekend = {"False": 0, "True": 1}

def ChangeTime(x):
    """Convert a 12-hour clock string into seconds since midnight.

    Parameters
    ----------
    x : str
        Time formatted as ``"%I:%M:%S %p"`` (e.g. ``"01:02:03 PM"``), or the
        empty string.

    Returns
    -------
    int or str
        Seconds elapsed since midnight, or ``""`` when ``x`` is ``""``.
    """
    if x == "":
        return ""
    parsed = datetime.datetime.strptime(x, "%I:%M:%S %p")
    return int(parsed.hour * 60 * 60 + parsed.minute * 60 + parsed.second)

def ConvertStations(x):
    """Turn a station label such as ``"station$12"`` into its integer id.

    Every occurrence of the literal ``"station$"`` is stripped and the
    remainder is parsed with ``int``. The empty string is returned unchanged.
    """
    if x == "":
        return ""
    station_number = x.replace("station$", "")
    return int(station_number)

def haversine_array(lat1, lng1, lat2, lng2):
    """Great-circle (haversine) distance in kilometres between two points.

    Inputs are latitudes/longitudes in degrees and may be scalars or NumPy
    arrays; arithmetic is element-wise, so arrays yield an array of distances.
    """
    AVG_EARTH_RADIUS = 6371  # in km
    phi1, lam1, phi2, lam2 = (np.radians(angle) for angle in (lat1, lng1, lat2, lng2))
    delta_phi = phi2 - phi1
    delta_lam = lam2 - lam1
    # Haversine term: sin^2(dphi/2) + cos(phi1) * cos(phi2) * sin^2(dlam/2)
    hav = np.sin(delta_phi * 0.5) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lam * 0.5) ** 2
    return 2 * AVG_EARTH_RADIUS * np.arcsin(np.sqrt(hav))

def dummy_station_distance(lat1, lng1, lat2, lng2):
    """Manhattan-style distance: sum of two axis-aligned haversine legs.

    The first leg keeps the latitude at ``lat1`` and moves from ``lng1`` to
    ``lng2``; the second keeps the longitude at ``lng1`` and moves from
    ``lat1`` to ``lat2``. Both legs start at ``(lat1, lng1)``.
    """
    longitude_leg = haversine_array(lat1, lng1, lat1, lng2)
    latitude_leg = haversine_array(lat1, lng1, lat2, lng1)
    return longitude_leg + latitude_leg

def Prepare(dataset):
    """Encode the raw columns and return the rows ordered by date and time.

    The conversions are written back into ``dataset`` (the caller's frame is
    modified); the returned frame is the sorted result of ``sort_values``.

    - ``current_time``: clock string -> seconds since midnight (``ChangeTime``)
    - ``source_name`` / ``destination_name``: station label -> int
      (``ConvertStations``)
    - ``is_weekend``: looked up in the module-level ``is_weekend`` mapping via
      ``str(value)``; an unmapped value raises ``KeyError``.
    """
    dataset["current_time"] = dataset["current_time"].apply(ChangeTime)
    for station_column in ("source_name", "destination_name"):
        dataset[station_column] = dataset[station_column].apply(ConvertStations)
    dataset["is_weekend"] = dataset["is_weekend"].apply(lambda flag: is_weekend[str(flag)])
    return dataset.sort_values(by=['current_date', 'current_time'])

In [None]:
# Encode the raw columns and sort both splits by date/time.
df_train = Prepare(df_train)
df_test = Prepare(df_test)

# Prepare() returns the rows in a new (sorted) order, so the ids captured at
# load time no longer line up row-for-row with df_test. Re-capture them here,
# while the id column is still present, so that predictions made on df_test
# are paired with the right id when the submission is built.
test_id_code = df_test["id_code"]

# Encode the label as an ordinal integer (low=0, medium=1, high=2).
df_train["target"] = df_train["target"].apply(lambda x: Target_Volume[x])

In [None]:
def calculate_empty(data):
    """Summarise missing values per column.

    Returns a frame indexed by column name with two columns: ``Total`` (number
    of nulls) and ``Percent`` (nulls divided by row count, i.e. a fraction in
    [0, 1]), each sorted in descending order before being combined.
    """
    null_mask = data.isnull()
    null_counts = null_mask.sum()
    total = null_counts.sort_values(ascending=False)
    percent = (null_counts / null_mask.count()).sort_values(ascending=False)
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

In [None]:
# Show the five columns with the most missing values in the training split.
calculate_empty(df_train).head()

<h3>A. Haversine Distance Between the Two Lat/Lons:</h3>

In [None]:
def fill_all_empty(dataset):
    """Fill missing values and add station/geographic features.

    Modifies ``dataset`` and returns it.

    - Coordinates and mean halt times: nulls become 0.
    - Country codes: nulls become the string ``"None"``.
    - ``station_diff``: absolute difference of the source/destination ids.
    - ``center_latitude`` / ``center_longitude``: midpoint of the two ends.
    - ``dummy_station_distance``: Manhattan-style haversine distance between
      source and destination.
    """
    zero_filled_columns = (
        "longitude_destination",
        "latitude_destination",
        "longitude_source",
        "latitude_source",
        "mean_halt_times_destination",
        "mean_halt_times_source",
    )
    for column in zero_filled_columns:
        dataset[column] = dataset[column].fillna(0)

    for column in ("country_code_source", "country_code_destination"):
        dataset[column] = dataset[column].fillna("None")

    dataset["station_diff"] = np.abs(dataset["source_name"] - dataset["destination_name"])

    # Pull the (already filled) coordinates out once as plain arrays.
    src_lat = dataset['latitude_source'].values
    src_lng = dataset['longitude_source'].values
    dst_lat = dataset['latitude_destination'].values
    dst_lng = dataset['longitude_destination'].values

    dataset.loc[:, 'center_latitude'] = (src_lat + dst_lat) / 2
    dataset.loc[:, 'center_longitude'] = (src_lng + dst_lng) / 2

    # B. Manhattan Distance Between the two Lat/Lons:
    dataset.loc[:, 'dummy_station_distance'] = dummy_station_distance(src_lat, src_lng, dst_lat, dst_lng)

    return dataset


# Apply the same imputation and distance features to both splits.
df_train = fill_all_empty(df_train)
df_test = fill_all_empty(df_test)

In [None]:
# Sanity check: row/column counts after imputation and the new features.
print("Train X", df_train.shape)
print("Test X", df_test.shape)

In [None]:
def create_time_difference(dataset):
    """Add ``delay_time``: the gap to the previous row of the same date.

    For every ``current_date`` the rows are taken in their existing order and
    ``delay_time`` is set to the difference between a row's ``current_time``
    and that of the preceding row on the same date. The first row of each
    date (which has no predecessor) gets 0.

    The column is written into ``dataset`` and the same frame is returned.

    Implementation note: a single grouped ``diff`` replaces the previous
    per-date boolean-mask loop. That loop re-scanned the whole frame once per
    distinct date and wrote float/NaN values into a column that had been
    initialised with the integer 0, which recent pandas versions flag as an
    incompatible-dtype assignment.
    """
    # sort=False: group order is irrelevant, only the within-group row order matters.
    gap_within_date = dataset.groupby("current_date", sort=False)["current_time"].diff()
    dataset["delay_time"] = gap_within_date.fillna(0)
    return dataset

# Add the per-date delay feature to both splits.
df_train = create_time_difference(df_train)
df_test = create_time_difference(df_test)

In [None]:
# Shapes after adding delay_time, followed by a preview of the training rows.
print("Train X", df_train.shape)
print("Test X", df_test.shape)
df_train.head()

In [None]:
# Preview of the training frame before normalisation.
df_train.head()

<h3>Normalization</h3>

In [None]:
def normalize_data(dataset):
    """Standardise the numeric feature columns and add ``halt_times_diff``.

    Each listed column is replaced by the output of a freshly constructed
    ``StandardScaler().fit_transform`` on that column alone, so every call
    (and therefore every split passed in) is scaled with its own statistics.
    ``halt_times_diff`` is then computed from the *scaled* halt-time columns
    as the absolute difference between source and destination.

    Modifies ``dataset`` and returns it.

    Fix: ``center_latitude`` is now scaled from its own values; it was
    previously overwritten with the scaled ``delay_time`` column.
    """
    columns_to_scale = (
        "current_time",
        "source_name",
        "destination_name",
        "mean_halt_times_destination",
        "mean_halt_times_source",
        "station_diff",
        "dummy_station_distance",
        "delay_time",
        "center_latitude",
    )
    for column in columns_to_scale:
        # The scaler expects a 2-D (n_samples, 1) array; flatten the result
        # back to 1-D before storing it as a column.
        column_values = np.array(dataset[column]).reshape(-1, 1)
        dataset[column] = np.asarray(StandardScaler().fit_transform(column_values)).ravel()

    dataset["halt_times_diff"] = np.abs(dataset["mean_halt_times_source"] - dataset["mean_halt_times_destination"])
    return dataset

# Standardise both splits. Each call fits its own scalers, so train and test
# are scaled with their own statistics.
df_train = normalize_data(df_train)
df_test = normalize_data(df_test)

In [None]:
#df_train.groupby(["current_date"])["train_name"].nunique()
def other_feature_engineering(dataset):
    """Attach ``total_train_count``: standardised number of rows per date.

    The non-null ``train_name`` entries are counted per ``current_date``, the
    per-date counts are standardised with a fresh ``StandardScaler``, and the
    result is left-joined back onto ``dataset`` by ``current_date``. A new
    frame (the merge result) is returned.
    """
    per_date = (
        dataset.groupby(["current_date"])["train_name"]
        .count()
        .rename("total_train_count")
        .reset_index()
    )
    raw_counts = np.array(per_date["total_train_count"]).reshape(-1, 1)
    per_date["total_train_count"] = StandardScaler().fit_transform(raw_counts)
    return dataset.merge(per_date, on='current_date', how='left')

# Add the per-date train-count feature to both splits.
df_train = other_feature_engineering(df_train)
df_test = other_feature_engineering(df_test)

In [None]:
#df_train.to_csv("tempsave.csv")

In [None]:
#df_train["current_time"]

In [None]:
# Summary statistics of the numeric training columns after scaling.
df_train.describe()

In [None]:
# Distribution of the standardised time-of-day feature, with its skewness and kurtosis.
# NOTE(review): seaborn documents `distplot` as deprecated in favour of
# `histplot` / `displot` — confirm against the installed seaborn version.
sns.distplot(df_train['current_time']);
print("Skewness: %f" % df_train['current_time'].skew())
print("Kurtosis: %f" % df_train['current_time'].kurt())

In [None]:
# Class balance of the encoded target (0=low, 1=medium, 2=high).
plt.figure(figsize=(6, 6))
sns.countplot(x="target", data=df_train)

In [None]:
def remove_unwanted_features(dataset):
    """Return a copy of ``dataset`` without identifier, calendar and raw
    coordinate columns.

    Raises ``KeyError`` if any of the listed columns is absent. The input
    frame itself is left untouched.
    """
    unwanted_columns = [
        "id_code", "current_date",
        "current_year", "current_week",
        "longitude_source", "latitude_source",
        "longitude_destination", "latitude_destination",
    ]
    return dataset.drop(columns=unwanted_columns)

# Drop the identifier, calendar and raw coordinate columns from both splits.
# Not re-runnable: a second execution raises KeyError because the columns are gone.
df_train = remove_unwanted_features(df_train)
df_test = remove_unwanted_features(df_test)

<h3>Heat map</h3>

In [None]:
# Pairwise correlation of the numeric/boolean training columns. The frame still
# holds string columns at this point, which `corr()` cannot process on current
# pandas versions, so restrict the input explicitly.
corr = df_train.select_dtypes(include=["number", "bool"]).corr()

# Hide the upper triangle (and diagonal): the matrix is symmetric.
# The builtin `bool` replaces the `np.bool` alias, which no longer exists in
# current NumPy releases.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0, square=True,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

In [None]:
# Shapes of both splits before one-hot encoding.
print("Train X", df_train.shape)
print("Test X", df_test.shape)

In [None]:
# One-hot encode train and test together so both end up with the same dummy
# columns, then cut the combined frame back into its two parts by position.
mergedata = pd.concat([df_train, df_test], sort=False)
mergedata_pandas = pd.get_dummies(mergedata)

# The first len(df_train) rows are the training rows; the length is unchanged
# by the reassignment, so the second slice starts at the same boundary.
df_train = mergedata_pandas.iloc[:len(df_train)]
df_test = mergedata_pandas.iloc[len(df_train):]

y = df_train["target"]  # Target variable
X = df_train.drop(columns=["target"])
df_test = df_test.drop(columns=["target"])

In [None]:
#sns.pairplot(df_train)
# Feature-matrix shapes after one-hot encoding; the column counts should match.
print("Train X", X.shape)
print("Test X", df_test.shape)

In [None]:
# Hold out 25% of the labelled rows for validation; the fixed seed keeps the
# split repeatable. `train_test_split` comes from the notebook's import cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=30
)

<h3>Over Sampling</h3>

In [None]:
# Oversample the training part only (the hold-out split stays untouched).
# Not re-runnable: executing this cell twice resamples already-resampled data.
sm = SMOTE(random_state = 42) 
X_train, y_train = sm.fit_resample(X_train, y_train) 

#X_train, y_train = ADASYN().fit_resample(X, y)

In [None]:
#sns.pairplot(df_train)
# Training-matrix shape after oversampling.
print("Train SMOTE X", X_train.shape)

In [None]:
# import the class
# import the class
from sklearn.linear_model import LogisticRegression

# Instantiate the model with non-default settings: liblinear solver, a higher
# iteration cap and balanced class weights.
# `multi_class='auto'` is no longer passed: it is deprecated in recent
# scikit-learn releases and, per the scikit-learn documentation, 'auto'
# resolves to one-vs-rest for the liblinear solver anyway, so the fitted
# model is unchanged.
model = LogisticRegression(max_iter=400, solver='liblinear', class_weight='balanced')

# fit the model with data
model.fit(X_train, y_train)

In [None]:
# Predict class codes for the hold-out split with the logistic-regression model.
# NOTE(review): np.abs looks redundant if the labels are the non-negative codes 0/1/2 — confirm.
Y_pred = np.abs(model.predict(X_test))
#print(Y_pred)
#Y_pred = np.asarray([np.argmax(row) for row in Y_pred])
Y_pred

In [None]:
# True hold-out labels, for visual comparison with the predictions.
y_test.values

<h3>LogisticRegression</h3>

In [None]:
# Hold-out accuracy of the predictions currently stored in Y_pred, plus the
# share of correctly ("Positive") and incorrectly ("Negative") classified rows.
print(metrics.accuracy_score(y_test.values, Y_pred))
results = (y_test.values == Y_pred).tolist()
print("Positive ", results.count(True) / len(results))
print("Negative ", results.count(False) / len(results))

In [None]:
# Decision-tree baseline trained on the oversampled data; seeded for repeatability.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train,y_train)

In [None]:
# Score the decision tree on the hold-out split.
Y_pred = clf.predict(X_test)

# Accuracy, then the share of correct ("Positive") and incorrect ("Negative") rows.
print(metrics.accuracy_score(y_test.values, Y_pred))
results = (y_test.values == Y_pred).tolist()
print("Positive ", results.count(True) / len(results))
print("Negative ", results.count(False) / len(results))

In [None]:
# Gaussian Naive Bayes baseline trained on the oversampled data.
gnb = GaussianNB()
gnb.fit(X_train,y_train)

In [None]:
# Score the Gaussian Naive Bayes model on the hold-out split.
# The predictions are stored in `Y_pred`, the name the metrics below read.
# They were previously assigned to `y_pred`, so this cell reported the scores
# of whatever model had last written `Y_pred` instead of this one.
Y_pred = gnb.predict(X_test)

print(metrics.accuracy_score(y_test.values, Y_pred))
results = [ y_test.values[i] == Y_pred[i] for i in range(len(Y_pred))]
print("Positive ",results.count(True) / len(results))
print("Negative ",results.count(False) / len(results))

In [None]:
# Final predictions come from `model` (the logistic regression), not from the
# tree or the Naive Bayes baselines.
y_pred_final = model.predict(df_test)

# NOTE(review): this pairs ids and predictions purely by position — verify that
# test_id_code is in the same row order as df_test.
submission = pd.DataFrame({'id_code': test_id_code.values, 'target': y_pred_final})

# Cast the predicted codes to integers and decode them back to their labels.
submission['target'] = (
    submission['target']
    .astype('int64', copy=False)
    .apply(lambda code: Predict_Volume[code])
)
submission.head()

In [None]:
# Write the submission file to the working directory, without the index column.
submission.to_csv("submission_v3.csv", index=False)

In [None]:
# IPython magic: clear the interactive namespace at the end of the run.
%reset