In [10]:
# Basic
import numpy as np
import pandas as pd
import time
import csv
from openpyxl import load_workbook

# sklearn
from sklearn import datasets
from sklearn import cluster
from sklearn.model_selection import train_test_split

# imbalanced learning
from imblearn.pipeline import Pipeline
from imblearn import under_sampling, over_sampling

# Import raw data and pre-process

In [11]:
# Load the raw trip records; the first row of the CSV holds the column names.
lpmc = pd.read_csv("20200721 lpmc.csv", sep=',', header=0)


# Derived variables
# Public-transport in-vehicle time = rail leg + bus leg.
lpmc['dur_pt_inv'] = lpmc['dur_pt_rail'].add(lpmc['dur_pt_bus'])
# Total driving cost = fuel cost + congestion charge.
lpmc['cost_driving_total'] = lpmc['cost_driving_fuel'].add(lpmc['cost_driving_ccharge'])
# Complement of the 'female' column.
lpmc['male'] = 1 - lpmc['female']
# Plain copies of existing columns under the names used further down.
lpmc['dur_pt_int_total'] = lpmc['dur_pt_int']
lpmc['pt_n_interchanges'] = lpmc['pt_interchanges']

# Full feature matrix / target vector (every survey year).
target_col = 'travel_mode'
X = lpmc.drop(columns=[target_col])
y = lpmc[target_col]

# Splitting training and testing sets
# Rule 3: survey-year based splitting with the 1st and 2nd year as the training set
# and the 3rd year as the testing set.
in_train_years = lpmc['survey_year'] <= 2
in_test_year = lpmc['survey_year'] == 3

X_train = lpmc.loc[in_train_years].drop(columns=[target_col])
y_train = lpmc.loc[in_train_years, target_col]
X_test = lpmc.loc[in_test_year].drop(columns=[target_col])
y_test = lpmc.loc[in_test_year, target_col]

# Output
# print('X_train')
# print(X_train)
# print('y_train') 
# print(y_train)    
# print('X_test')
# print(X_test)
# print('y_test') 
# print(y_test)    


# Creating new variables and extracting variables

In [12]:
# Column selection
# Identifier columns are listed first and the explanatory variables are appended to the
# same list, so `Identifiers` ends up holding every column kept in X_train / X_test.

Identifiers = ['trip_id', 'household_id', 'person_n', 'trip_n']

Variables = [
    'age', 'male', 'driving_license', 'car_ownership', 'distance',
    'dur_walking', 'dur_cycling', 'dur_pt_access', 'dur_pt_inv',
    'dur_pt_int_total', 'pt_n_interchanges', 'dur_driving',
    'cost_transit', 'cost_driving_total',
]

Identifiers += Variables  # in-place append on the same list object
# print(Identifiers)

X_train = X_train.loc[:, Identifiers]
X_test = X_test.loc[:, Identifiers]

# One boolean per column of X_train: True where the column is listed as categorical.
list_categorical_var = ['male', 'driving_license', 'car_ownership', 'pt_n_interchanges']
mask_cateogrical_var = X_train.columns.isin(list_categorical_var).tolist()
# print(mask_cateogrical_var)

# Output
# print('X_train')
# print(X_train)
# print('y_train') 
# print(y_train)    
# print('X_test')
# print(X_test)
# print('y_test') 
# print(y_test)   


# Resampling

1. Under-sampling<br/>
    1.1 Random majority under-sampling with replacement<br/>
    1.2 One-Sided Selection<br/>
    1.3 Neighbourhood Cleaning Rule <br/>
    1.4 Under-sampling with Cluster Centroids<br/>
2. Over-sampling<br/>
    2.1 Random minority over-sampling with replacement<br/>
    2.2 SMOTENC - SMOTE for Nominal and Continuous <br/>
    2.3 ADASYN - Adaptive synthetic sampling approach for imbalanced learning<br/>
3. Over-sampling followed by under-sampling<br/>
    SMOTE + Tomek links <br/>
    SMOTE + ENN <br/><br/>
4. Ensemble classifier using samplers internally
    

# Define the resampling method

In [13]:
methodrange = [1.1, 1.2, 1.3, 2.1, 2.2, 2.3]

# Workbook / sheet that collect one timing column per resampling method.
TIMING_WORKBOOK = '20210428 Resampling time.xlsx'
TIMING_SHEET = 'Processing time'
N_REPETITIONS = 10   # resampled datasets generated per method
SEED_STEP = 200      # random_state used for repetition k is k * SEED_STEP


def _build_resampler(method, seed):
    """Return ``(label, sampler)`` for one resampling method code.

    Parameters
    ----------
    method : float
        One of the codes listed in ``methodrange``.
    seed : int
        Value passed as ``random_state`` to the samplers that are given one here.

    Returns
    -------
    tuple
        ``label`` is the prefix used for the output file names and for the timing
        column; ``sampler`` is the configured imbalanced-learn sampler.

    Raises
    ------
    ValueError
        If ``method`` is not a known code, instead of silently reusing the sampler
        left over from a previous iteration.
    """
    if method == 1.1:  # Random majority under-sampling with replacement
        return '1.1_RandomUnderSampler', under_sampling.RandomUnderSampler(random_state=seed)
    if method == 1.2:  # One-Sided Selection
        return '1.2_One-SidedSelection', under_sampling.OneSidedSelection(
            random_state=seed, n_neighbors=3, n_seeds_S=1000, n_jobs=-1)
    if method == 1.3:  # Neighbourhood Cleaning Rule
        # No random_state is passed to this sampler, so `seed` has no effect on it here.
        # NOTE(review): the ten repetitions may therefore be identical - confirm.
        return '1.3_NeighbourhoodCleaningRule', under_sampling.NeighbourhoodCleaningRule(n_jobs=-1)
    if method == 2.1:  # Random minority over-sampling with replacement
        return '2.1_RandomOverSampler', over_sampling.RandomOverSampler(random_state=seed)
    if method == 2.2:  # SMOTENC - SMOTE for Nominal and Continuous
        return '2.2_SMOTENC', over_sampling.SMOTENC(
            random_state=seed, categorical_features=mask_cateogrical_var, n_jobs=-1)
    if method == 2.3:  # ADASYN - Adaptive synthetic sampling approach for imbalanced learning
        return '2.3_ADASYN', over_sampling.ADASYN(random_state=seed, n_jobs=-1)
    raise ValueError("Unknown resampling method: " + str(method))


# Iteration for resampling method.
# enumerate() keeps the loop in step with `methodrange`; the position `i` also selects
# the spreadsheet column that receives this method's timings.
for i, resampling_method in enumerate(methodrange):
    print("Resampling_method:" + str(resampling_method))
    random_seed_resample = 0
    proc_time_list = []

    # Iteration for generating N_REPETITIONS files for each resampling method
    for j in range(N_REPETITIONS):
        random_seed_resample = random_seed_resample + SEED_STEP
        print("Random_state:" + str(random_seed_resample))

        resmplnm, resmpl = _build_resampler(resampling_method, random_seed_resample)

        print(resmplnm + " starts.")
        start_time = time.time()

        X_train_resampled, y_train_resampled = resmpl.fit_resample(X_train, y_train)
        # Map the numeric mode codes to readable labels. This stays inside the timed
        # section, as before, so new timings remain comparable with earlier runs.
        y_train_resampled = y_train_resampled.replace([1, 2, 3, 4], ['walk', 'cycle', 'pt', 'drive'])

        end_time = time.time()
        proc_time = end_time - start_time
        proc_time_list.append(proc_time)

        # Write the resampled training features and labels into csv files
        bookname1 = resmplnm + '_' + str(j+1) + '_X_Train.csv'
        X_train_resampled.to_csv(bookname1, index=False)
        bookname2 = resmplnm + '_' + str(j+1) + '_Y_Train.csv'
        y_train_resampled.to_csv(bookname2, index=False)

        print("Dataset written.")

    proc_time_df = pd.DataFrame(proc_time_list, columns=[resmplnm])
    print("Processing time:")
    print(proc_time_df)

    # Write processing time into the Excel spreadsheet.
    # Append mode with if_sheet_exists='overlay' writes into the existing sheet without
    # clearing it, so each method fills its own column. This replaces assigning
    # writer.book / writer.sheets and calling writer.save(), which current pandas no
    # longer supports. The `with` block saves and closes the file even if to_excel
    # raises. The workbook must already exist; 'overlay' needs pandas >= 1.4.
    with pd.ExcelWriter(TIMING_WORKBOOK, engine='openpyxl', mode='a',
                        if_sheet_exists='overlay') as writer:
        proc_time_df.to_excel(writer, sheet_name=TIMING_SHEET, startcol=1 + i, index=False)

    print("Writing to Excel Finished.")

Resampling_method:1.1
Random_state:200
1.1_RandomUnderSampler starts.
Dataset written.
Random_state:400
1.1_RandomUnderSampler starts.
Dataset written.
Random_state:600
1.1_RandomUnderSampler starts.
Dataset written.
Random_state:800
1.1_RandomUnderSampler starts.
Dataset written.
Random_state:1000
1.1_RandomUnderSampler starts.
Dataset written.
Random_state:1200
1.1_RandomUnderSampler starts.
Dataset written.
Random_state:1400
1.1_RandomUnderSampler starts.
Dataset written.
Random_state:1600
1.1_RandomUnderSampler starts.
Dataset written.
Random_state:1800
1.1_RandomUnderSampler starts.
Dataset written.
Random_state:2000
1.1_RandomUnderSampler starts.
Dataset written.
Processing time:
   1.1_RandomUnderSampler
0                0.030881
1                0.021941
2                0.028925
3                0.019947
4                0.024937
5                0.020914
6                0.026021
7                0.022979
8                0.025251
9                0.019162
Writing to Excel Fi

In [14]:
# Replace the numeric travel-mode codes in both target vectors with text labels.
mode_codes = [1, 2, 3, 4]
mode_labels = ['walk', 'cycle', 'pt', 'drive']
y_train = y_train.replace(mode_codes, mode_labels)
y_test = y_test.replace(mode_codes, mode_labels)

# Save the un-resampled training and testing sets, one CSV per object, without the index.
bookname1, bookname2 = 'Original_X_Train.csv', 'Original_Y_Train.csv'
bookname3, bookname4 = 'Original_X_Test.csv', 'Original_Y_Test.csv'
for dataset, target_file in zip((X_train, y_train, X_test, y_test),
                                (bookname1, bookname2, bookname3, bookname4)):
    dataset.to_csv(target_file, index=False)

# Testing...

In [None]:
# Experimental cell: runs ONE resampling method for ten seeds, writes the resampled
# training sets to CSV and stores the processing times in the timing workbook.
resampling_method = 1.4
random_seed_resample = 0
# Start from an empty list. Without this the cell appended to whatever
# `proc_time_list` a previously executed cell left behind (mixing timings of a
# different sampler into the exported column) or failed on a fresh kernel.
proc_time_list = []

for j in range(10):
    random_seed_resample = random_seed_resample + 200
    print("Random_state:" + str(random_seed_resample))

    if resampling_method == 1.2:  # Condensed Nearest Neighbour
        resmplnm = '1.2 CondensedNearestNeighbour'
        resmpl = under_sampling.CondensedNearestNeighbour(random_state=random_seed_resample, n_neighbors=3, n_seeds_S=20000, n_jobs=-1)
    elif resampling_method == 1.3:  # Instance Hardness Threshold
        resmplnm = '1.3 InstanceHardnessThreshold'
        resmpl = under_sampling.InstanceHardnessThreshold(random_state=random_seed_resample, n_jobs=-1)
    elif resampling_method == 2.3:  # ADASYN - Adaptive synthetic sampling approach for imbalanced learning
        resmplnm = '2.3_ADASYN'
        resmpl = over_sampling.ADASYN(random_state=random_seed_resample, n_jobs=-1)
    elif resampling_method == 1.4:  # Under-sampling with Cluster Centroids
        resmplnm = '1.4_ClusterCentroids'
        # ClusterCentroids needs a KMeans-style estimator (an `n_clusters` parameter and
        # a fitted `cluster_centers_` attribute). DBSCAN offers neither, so the previous
        # estimator=cluster.DBSCAN(...) could not be used; the default estimator is
        # used instead.
        resmpl = under_sampling.ClusterCentroids(random_state=random_seed_resample)
    else:
        # Fail loudly instead of reusing a sampler left over from an earlier run.
        raise ValueError("Unknown resampling method: " + str(resampling_method))

    print(resmplnm + " starts.")
    start_time = time.time()

    X_train_resampled, y_train_resampled = resmpl.fit_resample(X_train, y_train)
    # Map numeric mode codes to text labels; a no-op if y_train already holds labels.
    y_train_resampled = y_train_resampled.replace([1, 2, 3, 4], ['walk', 'cycle', 'pt', 'drive'])

    end_time = time.time()
    proc_time = end_time - start_time
    proc_time_list.append(proc_time)

    # Write the resampled training features and labels into csv files
    bookname1 = resmplnm + '_X_Train_' + str(j+1) + '.csv'
    X_train_resampled.to_csv(bookname1, index=False)
    bookname2 = resmplnm + '_Y_Train_' + str(j+1) + '.csv'
    y_train_resampled.to_csv(bookname2, index=False)

    print("Dataset written.")

proc_time_df = pd.DataFrame(proc_time_list, columns=[resmplnm])
print("Processing time:")
print(proc_time_df)

# Write processing time into the Excel spreadsheet.
# Append mode with if_sheet_exists='overlay' writes into the existing sheet without
# clearing it. This replaces assigning writer.book / writer.sheets and calling
# writer.save(), which current pandas no longer supports. The `with` block saves and
# closes the file even if to_excel raises. The workbook must already exist;
# 'overlay' needs pandas >= 1.4.
with pd.ExcelWriter('20210428 Resampling time.xlsx', engine='openpyxl', mode='a',
                    if_sheet_exists='overlay') as writer:
    proc_time_df.to_excel(writer, sheet_name="Processing time", startcol=8, index=False)

print("Writing to Excel Finished.")