In [1]:
# Import packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the three input tables.
# Each file carries a .csv extension but is parsed as tab-separated.
delay = pd.read_csv('data/all_years.csv', sep='\t')
codes_srt = pd.read_csv('data/codes_srt.csv', sep='\t')
codes_sub = pd.read_csv('data/codes_sub.csv', sep='\t')

In [3]:
# Preview the first rows of the delay table to check that it parsed as expected
delay.head()

Unnamed: 0,Date,Time,Day,Station,Code,Min Delay,Min Gap,Bound,Line,Vehicle
0,2014-01-01,00:21,Wednesday,VICTORIA PARK STATION,MUPR1,55,60,W,BD,5111
1,2014-01-01,02:06,Wednesday,HIGH PARK STATION,SUDP,3,7,W,BD,5001
2,2014-01-01,02:40,Wednesday,SHEPPARD STATION,MUNCA,0,0,,YU,0
3,2014-01-01,03:10,Wednesday,LANSDOWNE STATION,SUDP,3,8,W,BD,5116
4,2014-01-01,03:20,Wednesday,BLOOR STATION,MUSAN,5,10,S,YU,5386


### Feature Engineering: Station


In [48]:
# Generate station Dictionary and List
# stationDict maps a line code to the station names listed for that line.
stationDict =  {'BD': ['Bloor','St George','Spadina','Kipling','Islington','Royal York','Old Mill','Jane','Runnymede','High Park','Keele','Dundas West','Lansdowne','Dufferin','Ossington','Christie','Bathurst','Bay','Sherbourne','Castle Frank','Broadview','Chester','Pape','Donlands','Greenwood','Coxwell','Woodbine','Main Street','Victoria Park','Warden'],
               'YU': ['Finch','North York Centre','York Mills','Lawrence','Eglinton','Davisville','St Clair','Summerhill','Rosedale','Wellesley','College','Dundas','Queen','King','Union','St Andrew','Osgoode','St Patrick','Queens Park','Museum','Dupont','St Clair West','Eglinton West','Glencairn','Lawrence West','Yorkdale','Wilson','Sheppard West','Downsview Park','Finch West','York University','Pioneer Village','Highway 407','Vaughan Metro Centre','Bloor','St George','Spadina','Sheppard'],
               'SHP': ['Sheppard','Bayview','Bessarion','Leslie','Don Mills'],
               'SRT': ['Kennedy','Lawrence East','Ellesmere','Midland','Scarborough Centre','McCowan']}

# Flat list of unique, lower-cased station names across all lines.
# Sorted so the order is identical on every kernel restart: iterating a set of
# strings follows hash order, which varies between Python sessions, and the
# name-matching code downstream returns the first station that matches.
stationList = sorted({name.lower()
                      for names in stationDict.values()
                      for name in names})

In [49]:
# Identify Levenshtein Distance of Variables
# https://stackabuse.com/levenshtein-distance-and-text-similarity-in-python/

def lev(string1, string2):
    """Return the Levenshtein (edit) distance between two strings.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn ``string1`` into ``string2``.

    Parameters
    ----------
    string1, string2 : str
        The strings to compare. Either may be empty.

    Returns
    -------
    numpy.float64
        The edit distance (a whole number stored as a float, because the
        working matrix is a float array). ``0`` means the strings are equal.
    """
    size_x = len(string1) + 1
    size_y = len(string2) + 1
    matrix = np.zeros((size_x, size_y))

    # Base cases: reaching / leaving the empty string costs one edit per
    # character, so the first column counts deletions and the first row
    # counts insertions.
    matrix[:, 0] = np.arange(size_x)
    matrix[0, :] = np.arange(size_y)

    for x in range(1, size_x):
        for y in range(1, size_y):
            # A matching character is carried over for free; otherwise it
            # costs one substitution.
            substitution = 0 if string1[x-1] == string2[y-1] else 1
            matrix[x, y] = min(
                matrix[x-1, y] + 1,                 # delete from string1
                matrix[x, y-1] + 1,                 # insert into string1
                matrix[x-1, y-1] + substitution)    # keep / substitute

    return matrix[size_x - 1, size_y - 1]

In [132]:
# Create Function to Morph Station to Closest Variable in Station List

# Characters stripped from a raw station name before matching
replaceList = ['-','_','.']


def _commonPrefixLen(a, b):
    """Return how many leading characters ``a`` and ``b`` have in common."""
    shared = 0
    for char_a, char_b in zip(a, b):
        if char_a != char_b:
            break
        shared += 1
    return shared


def morphName(string, stationList, minPrefix=3):
    """Map a raw station label onto the best-matching entry of ``stationList``.

    Matching is case-insensitive and is attempted in three stages; the first
    stage that finds a match wins:

    1. If the label contains the word "station", the text before it (spaces
       removed) must equal a station name (spaces removed).
    2. Any single word of the label (of the text before "station" when
       present) equals a station name. Words are tried left to right.
    3. Only when the label contains "station": the station sharing the longest
       run of leading characters with the label, provided at least
       ``minPrefix`` characters are shared. Ties go to the earlier entry of
       ``stationList``.

    Parameters
    ----------
    string : str
        Raw station label. Non-string values (e.g. NaN for a missing station)
        yield ``None``.
    stationList : list of str
        Candidate station names.
    minPrefix : int, default 3
        Minimum number of shared leading characters for stage 3.

    Returns
    -------
    str or None
        The matching element of ``stationList`` (returned unchanged), or
        ``None`` when no stage finds a match.
    """
    if not isinstance(string, str):
        return None

    for r in replaceList:
        string = string.replace(r, '')

    # Keep only the text in front of the word "station", if present. Slicing
    # at ``val`` and stripping handles a missing or repeated separator space
    # and a label that starts with "station".
    val = string.lower().find('station')
    hasStation = val != -1
    name = string[:val].rstrip() if hasStation else string
    compact = name.replace(' ', '').lower()

    #1. Exact match on the text before 'Station', ignoring spaces and case
    if hasStation:
        for station in stationList:
            if compact == station.replace(' ', '').lower():
                return station

    #2. Use Bag of Words method: any single word equal to a station name
    for word in name.split():
        word = word.lower()
        for station in stationList:
            if word == station.lower():
                return station

    #3. Use closest approximate: longest shared prefix of >= minPrefix chars
    if hasStation:
        best = None
        bestLen = minPrefix - 1
        for station in stationList:
            target = station.replace(' ', '').lower()
            shared = _commonPrefixLen(compact, target)
            if shared > bestLen:
                best, bestLen = station, shared
        return best

    return None
                                   

In [133]:
# Generate Mapping Table for Station Names

# Every distinct raw station label found in the delay data ...
data_station = list(delay['Station'].unique())

# ... paired with the closest known station name (None when nothing matches)
real_station = [morphName(raw_name, stationList) for raw_name in data_station]

# Concatenate the two Series side by side into a DataFrame
stationNames = pd.concat(
    [
        pd.Series(data_station, name='data_station'),
        pd.Series(real_station, name='real_station'),
    ],
    axis='columns',
)

In [134]:
# Save Down Station Names
#stationNames.to_csv('data/station_names.csv')

Unnamed: 0,data_station,real_station
0,VICTORIA PARK STATION,victoria park
1,HIGH PARK STATION,high park
2,SHEPPARD STATION,sheppard
3,LANSDOWNE STATION,lansdowne
4,BLOOR STATION,bloor
5,DUFFERIN STATION,dufferin
6,NORTH YORK CTR STATION,north york centre
7,QUEEN STATION,queen
8,RUNNYMEDE STATION,runnymede
9,ST ANDREW STATION,st andrew
