# Python Regular Expressions

Data sets often need cleaning, which frequently means searching through and editing text strings.  Python provides built-in string methods such as _index()_, _replace()_, _lower()_, and _split()_ (among many others) to give the user as much versatility as possible.

However, an analyst often needs to parse strings that follow particular patterns - this is where regular expressions come in.  Python's _re_ library provides a simple way to use regular expressions within Python code to extract exactly what you are looking for.  Furthermore, the result is often much easier to interpret than code that relies only on Python's built-in string methods.

In [1]:
import re
import pandas as pd
import random

In [3]:
# Load the Citi Bike trip sample from the working directory and preview the first rows.
data=pd.read_csv("citi_bike_subset1.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,0,2110,2017/8/1 0:00,2017/8/1 0:35,470,W 20 Street & 8 Avenue,40.743453,-74.00004,3289,W 90 St & Amsterdam Ave,40.790179,-73.972889,20954,Subscriber,1978.0,2
1,1,160,2017/8/1 0:00,2017/8/1 0:02,348,W Broadway & Spring St,40.72491,-74.001547,151,Cleveland Pl & Spring St,40.722104,-73.997249,15164,Subscriber,1978.0,1
2,2,1644,2017/8/1 0:00,2017/8/1 0:27,3165,Central Park West & W 72 St,40.775794,-73.976206,3320,Central Park West & W 100 St,40.793393,-73.963556,17540,Subscriber,1962.0,2
3,3,323,2017/8/1 0:00,2017/8/1 0:05,389,Broadway & Berry Street,40.710446,-73.965251,3073,Division Ave & Hooper St,40.706913,-73.954417,18705,Subscriber,1990.0,1
4,4,109,2017/8/1 0:00,2017/8/1 0:02,3145,E 84 Street & Park Avenue,40.778627,-73.957721,3147,E 85 St & 3 Ave,40.778012,-73.954071,27975,Subscriber,1983.0,1


In [3]:
# Summarise every column; include='all' adds the non-numeric columns to the report.
data.describe(include='all')

Unnamed: 0.1,Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
count,136.0,136.0,136,136,136.0,136,136.0,136.0,136.0,136,136.0,136.0,136.0,136,122.0,136.0
unique,,,14,44,,114,,,,102,,,,2,,
top,,,2017/8/1 0:09,2017/8/1 0:13,,6 Ave & Spring St,,,,Bialystoker Pl & Delancey St,,,,Subscriber,,
freq,,,16,14,,3,,,,3,,,,119,,
mean,67.5,981.382353,,,1433.838235,,40.735457,-73.982126,1253.411765,,40.734729,-73.981469,23280.845588,,1982.983607,1.080882
std,39.403892,1304.465033,,,1357.612088,,0.025848,0.017013,1323.087469,,0.026859,0.016384,5055.510034,,9.663755,0.531318
min,0.0,75.0,,,72.0,,40.657089,-74.015756,72.0,,40.671198,-74.008119,14592.0,,1952.0,0.0
25%,33.75,335.5,,,382.75,,40.720547,-73.992257,345.0,,40.718509,-73.992189,18670.75,,1977.25,1.0
50%,67.5,634.5,,,479.0,,40.732233,-73.987858,482.0,,40.735943,-73.98518,25084.5,,1985.0,1.0
75%,101.25,1249.0,,,3120.75,,40.757127,-73.970501,3082.25,,40.752555,-73.971688,27802.0,,1990.0,1.0


In [4]:
# Naive clean-up with plain str.replace: it rewrites every occurrence of the
# substring, so names that already spell out "Street"/"Avenue" get mangled
# (e.g. "Street" -> "Streetreet"). Kept as-is to motivate the regex version.
name_col = 'start station name'
for row in range(len(data)):
    original = data.loc[row, name_col]
    data.loc[row, name_col] = original.replace('St', 'Street').replace('Ave', 'Avenue')

In [5]:
# Inspect the station names after the plain str.replace pass.
data['start station name']

0                          W 20 Streetreet & 8 Avenuenue
1                             W Broadway & Spring Street
2                        Central Park West & W 72 Street
3                            Broadway & Berry Streetreet
4                       E 84 Streetreet & Park Avenuenue
5                             3 Streetreet & 3 Avenuenue
6                                 Hanson Pl & Ashland Pl
7                                W 47 Street & 10 Avenue
8                                 W 54 Street & 9 Avenue
9                                Vernon Blvd & 50 Avenue
10                              6 Avenue & Spring Street
11                                9 Avenue & W 45 Street
12                                Garfield Pl & 8 Avenue
13                            W 67 Streetreet & Broadway
14                         W 31 Streetreet & 7 Avenuenue
15                      E 85 Streetreet & York Avenuenue
16                                9 Avenue & W 45 Street
17                        6 Ave

In [6]:
# Reload the raw file so the regex version starts from the untouched names.
# Uses the same relative path as the first load, so the notebook is not tied
# to one user's Downloads folder.
data=pd.read_csv("citi_bike_subset1.csv")

In [7]:
# The \b word-boundary anchors restrict each substitution to the stand-alone
# abbreviation, so names already containing "Street"/"Avenue" are left alone.
name_col = 'start station name'
for row in range(len(data)):
    with_street = re.sub(r'\bSt\b', 'Street', data.loc[row, name_col])
    data.loc[row, name_col] = re.sub(r'\bAve\b', 'Avenue', with_street)

In [8]:
# Inspect the station names after the word-boundary regex pass.
data['start station name']

0                                 W 20 Street & 8 Avenue
1                             W Broadway & Spring Street
2                        Central Park West & W 72 Street
3                                Broadway & Berry Street
4                              E 84 Street & Park Avenue
5                                    3 Street & 3 Avenue
6                                 Hanson Pl & Ashland Pl
7                                W 47 Street & 10 Avenue
8                                 W 54 Street & 9 Avenue
9                                Vernon Blvd & 50 Avenue
10                              6 Avenue & Spring Street
11                                9 Avenue & W 45 Street
12                                Garfield Pl & 8 Avenue
13                                W 67 Street & Broadway
14                                W 31 Street & 7 Avenue
15                             E 85 Street & York Avenue
16                                9 Avenue & W 45 Street
17                             

In [9]:
# Find start stations on W 30-49 and print each match list with its row label.
# Inside a character class '|' is a literal character, not alternation, so the
# tens digit is written [34]; [3|4] would also accept names like "W |5".
pattern=r'\bW\s[34]\d\b'
# Compile once instead of on every iteration.
street_re=re.compile(pattern)
for i in range(len(data)):
    dd=street_re.findall(data.loc[i,'start station name'])
    if dd:
        print(dd,i)

['W 47'] 7
['W 45'] 11
['W 31'] 14
['W 45'] 16
['W 31'] 26
['W 37'] 32
['W 36'] 43
['W 32'] 58
['W 41'] 62
['W 41'] 63
['W 49'] 70
['W 49'] 78
['W 32'] 90
['W 41'] 91
['W 45'] 93
['W 41'] 125
['W 31'] 132


Find the start stations on W 30th-49th Streets, along with their row indices in the data.

In [10]:
# Show the frame's row index, i.e. the labels that .loc looks rows up by.
data.index

RangeIndex(start=0, stop=136, step=1)

In [12]:
# Same search without an explicit loop: each name maps to its list of matches,
# or to None when there are none (findall returns an empty, falsy list).
# The regex is compiled once and findall runs once per row.
street_re=re.compile(pattern,re.S)
a=data['start station name'].apply(lambda name: street_re.findall(name) or None)
a

0        None
1        None
2        None
3        None
4        None
5        None
6        None
7      [W 47]
8        None
9        None
10       None
11     [W 45]
12       None
13       None
14     [W 31]
15       None
16     [W 45]
17       None
18       None
19       None
20       None
21       None
22       None
23       None
24       None
25       None
26     [W 31]
27       None
28       None
29       None
        ...  
106      None
107      None
108      None
109      None
110      None
111      None
112      None
113      None
114      None
115      None
116      None
117      None
118      None
119      None
120      None
121      None
122      None
123      None
124      None
125    [W 41]
126      None
127      None
128      None
129      None
130      None
131      None
132    [W 31]
133      None
134      None
135      None
Name: start station name, Length: 136, dtype: object

In [13]:
# Wrap the Series of matches in a DataFrame; its single column keeps the
# Series name, 'start station name'.
a=pd.DataFrame(a)

In [14]:
# Keep only the rows where the pattern matched (drops the None entries).
a['start station name'].dropna()

7      [W 47]
11     [W 45]
14     [W 31]
16     [W 45]
26     [W 31]
32     [W 37]
43     [W 36]
58     [W 32]
62     [W 41]
63     [W 41]
70     [W 49]
78     [W 49]
90     [W 32]
91     [W 41]
93     [W 45]
125    [W 41]
132    [W 31]
Name: start station name, dtype: object

In [15]:
# Expand the stand-alone abbreviation "St" to "Street" with a vectorised regex
# replace on the one column, instead of a row-wise apply over the whole frame.
# \b keeps "Street" itself from matching; missing names pass through as NaN.
data['start station name']=data['start station name'].str.replace(r'\bSt\b','Street',regex=True)

In [16]:
# Expand the stand-alone abbreviation "Ave" to "Avenue" with a vectorised regex
# replace on the one column, instead of a row-wise apply over the whole frame.
# \b keeps "Avenue" itself from matching; missing names pass through as NaN.
data['start station name']=data['start station name'].str.replace(r'\bAve\b','Avenue',regex=True)

In [17]:
# Display the frame to check the expanded names in 'start station name'.
data

Unnamed: 0.1,Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,0,2110,2017/8/1 0:00,2017/8/1 0:35,470,W 20 Street & 8 Avenue,40.743453,-74.000040,3289,W 90 St & Amsterdam Ave,40.790179,-73.972889,20954,Subscriber,1978.0,2
1,1,160,2017/8/1 0:00,2017/8/1 0:02,348,W Broadway & Spring Street,40.724910,-74.001547,151,Cleveland Pl & Spring St,40.722104,-73.997249,15164,Subscriber,1978.0,1
2,2,1644,2017/8/1 0:00,2017/8/1 0:27,3165,Central Park West & W 72 Street,40.775794,-73.976206,3320,Central Park West & W 100 St,40.793393,-73.963556,17540,Subscriber,1962.0,2
3,3,323,2017/8/1 0:00,2017/8/1 0:05,389,Broadway & Berry Street,40.710446,-73.965251,3073,Division Ave & Hooper St,40.706913,-73.954417,18705,Subscriber,1990.0,1
4,4,109,2017/8/1 0:00,2017/8/1 0:02,3145,E 84 Street & Park Avenue,40.778627,-73.957721,3147,E 85 St & 3 Ave,40.778012,-73.954071,27975,Subscriber,1983.0,1
5,5,525,2017/8/1 0:00,2017/8/1 0:09,3373,3 Street & 3 Avenue,40.675070,-73.987752,3339,Berkeley Pl & 6 Ave,40.676530,-73.978469,14816,Subscriber,1983.0,2
6,6,317,2017/8/1 0:01,2017/8/1 0:06,3429,Hanson Pl & Ashland Pl,40.685068,-73.977908,3421,Hoyt St & Warren St,40.684355,-73.989016,27797,Subscriber,1967.0,1
7,7,201,2017/8/1 0:01,2017/8/1 0:04,495,W 47 Street & 10 Avenue,40.762699,-73.993012,449,W 52 St & 9 Ave,40.764618,-73.987895,28611,Subscriber,1986.0,1
8,8,176,2017/8/1 0:01,2017/8/1 0:04,423,W 54 Street & 9 Avenue,40.765849,-73.986905,449,W 52 St & 9 Ave,40.764618,-73.987895,19956,Subscriber,1972.0,1
9,9,406,2017/8/1 0:01,2017/8/1 0:07,3119,Vernon Blvd & 50 Avenue,40.742327,-73.954117,3117,Franklin St & Dupont St,40.735640,-73.958660,26437,Subscriber,1989.0,1
