## Date Extracter

In [184]:
import pandas as pd

#### extracting lines

In [185]:
lines = []

In [186]:
with open('dates.txt') as file:
    for line in file:
        line = line.rstrip()                    #rstrip() strips the string's trailing characters, default spaces
        lines.append(line)

In [187]:
lines[:5]

['03/25/93 Total time of visit (in minutes):',
 '6/18/85 Primary Care Doctor:',
 'sshe plans to move as of 7/8/71 In-Home Services: None',
 '7 on 9/27/75 Audit C Score Current:',
 '2/6/96 sleep studyPain Treatment Pain Level (Numeric Scale): 7']

### making series

In [188]:
ser = pd.Series(lines, name='Text')

In [189]:
ser.head()

0           03/25/93 Total time of visit (in minutes):
1                         6/18/85 Primary Care Doctor:
2    sshe plans to move as of 7/8/71 In-Home Servic...
3                  7 on 9/27/75 Audit C Score Current:
4    2/6/96 sleep studyPain Treatment Pain Level (N...
Name: Text, dtype: object

### converting series to dataframe

In [190]:
frame = pd.DataFrame(ser)

In [191]:
frame.head()

Unnamed: 0,Text
0,03/25/93 Total time of visit (in minutes):
1,6/18/85 Primary Care Doctor:
2,sshe plans to move as of 7/8/71 In-Home Servic...
3,7 on 9/27/75 Audit C Score Current:
4,2/6/96 sleep studyPain Treatment Pain Level (N...


### extracting first few dates

In [192]:
import re

In [193]:
frame['Dates1'] = frame['Text']

In [194]:
frame['Dates1'].head()

0           03/25/93 Total time of visit (in minutes):
1                         6/18/85 Primary Care Doctor:
2    sshe plans to move as of 7/8/71 In-Home Servic...
3                  7 on 9/27/75 Audit C Score Current:
4    2/6/96 sleep studyPain Treatment Pain Level (N...
Name: Dates1, dtype: object

#### first few dates

In [195]:
def extractor1(x):
    dt = re.search("\d{1,2}[/-]\d{1,2}[/-]\d{2,4}", x)
    if(dt != None):
        return dt.group(0)
    else:
        return None

In [196]:
frame['Dates1']=frame['Dates1'].apply(extractor1)

In [197]:
frame.head()

Unnamed: 0,Text,Dates1
0,03/25/93 Total time of visit (in minutes):,03/25/93
1,6/18/85 Primary Care Doctor:,6/18/85
2,sshe plans to move as of 7/8/71 In-Home Servic...,7/8/71
3,7 on 9/27/75 Audit C Score Current:,9/27/75
4,2/6/96 sleep studyPain Treatment Pain Level (N...,2/6/96


In [198]:
frame['Dates1'].count()

125

In [199]:
frame[125:200].head()

Unnamed: 0,Text,Dates1
125,s The patient is a 44 year old married Caucasi...,
126,.10 Sep 2004 - Intake at EEC for IOP but did n...,
127,see above and APS eval of 26 May 1982 Social H...,
128,Tbooked for intake appointment at Sierra Vista...,
129,06 May 1972 SOS-10 Total Score:,


#### extracting more dates after 124th row

In [200]:
def extractor2(x):
    dt2 = re.search("\d{2} (?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{4}", x)
    if(dt2 != None):
        return dt2.group(0)
    else:
        return None

In [201]:
frame['Dates2'] = frame['Text']

In [202]:
frame['Dates2']=frame['Dates2'].apply(extractor2)

In [203]:
#frame[125:200]
frame['Dates2'].count()

69

In [204]:
frame[193:210]

Unnamed: 0,Text,Dates1,Dates2
193,"s 22 year old single Caucasian/Latino woman, u...",,18 Jan 1995
194,"April 11, 1990 CPT Code: 90791: No medical ser...",,
195,"MRI May 30, 2001 empty sella but no problems w...",,
196,".Feb 18, 1994: made a phone call to Mom and Mo...",,
197,"Brother died February 18, 1981 Parental/Caregi...",,
198,none; but currently has appt with new HJH PCP ...,,
199,".Came back to US on Jan 24 1986, saw Dr. Quack...",,
200,"July 26, 1978 Total time of visit (in minutes):",,
201,father was depressed inpatient at DFC December...,,
202,"May 15, 1989 SOS-10 Total Score:",,


#### extracting after the 193rd row

In [205]:
def extractor3(x):
    dt3 = re.search("(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{2},* \d{4}", x)
    if(dt3 != None):
        return dt3.group(0)
    else:
        return None

In [206]:
frame['Dates3'] = frame['Text']

In [207]:
frame['Dates3']=frame['Dates3'].apply(extractor3)

In [208]:
frame['Dates3'].count()

30

In [209]:
frame['Text'][229]

't Allergies Sulfa (Sulfonamide Antibiotics) - Renal Toxicity : pt developed acute interstitial nephritis on Bactrim (June 2011)'

In [210]:
frame[227:250].head()

Unnamed: 0,Text,Dates1,Dates2,Dates3
227,"September. 15, 2011 Total time of visit (in mi...",,,
228,"s 20 yo M carries dx of BPAD, presents for psy...",,,
229,t Allergies Sulfa (Sulfonamide Antibiotics) - ...,,,
230,B/R Walnut Ridge. Raised with sister and paren...,,,
231,50 yo DWF with a history of alcohol use disord...,,,


#### after 226th row

In [211]:
def extractor4(x):
    dt4 = re.search("(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{4}", x)
    if(dt4 != None):
        return dt4.group(0)
    else:
        return None

In [212]:
frame['Dates4']=frame['Text']

In [213]:
frame['Dates4'] = frame['Dates4'].apply(extractor4)

In [214]:
frame['Dates4'].count()

171

In [215]:
frame[328:350]

Unnamed: 0,Text,Dates1,Dates2,Dates3,Dates4
328,s Pt reports long Hx of drug addiction. PLEASE...,,,,
329,"CLLC and DCF - ended in March, 2000. Psychiatr...",,,,
330,"""-worked with Uma Dewitt (ED specialist) x 10 ...",,,,April 1988
331,Prozac- highest dose 40 mg; currently on 20 mg...,,,,December 1993
332,)last use in June 1974.Longitudinal Alcohol us...,,,,June 1974
333,sNovember 1997 - suicidal ideation - HHR,,,,November 1997
334,"sAppendectomy in July 1986, shortly after deli...",,,,July 1986
335,"Open appendectomy, as a child. 2. Laparoscopi...",,,,February 1973
336,"sOcella 0.03-3MG TABLET Take 1 PO QD, next pap...",,,,March 1978
337,sWas on lexapro and strattera with good result...,,,,Dec 2007


#### few more dates

In [216]:
def extractor5(x):
    dt5 = re.search("\d{1,2}[/-]\d{4}", x)
    if(dt5 != None):
        return dt5.group(0)
    else:
        return None

In [217]:
frame['Dates5'] = frame['Text']

In [218]:
frame['Dates5'] = frame['Dates5'].apply(extractor5)

In [219]:
#frame['Text'][456]

#### extractor 6

In [220]:
def extractor6(x):
    dt6 = re.search("\d{4}", x)
    if(dt6 != None):
        return dt6.group(0)
    else:
        return None

In [221]:
frame['Dates6'] = frame['Text']

In [222]:
frame['Dates6'] = frame['Dates6'].apply(extractor6)

In [223]:
frame['Dates6'][455:460]

455    1984
456    2000
457    2001
458    1982
459    1998
Name: Dates6, dtype: object

In [226]:
frame.shape

(500, 7)

In [227]:
frame[460:500]

Unnamed: 0,Text,Dates1,Dates2,Dates3,Dates4,Dates5,Dates6
460,1 Ex-smoker : quit 2012,,,,,,2012
461,". Age 16, 1991, frontal impact. out for two we...",,,,,,1991
462,sLexapro (1988-now): Good response (anxiety),,,,,,1988
463,s 25 year old engaged to be married Optic...,,,,,,2014
464,8Complications from brain hemmorage in 2016 Ax...,,,,,,2016
465,".Age, 19, 1976, playing football, frontal impa...",,,,,,1976
466,s1981 Swedish-American Hospital,,,,,,1981
467,aS/P suicide attempt 2011 Hx of Outpatient Tre...,,,,,,2011
468,Patient has a history of suicidal ideation wit...,,,,,,1997
469,"Born and raised in Fowlerville, IN. Parents d...",,,,,,2003


In [230]:
frame.columns

Index(['Text', 'Dates1', 'Dates2', 'Dates3', 'Dates4', 'Dates5', 'Dates6'], dtype='object')

In [232]:
#frame['Date'] = frame[['Dates1', 'Dates2', 'Dates3', 'Dates4', 'Dates5', 'Dates6']].apply(lambda x: ''.join(x), axis=1)

## Concatenating all columns

In [233]:
new_frame = frame.copy()

In [234]:
new_frame.head()

Unnamed: 0,Text,Dates1,Dates2,Dates3,Dates4,Dates5,Dates6
0,03/25/93 Total time of visit (in minutes):,03/25/93,,,,,
1,6/18/85 Primary Care Doctor:,6/18/85,,,,,
2,sshe plans to move as of 7/8/71 In-Home Servic...,7/8/71,,,,,
3,7 on 9/27/75 Audit C Score Current:,9/27/75,,,,,
4,2/6/96 sleep studyPain Treatment Pain Level (N...,2/6/96,,,,,


In [235]:
sep_date = pd.concat([new_frame['Dates1'], new_frame['Dates2'], new_frame['Dates3'], new_frame['Dates4'], new_frame['Dates5'], new_frame['Dates6']], axis=1, join='outer')

In [237]:
sep_date.count()

Dates1    125
Dates2     69
Dates3     30
Dates4    171
Dates5    138
Dates6    428
dtype: int64

In [238]:
new_frame['Date'] = ('' if new_frame['Dates1'].empty else new_frame['Dates1'].map(str)) + ('' if new_frame['Dates2'].empty else new_frame['Dates2'].map(str))+ ('' if new_frame['Dates3'].empty else new_frame['Dates3'].map(str))+ ('' if new_frame['Dates4'].empty else new_frame['Dates4'].map(str))+ ('' if new_frame['Dates5'].empty else new_frame['Dates5'].map(str))+ ('' if new_frame['Dates6'].empty else new_frame['Dates6'].map(str))

In [240]:
new_frame['Date'].tail()

495    NoneNoneNoneNoneNone1979
496    NoneNoneNoneNoneNone2006
497    NoneNoneNoneNoneNone2008
498    NoneNoneNoneNoneNone2005
499    NoneNoneNoneNoneNone1980
Name: Date, dtype: object